# Review of Supervised Learning with scikit-learn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
sklearn.set_config(print_changed_only=True)

In [2]:
# read data.
# you can find a description in data/bank-campaign-desc.txt
data = pd.read_csv("data/bank-campaign.csv")

In [3]:
data.shape

(41188, 64)

In [4]:
data.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oc

In [5]:
data.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,target
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no


In [6]:
y = data.target

In [7]:
X = data.drop("target", axis=1)

In [8]:
X.shape

(41188, 63)

In [9]:
y.shape

(41188,)

In [10]:
y.head()

0    no
1    no
2    no
3    no
4    no
Name: target, dtype: object

In [11]:
data.target.value_counts()

no     36548
yes     4640
Name: target, dtype: int64

In [12]:
data.target.value_counts() / data.target.size

no     0.887346
yes    0.112654
Name: target, dtype: float64

Splitting the data:

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42, stratify=y)

In [14]:
np.sum(y_train == "yes") / len(y_train)

0.11265553869499241

In [15]:
np.sum(y_test == "yes") / len(y_test)

0.11264870114105366

In [16]:
# import model
from sklearn.linear_model import LogisticRegression
# instantiate model, set parameters
lr = LogisticRegression(C=0.1, max_iter=100)
# fit model
lr.fit(X_train, y_train)
lr.coef_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([[-5.40292376e-04,  4.26901016e-03, -3.11315092e-02,
        -1.87782119e-03, -1.45652443e-02, -2.35287684e-01,
         2.58991638e-01,  5.94304356e-02, -2.56720414e-01,
        -4.47448102e-03,  2.88511186e-02, -5.46664218e-02,
        -5.45778387e-03, -1.79975858e-03,  2.89158973e-03,
         2.15824257e-02, -1.14810689e-03, -1.38316405e-02,
         1.54828303e-02,  7.56848325e-03,  2.86408926e-03,
         5.25011858e-04, -3.43496876e-03, -3.37612097e-02,
         3.99329259e-02,  1.25089448e-04, -1.06654573e-02,
        -7.46782067e-03, -2.84027859e-02, -1.04212466e-02,
         2.94610051e-04,  3.43509780e-03,  4.94009919e-02,
         6.68844769e-03,  5.14153804e-02, -4.85344306e-02,
        -1.91128394e-05, -9.91283246e-04,  7.75564256e-04,
         3.07755595e-03,  4.55293477e-03,  7.75564256e-04,
        -2.46666207e-03,  6.61981561e-02, -6.33363191e-02,
         1.42802333e-02,  1.78449925e-02,  3.24582682e-03,
         2.32065678e-02,  2.07523923e-02,  3.60371198e-0

Make predictions:

In [17]:
lr.score(X_train, y_train)

0.9076176024279211

In [18]:
(y_train == "no").mean()

0.8873444613050075

In [19]:
lr.score(X_test, y_test)

0.9098082058752124

# https://github.com/amueller/ml-workshop-2-of-4

# Exercise
Load the dataset ``data/bike_day_raw.csv``, which has the regression target ``cnt``.
This dataset is hourly bike rentals in the citybike platform. The ``cnt`` column is the number of rentals, which we want to predict from date and weather data.

Split the data into a training and a test set using ``train_test_split``.
Use the ``LinearRegression`` class to learn a regression model on this data. You can evaluate with the ``score`` method, which provides the $R^2$ or using the ``mean_squared_error`` function from ``sklearn.metrics`` (or write it yourself in numpy).

In [20]:
# %load solutions/bike_regression.py

In [25]:
data = pd.read_csv("data/bike_day_raw.csv")
X = data.drop("cnt", axis=1)
y = data.cnt



In [27]:

display(data.head())


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

from sklearn.linear_model import LinearRegression

# for other models you should scale here

lr = LinearRegression().fit(X_train, y_train)

print(lr.score(X_train, y_train))

print(lr.score(X_test, y_test))


from sklearn.metrics import mean_squared_error
y_pred = lr.predict(X_test)
print(mean_squared_error(y_test, y_pred))




Unnamed: 0,season,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,1,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,1,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,1,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,1562
4,1,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,1600


0.53289255294987
0.4991033756876272
2220717.208700471


In [31]:

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
ohe = make_column_transformer(
    (OneHotEncoder(sparse=False), X_train.columns[:6]),
    remainder='passthrough')

X_train_ohe = ohe.fit_transform(X_train)
X_test_ohe = ohe.transform(X_test)

X_train.shape

X_train_ohe.shape

(548, 34)

In [32]:

lr = LinearRegression().fit(X_train_ohe, y_train)

print(lr.score(X_train_ohe, y_train))

print(lr.score(X_test_ohe, y_test))

from sklearn.metrics import mean_squared_error
y_pred = lr.predict(X_test_ohe)




0.5995155452751685
0.5250552002620287
