In [112]:
import time
import pandas as pd
from sklearn import set_config
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


set_config(display='diagram')

In [42]:
census = pd.read_csv('CensusDataset.csv')

In [43]:
target = census['class']
data = census.drop(columns='class')
data.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States


In [44]:
# Numerical and Categorical columns
cat = census.dtypes == object
num = census.dtypes != object
cat_col = census.loc[:,cat].columns
num_col = census.loc[:,num].columns
print(f'Numerical columns:\n {num_col.values}\n')
print(f'Categorical columns:\n {cat_col.values}')

Numerical columns:
 ['age' 'fnlwgt' 'education-num' 'capital-gain' 'capital-loss'
 'hours-per-week']

Categorical columns:
 ['workclass' 'education' 'marital-status' 'occupation' 'relationship'
 'race' 'sex' 'native-country' 'class']


In [92]:
data_num = data[num_col]
data_num.drop(columns=['fnlwgt','education-num'], inplace=True)
data_num.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,25,0,0,40
1,38,0,0,50
2,28,0,0,40
3,44,7688,0,40
4,18,0,0,30


In [93]:
X_train, X_test, y_train, y_test = train_test_split(data_num, target, random_state=42)

In [94]:
X_train.describe()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
count,36631.0,36631.0,36631.0,36631.0
mean,38.642352,1087.077721,89.665311,40.431247
std,13.725748,7522.692939,407.110175,12.423952
min,17.0,0.0,0.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,0.0,0.0,40.0
75%,48.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


In [95]:
# Creating and fitting scaler
scaler = StandardScaler()
scaler.fit(X_train)

In [96]:
print(f'Calculating columns means:\n {scaler.mean_}')
print(f'Calculating columns standard deviations:\n {scaler.scale_}')

Calculating columns means:
 [  38.64235211 1087.07772106   89.6653108    40.43124676]
Calculating columns standard deviations:
 [  13.72556083 7522.59025606  407.10461772   12.42378265]


In [97]:
# Transforming data
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[ 0.17177061, -0.14450843,  5.71188483, -2.28845333],
       [ 0.02605707, -0.14450843, -0.22025127, -0.27618374],
       [-0.33822677, -0.14450843, -0.22025127,  0.77019645],
       ...,
       [-0.77536738, -0.14450843, -0.22025127, -0.03471139],
       [ 0.53605445, -0.14450843, -0.22025127, -0.03471139],
       [ 1.48319243, -0.14450843, -0.22025127, -2.69090725]])

In [98]:
# Or fit and transform could be applied with function scaler.fit_transform
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

array([[ 0.17177061, -0.14450843,  5.71188483, -2.28845333],
       [ 0.02605707, -0.14450843, -0.22025127, -0.27618374],
       [-0.33822677, -0.14450843, -0.22025127,  0.77019645],
       ...,
       [-0.77536738, -0.14450843, -0.22025127, -0.03471139],
       [ 0.53605445, -0.14450843, -0.22025127, -0.03471139],
       [ 1.48319243, -0.14450843, -0.22025127, -2.69090725]])

In [99]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_train_scaled.describe()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
count,36631.0,36631.0,36631.0,36631.0
mean,-2.273364e-16,3.5303100000000004e-17,3.8406670000000006e-17,1.844684e-16
std,1.000014,1.000014,1.000014,1.000014
min,-1.576792,-0.1445084,-0.2202513,-3.173852
25%,-0.7753674,-0.1445084,-0.2202513,-0.03471139
50%,-0.1196565,-0.1445084,-0.2202513,-0.03471139
75%,0.681768,-0.1445084,-0.2202513,0.3677425
max,3.741752,13.14865,10.4797,4.714245


In [106]:
model = make_pipeline(StandardScaler(), LogisticRegression())
model

In [107]:
model.named_steps

{'standardscaler': StandardScaler(),
 'logisticregression': LogisticRegression()}

In [108]:
start_time = time.time()
model.fit(X_train, y_train)
total_time = time.time() - start_time

In [109]:
pred = model.predict(X_test)
pred[:5]

array([' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)

In [110]:
model_name = model.__class__.__name__
score = model.score(X_test, y_test)
print(f'Accuracy of {model_name} is {score:.3f},'
      f' fitting time {total_time} in {model[-1].n_iter_[0]} iterations')

Accuracy of Pipeline is 0.807, fitting time 0.08282756805419922 in 12 iterations


In [111]:
# Previous model without scaling
model = LogisticRegression()
start_time = time.time()
model.fit(X_train, y_train)
total_time = time.time() - start_time

model_name = model.__class__.__name__
score = model.score(X_test, y_test)

print(f'Accuracy of {model_name} is {score:.3f},'
      f' fitting time {total_time} in {model.n_iter_[0]} iterations')

Accuracy of LogisticRegression is 0.807, fitting time 0.1696021556854248 in 59 iterations


In [122]:
%%time
# Cross-validation
model = make_pipeline(StandardScaler(), LogisticRegression())
crossval = cross_validate(model, data_num, target, cv=5)
crossval

Wall time: 438 ms


{'fit_time': array([0.07814169, 0.06532836, 0.05415297, 0.06901288, 0.05849338]),
 'score_time': array([0.01252508, 0.01199913, 0.01562595, 0.01490831, 0.01001191]),
 'test_score': array([0.79557785, 0.80049135, 0.79965192, 0.79873055, 0.80436118])}

In [123]:
scores = crossval['test_score']
print('The mean cross-validation accuracy is: ' + 
      f'{scores.mean():.3f} +/- {scores.std():.3f}')

The mean cross-validation accuracy is: 0.800 +/- 0.003
