## Logistic regression with feature scaling

In [8]:
import os
import pandas  as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib   import Path

from sklearn.preprocessing   import StandardScaler
from sklearn.linear_model    import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics         import accuracy_score

In [2]:
# Set the project root directory.
# The location can be overridden with the DATA_SCIENCE_CORE_ROOT environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original hard-coded location is used.
path_root = Path(os.environ.get("DATA_SCIENCE_CORE_ROOT", "C:/Users/giann/data-science-core"))
# Make the project root the current working directory (raises if it does not exist).
os.chdir(path_root)
print(f'- Root directory = {os.getcwd()}')

- Root directory = C:\Users\giann\data-science-core


In [3]:
# Global plotting setting: apply matplotlib's 'ggplot' style sheet
plt.style.use('ggplot')

### Fit with feature scaling: option 1
Normalize the whole dataset **before** splitting the data. Note that the scaler is then fitted on all rows, including those that end up in the test set.

In [50]:
# Read the CSV stored under <root>/dataset into a DataFrame
path_dataset = path_root.joinpath('dataset', 'PimaIndians.csv')
data = pd.read_csv(filepath_or_buffer=path_dataset)
# Preview the first five rows as the cell output
data.head(n=5)

Unnamed: 0,pregnant,glucose,diastolic,triceps,insulin,bmi,family,age,test
0,1,89,66,23,94,28.1,0.167,21,negative
1,0,137,40,35,168,43.1,2.288,33,positive
2,3,78,50,32,88,31.0,0.248,26,positive
3,2,197,70,45,543,30.5,0.158,53,positive
4,1,189,60,23,846,30.1,0.398,59,positive


In [51]:
# Target: the 'test' column (labels such as 'negative' / 'positive')
y = data['test']
# Features: every column except the target
X = data.drop(columns=['test'])

In [52]:
# Scale first, then split (option 1).
# NOTE: the scaler is fitted on the full dataset here, so rows that later land
# in the test set also contribute to the statistics used for scaling.
scaler = StandardScaler()
# Cast to float64 up front: integer-valued feature columns otherwise make
# StandardScaler emit a DataConversionWarning on some scikit-learn versions.
# The scaled values are unchanged by the cast.
X_std = scaler.fit_transform(X.astype('float64'))
# Stratified 70/30 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, test_size=0.3, stratify=y, random_state=42
)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [53]:
# Train a logistic regression (L-BFGS solver) on the training split;
# fit() returns the estimator itself, so it can be bound in one step.
lr = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
# Display the fitted estimator as the cell output
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [54]:
# Test-set accuracy, formatted as a percentage with two decimals
print(f"o {accuracy_score(y_test, lr.predict(X_test)):.2%} accuracy on test set.")
# Absolute value of each feature's coefficient, rounded to two decimals
print({feature: weight for feature, weight in zip(X.columns, abs(lr.coef_[0]).round(2))})

o 78.81% accuracy on test set.
{'pregnant': 0.23, 'glucose': 1.28, 'diastolic': 0.12, 'triceps': 0.15, 'insulin': 0.13, 'bmi': 0.47, 'family': 0.3, 'age': 0.31}


### Fit with feature scaling: option 2
Normalize the dataset **after** splitting the data.

In [43]:
# Re-read the CSV from <root>/dataset so this section starts from fresh data
path_dataset = path_root.joinpath('dataset', 'PimaIndians.csv')
data = pd.read_csv(filepath_or_buffer=path_dataset)
# Preview the first five rows as the cell output
data.head(n=5)

Unnamed: 0,pregnant,glucose,diastolic,triceps,insulin,bmi,family,age,test
0,1,89,66,23,94,28.1,0.167,21,negative
1,0,137,40,35,168,43.1,2.288,33,positive
2,3,78,50,32,88,31.0,0.248,26,positive
3,2,197,70,45,543,30.5,0.158,53,positive
4,1,189,60,23,846,30.1,0.398,59,positive


In [44]:
# Separate the target column from the feature columns
y = data['test']
X = data.drop(columns=['test'])
# Stratified 70/30 split on the *unscaled* features, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [45]:
scaler = StandardScaler()
# Fit the scaler on the training features only and transform them in one go.
# Cast to float64 up front: integer-valued feature columns otherwise make
# StandardScaler emit a DataConversionWarning on some scikit-learn versions.
# The scaled values are unchanged by the cast.
X_train_std = scaler.fit_transform(X_train.astype('float64'))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [46]:
# Train the logistic regression (L-BFGS solver) on the scaled training data;
# fit() returns the estimator itself, so it can be bound in one step.
lr = LogisticRegression(solver='lbfgs').fit(X_train_std, y_train)
# Display the fitted estimator as the cell output
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [47]:
# Scale the test features with the already-fitted scaler (transform only, no
# refit). The float64 cast avoids the dtype-conversion warning that
# integer-valued columns can trigger on some scikit-learn versions.
X_test_std = scaler.transform(X_test.astype('float64'))
# Predict diabetes presence on the scaled test set
y_pred = lr.predict(X_test_std)

  


In [48]:
# Report the test-set accuracy as a percentage with two decimals
print(f"o {accuracy_score(y_test, y_pred):.2%} accuracy on test set.")
print('o Parameter coefficient')
# Absolute value of each feature's coefficient, rounded to two decimals
print({feature: weight for feature, weight in zip(X.columns, abs(lr.coef_[0]).round(2))})

o 78.81% accuracy on test set.
o Parameter coefficient
{'pregnant': 0.23, 'glucose': 1.26, 'diastolic': 0.12, 'triceps': 0.16, 'insulin': 0.13, 'bmi': 0.47, 'family': 0.29, 'age': 0.32}
