# Import libraries

In [1]:
# General
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Supress warnings
import warnings
warnings.filterwarnings("ignore")

# Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier

# Regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV,ElasticNet,LogisticRegression
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# Modelling Helpers:
from sklearn.preprocessing import Imputer, Normalizer, scale
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, ShuffleSplit, cross_validate

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer, LabelEncoder

# Evaluation metrics for Regression 
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error
# Evaluation metrics for Classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

print("Setup complete...")

Setup complete...


# Load data

In [2]:
data = pd.read_csv('Toddler Autism dataset July 2018.csv')
data.shape

(1054, 19)

# Data Preparation

In [3]:
data.columns

Index(['Case_No', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10',
       'Age_Mons', 'Qchat-10-Score', 'Sex', 'Ethnicity', 'Jaundice',
       'Family_mem_with_ASD', 'Who completed the test', 'Class/ASD Traits '],
      dtype='object')

### Remove unnesscary columns

In [4]:
data.drop(['Case_No','Who completed the test','Qchat-10-Score'],axis=1, inplace=True)

In [5]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Class/ASD Traits
0,0,0,0,0,0,0,1,1,0,1,28,f,middle eastern,yes,no,No
1,1,1,0,0,0,1,1,0,0,0,36,m,White European,yes,no,Yes
2,1,0,0,0,0,0,1,1,0,1,36,m,middle eastern,yes,no,Yes
3,1,1,1,1,1,1,1,1,1,1,24,m,Hispanic,no,no,Yes
4,1,1,0,1,1,1,1,1,1,1,20,f,White European,no,yes,Yes


### One hot coding

In [6]:
le = LabelEncoder()
columns = ['Ethnicity','Family_mem_with_ASD','Class/ASD Traits ','Sex','Jaundice']
for col in columns:
    data[col] = le.fit_transform(data[col])
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Class/ASD Traits
0,0,0,0,0,0,0,1,1,0,1,28,0,8,1,0,0
1,1,1,0,0,0,1,1,0,0,0,36,1,5,1,0,1
2,1,0,0,0,0,0,1,1,0,1,36,1,8,1,0,1
3,1,1,1,1,1,1,1,1,1,1,24,1,0,0,0,1
4,1,1,0,1,1,1,1,1,1,1,20,0,5,0,1,1


### Split data into X & Y

In [7]:
X = data.drop(['Class/ASD Traits '], axis = 1)
Y = data['Class/ASD Traits ']

### Split data into train - test set

In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 7)

# Logistic Regression

### Initialize models

In [9]:
logmodel = LogisticRegression()
logmodel.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Evaluation metrics

In [19]:
logmodel.score(x_train, y_train)

0.9893238434163701

In [20]:
logmodel.score(x_test, y_test)

0.9715639810426541

### Creating Summary Table

#### Find the intercept and coefficients

In [21]:
logmodel.intercept_

array([-3.37139183])

In [22]:
logmodel.coef_

array([[ 1.94256662,  2.61569425,  1.75426844,  2.01204493,  1.81842255,
         2.14192351,  2.02282381,  2.52951468,  2.7222153 ,  1.78158316,
        -0.0561083 , -0.12934953, -0.32711568,  0.31666977, -0.10667457]])

In [26]:
feature_name = X.columns.values
feature_name

array(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10',
       'Age_Mons', 'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD'],
      dtype=object)

In [27]:
summary_table = pd.DataFrame(columns=['feature_name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(logmodel.coef_)

In [28]:
summary_table

Unnamed: 0,feature_name,Coefficient
0,A1,1.942567
1,A2,2.615694
2,A3,1.754268
3,A4,2.012045
4,A5,1.818423
5,A6,2.141924
6,A7,2.022824
7,A8,2.529515
8,A9,2.722215
9,A10,1.781583


In [30]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', logmodel.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,feature_name,Coefficient
0,Intercept,-3.371392
2,A1,1.942567
3,A2,2.615694
4,A3,1.754268
5,A4,2.012045
6,A5,1.818423
7,A6,2.141924
8,A7,2.022824
9,A8,2.529515
10,A9,2.722215


In [31]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,feature_name,Coefficient,Odds_ratio
0,Intercept,-3.371392,0.034342
2,A1,1.942567,6.976634
3,A2,2.615694,13.676708
4,A3,1.754268,5.779218
5,A4,2.012045,7.478595
6,A5,1.818423,6.16213
7,A6,2.141924,8.515802
8,A7,2.022824,7.559642
9,A8,2.529515,12.547415
10,A9,2.722215,15.213988


In [32]:
summary_table.sort_values('Odds_ratio', ascending = False)

Unnamed: 0,feature_name,Coefficient,Odds_ratio
10,A9,2.722215,15.213988
3,A2,2.615694,13.676708
9,A8,2.529515,12.547415
7,A6,2.141924,8.515802
8,A7,2.022824,7.559642
5,A4,2.012045,7.478595
2,A1,1.942567,6.976634
6,A5,1.818423,6.16213
11,A10,1.781583,5.939252
4,A3,1.754268,5.779218


In [38]:
pred_log = logmodel.predict(x_test)

In [39]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, pred_log))
print(classification_report(y_test, pred_log))

[[ 57   5]
 [  1 148]]
              precision    recall  f1-score   support

           0       0.98      0.92      0.95        62
           1       0.97      0.99      0.98       149

   micro avg       0.97      0.97      0.97       211
   macro avg       0.98      0.96      0.97       211
weighted avg       0.97      0.97      0.97       211



### Interpreting Coefficients 

### Backward elimination

The idea is that we can simplify our model by removing all features which have close to no contribution to the model. When we have the p-values, we get rid of all coefficients with p-values > 0.05

### Testing the model

In [34]:
predicted_proba = logmodel.predict_proba(x_test)
predicted_proba

array([[2.93222281e-01, 7.06777719e-01],
       [9.92460518e-01, 7.53948164e-03],
       [9.89573445e-01, 1.04265545e-02],
       [3.64689509e-05, 9.99963531e-01],
       [3.75946866e-01, 6.24053134e-01],
       [2.04580953e-07, 9.99999795e-01],
       [1.57420117e-03, 9.98425799e-01],
       [5.47060833e-05, 9.99945294e-01],
       [8.83244223e-04, 9.99116756e-01],
       [3.96141738e-06, 9.99996039e-01],
       [3.72872158e-07, 9.99999627e-01],
       [1.18057149e-06, 9.99998819e-01],
       [5.54572594e-01, 4.45427406e-01],
       [2.42076561e-03, 9.97579234e-01],
       [5.01685785e-04, 9.99498314e-01],
       [5.67320127e-05, 9.99943268e-01],
       [5.30259179e-01, 4.69740821e-01],
       [9.14243204e-05, 9.99908576e-01],
       [6.62455443e-04, 9.99337545e-01],
       [4.25795145e-05, 9.99957420e-01],
       [6.98459283e-05, 9.99930154e-01],
       [8.38257252e-01, 1.61742748e-01],
       [9.95174353e-01, 4.82564668e-03],
       [2.69097429e-03, 9.97309026e-01],
       [5.138433

In [36]:
predicted_proba.shape

(211, 2)

In [37]:
predicted_proba[:,1]

array([7.06777719e-01, 7.53948164e-03, 1.04265545e-02, 9.99963531e-01,
       6.24053134e-01, 9.99999795e-01, 9.98425799e-01, 9.99945294e-01,
       9.99116756e-01, 9.99996039e-01, 9.99999627e-01, 9.99998819e-01,
       4.45427406e-01, 9.97579234e-01, 9.99498314e-01, 9.99943268e-01,
       4.69740821e-01, 9.99908576e-01, 9.99337545e-01, 9.99957420e-01,
       9.99930154e-01, 1.61742748e-01, 4.82564668e-03, 9.97309026e-01,
       4.86156619e-01, 9.92135011e-01, 4.19997611e-01, 9.99998670e-01,
       9.99949354e-01, 9.95197438e-01, 3.51413206e-03, 9.99999578e-01,
       9.95705038e-01, 1.24272125e-01, 9.88149623e-01, 8.48935463e-01,
       4.11730268e-03, 9.84072894e-01, 9.98286085e-01, 6.63075786e-03,
       9.99871279e-01, 4.50093044e-03, 1.17067576e-01, 9.23954555e-01,
       6.69200322e-01, 9.79188355e-01, 9.97291160e-01, 1.55205934e-01,
       8.48134787e-01, 6.25872136e-02, 9.99824574e-01, 9.98531316e-01,
       6.50044234e-03, 3.36038616e-02, 1.39316662e-02, 9.66577765e-01,
      

### Tuning parameters for LR

For better parameters we will apply GridSearch

In [10]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.01,0.1,1,10,100,1000]}

In [11]:
grid_log = GridSearchCV(LogisticRegression(), param_grid, refit=True)

### Train - Test Split

In [12]:
grid_log.fit(x_train, y_train)
grid_log.best_estimator_

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [13]:
pred_log = grid_log.predict(x_test)

### Evaluation Metrics

In [15]:
grid_log.fit(x_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, pred_log))
print(classification_report(y_test, pred_log))

[[ 62   0]
 [  0 149]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        62
           1       1.00      1.00      1.00       149

   micro avg       1.00      1.00      1.00       211
   macro avg       1.00      1.00      1.00       211
weighted avg       1.00      1.00      1.00       211

