# From Data to Action: Machine Learning Approaches for Predicting Tobacco-Free Policy Implementation in Schools 
## Logistic Regression - High Correlation Features from Heat Map

Loading imputed dataset

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv
from autoviz.AutoViz_Class import AutoViz_Class
import statsmodels.api as sm
import scipy.stats as st

# Folder (relative to the current working directory) that receives every saved figure.
IMAGES_PATH = Path("plots")
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        Used both as the file suffix and as the ``format`` passed to ``savefig``.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    if tight_layout:
        plt.tight_layout()
    target = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    plt.savefig(target, format=fig_extension, dpi=resolution)
# Read the imputed dataset into a single dataframe and preview its first rows
df = pd.read_csv(filepath_or_buffer='/main/tobaccoFree/data/imputed_data.csv')

df.head()

Heat map for knn imputed data

In [None]:
# Absolute pairwise correlations of the numeric columns, annotated on one large canvas
plt.figure(figsize=(80, 60))
abs_corr = df.corr(numeric_only=True).abs()
sns.heatmap(abs_corr, annot=True, annot_kws={"size": 6}, cmap="YlGnBu")
save_fig('Heat Map_imputed')

Filtering to only use the top 3 correlated variables chosen for model recreation

In [None]:
# Feature subset (plus the 'tobaccoFree' target) used for the model.
# NOTE(review): the names below mix the `prcpl…` and `prpcl…` prefixes and include
# "EngMedium_Maratni"; confirm every spelling against the CSV header.
selected_columns = [
    "Dist_Chandrapur", "Dist_Yavatmal",
    "Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8", "Q9", "Q10", "Q11",
    "totalCriteria", "rationYAAY", "HDIRS2001_High",
    "EngMedium_Eng", "EngMedium_Maratni", "SchoolInfra_Pucca",
    "localTrust_active", "localTrust_passive",
    "ruleFollow_passive", "ruleFollow_proactive",
    "remark_active", "remark_passive",
    "IMR_2010",
    "prpclSpouseOccu_House Wife", "prcplSpouseOccu_Officer", "prpclSpouseOccu_Service",
    "percPassSSC", "haveInternet", "prcplIfChild",
    "prcplSpouseEdu_BED", "prpclSpouseEdu_HSC",
    "sports_District Level", "sports_Regional",
    "prcplSpouseEdu_SSC", "prpclSpouseEdu_none",
    "prcplSpouseWork", "prcplEdu_HSC", "prpclSchemeOther",
    "prcplInternetSavy", "prcpllchoolAward",
    "staffNotForceStopTbcco", "staffRecievdTrainTF",
    "isChangeDifficult", "mstTeachNotInterest", "mstTeachPrivateLessons_4.0",
    "mstTeachGoodAcademics", "mstTeachParticpateExCurr",
    "mstParentsSchoolEvents", "parentsTbccoBigThreat",
    "teachNumTraining", "parentEduLevl_secondary",
    "numBlank", 'tobaccoFree',
]

# DataFrame.filter(items=...) silently skips labels that are not in the frame, so a
# misspelled name would drop a feature without any error. Report such names here.
missing_columns = [col for col in selected_columns if col not in df.columns]
if missing_columns:
    print(
        f"WARNING: {len(missing_columns)} requested column(s) are not in df "
        f"and were skipped: {missing_columns}"
    )

df_filter = df.filter(selected_columns)
df_filter.info(verbose=True, show_counts=True)

In [None]:
# Preview the first rows of the filtered frame
df_filter.head()

AutoViz library plots for features

In [None]:
# %matplotlib inline
# Run AutoViz on the filtered frame with 'tobaccoFree' as the dependent variable;
# charts are requested in HTML format under plots/html.
AV = AutoViz_Class()
viz = AV.AutoViz(
    filename='',
    dfte=df_filter,
    sep=',',
    depVar='tobaccoFree',
    chart_format='html',
    verbose=2,
    save_plot_dir='plots/html',
)

## Preparing for the model

Getting x and y variables

In [None]:
from sklearn.model_selection import train_test_split

np.random.seed(42)  # seed NumPy's global random generator

# Target is the 'tobaccoFree' column; every other filtered column is a feature.
y = df_filter['tobaccoFree']
X = df_filter.drop(columns=['tobaccoFree'])



Original frequency histogram

In [None]:
# One histogram per feature column, before any scaling is applied
hist_size = (30, 20)
X.hist(figsize=hist_size)
save_fig("unscaled_bar_plot")
plt.show()

Scaling features data

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# fit_transform returns a bare NumPy array. Rebuild the frame with the original column
# names and index so that (a) plots and the statsmodels summary show real feature names,
# (b) rows stay aligned with `y`, and (c) a later string-named constant column does not
# leave the frame with mixed int/str column labels, which scikit-learn rejects.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split, so
# test-set statistics influence the scaling — consider fitting on the training rows only.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

Scaled frequency histogram

In [None]:
# One histogram per feature column, after standardization
hist_size = (30, 20)
X.hist(figsize=hist_size)
save_fig("scaled_bar_plot")
plt.show()

*Scaling with standardization sets each feature's mean to 0 and standard deviation to 1, which helps the optimizer converge faster.*

## Training Model

### Statsmodels Logistic Regression Model

Adding constant to x

In [None]:
# sm.Logit does not add an intercept on its own, so add a constant column to X
X = sm.add_constant(X)

train test split

In [None]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so the split is
# the same on every run of this cell, regardless of how much of NumPy's global random
# stream earlier cells have consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Training model

In [None]:
# Logistic regression on the training rows, optimized with BFGS
smmodel = sm.Logit(endog=y_train, exog=X_train)
result = smmodel.fit(method='bfgs')


Model Summary

*The summary warns of possible quasi-complete separation, most likely caused by variables with high levels of collinearity. Those variables should be dropped.*

In [None]:
# Print the fitted model's summary table
print(result.summary())

In [None]:
from sklearn.metrics import accuracy_score

# Model output for the held-out rows, rounded to whole-number class labels
yhat = result.predict(X_test)
prediction = [round(prob) for prob in yhat]

print('Actual values', list(y_test.values)) 
print('Predictions :', prediction) 
print('Test accuracy = ', accuracy_score(y_test, prediction))

*Refresh before running next model*

### Sklearn Logistic Regression Model

train test split

In [None]:

# LogisticRegression fits its own intercept, so drop the statsmodels constant column if
# it is present (errors="ignore" keeps this cell runnable when it is not).
X_sklearn = X.drop(columns=["const"], errors="ignore")

# Hold out 20% of the rows for testing. random_state pins the shuffle so this split does
# not depend on how much of NumPy's global random stream earlier cells have consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X_sklearn, y, test_size=0.2, random_state=42
)



Training model

In [None]:
from sklearn.linear_model import LogisticRegression

# Allow up to 1000 solver iterations when fitting on the training rows
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

Get accuracy scores

In [None]:
# Mean accuracy on the training rows and on the held-out rows
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f'Training set score: {train_score:.4f}')

print(f'Test set score: {test_score:.4f}')

Model Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Hard class predictions for the held-out rows, tabulated against the true labels
y_pred_test = model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_test)

Plotting confusion matrix heatmap

In [None]:
# sklearn's confusion_matrix puts the ACTUAL class on the rows and the PREDICTED class on
# the columns, with class labels in ascending order (0 first, then 1). cm[0, 0] is
# therefore the true-negative count, so the negative class must be labelled first and
# rows must be labelled "Actual".
cm_matrix = pd.DataFrame(
    data=cm,
    index=['Actual Negative:0', 'Actual Positive:1'],
    columns=['Predicted Negative:0', 'Predicted Positive:1'],
)

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
save_fig('Heat Map_confusion')

Complete accuracy scores

In [None]:
# Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]]
TN, FP, FN, TP = cm.ravel()

# Derived rates, each computed once and reused in the report below
total = float(TP+TN+FP+FN)
accuracy = (TP+TN)/total
sensitivity = TP/float(TP+FN)
specificity = TN/float(TN+FP)
ppv = TP/float(TP+FP)
npv = TN/float(TN+FN)

print(
    'The acuuracy of the model = TP+TN/(TP+TN+FP+FN) = ', accuracy, '\n',
    'The Missclassification = 1-Accuracy = ', 1-accuracy, '\n',
    'Sensitivity or True Positive Rate = TP/(TP+FN) = ', sensitivity, '\n',
    'Specificity or True Negative Rate = TN/(TN+FP) = ', specificity, '\n',
    'Positive Predictive value = TP/(TP+FP) = ', ppv, '\n',
    'Negative predictive Value = TN/(TN+FN) = ', npv, '\n',
    'Positive Likelihood Ratio = Sensitivity/(1-Specificity) = ', sensitivity/(1-specificity), '\n',
    'Negative likelihood Ratio = (1-Sensitivity)/Specificity = ', (1-sensitivity)/specificity,
)

In [None]:
# Per-class predicted probabilities for the held-out rows, shown as a labelled table
proba_columns = ['Prob of no TFS (0)', 'Prob of TFS (1)']
y_pred_prob = model.predict_proba(X_test)
y_pred_prob_df = pd.DataFrame(y_pred_prob, columns=proba_columns)
y_pred_prob_df.head()

ROC Plot + Area Under Curve score

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Column 1 of predict_proba is used as the score for the ROC curve and the AUC
y_pred_prob_yes = model.predict_proba(X_test)
positive_scores = y_pred_prob_yes[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

plt.plot(fpr, tpr)
plt.title('ROC curve for TFS classifier')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.grid(True)
save_fig('roc')

roc_auc_score(y_test, positive_scores)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Area under the ROC curve, scored with column 1 of predict_proba
y_pred_prob_yes = model.predict_proba(X_test)
positive_scores = y_pred_prob_yes[:, 1]
roc_auc_score(y_test, positive_scores)