In [125]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score

def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

#Import data

totalcol = pd.read_csv('totalcol.csv')

#Calculate the number of responses for those that have visited the library ("Yeses"), and those that haven't ("Nos")
Yeses = totalcol[totalcol['has_visited']=='Yes']
Nos = totalcol[totalcol['has_visited']=='No']
print("Yeses:", Yeses.shape)
print("Nos:",Nos.shape)
print("Total:", totalcol.shape)

#print column names
totalcol.columns

Yeses: (1639, 27)
Nos: (365, 27)
Total: (2004, 27)


Index(['Unnamed: 0', 'sex', 'age', 'marital', 'is_parent', 'education_level',
       'emplnw', 'disabled', 'party', 'ideology', 'race', 'income', 'hh1',
       'number_in_hh', 'reg_voter', 'email_use', 'mobile_phone', 'home_int',
       'broadband', 'smartphone', 'library_onsite', 'library_website',
       'visit_freq', 'educ2_ordinal', 'inc_ordinal', 'libusea_ordinal',
       'has_visited'],
      dtype='object')

In [126]:
#Assign label to y
y = totalcol['has_visited']

#remove duplicative columns (say the same thing as the label (causes leakage)), and label column (y)
columns_to_drop = ['library_onsite', 'library_website', 'visit_freq', 'has_visited','libusea_ordinal']

totalcol.drop(columns_to_drop, axis = 1, inplace = True, errors = 'ignore')
totalcol.columns



Index(['Unnamed: 0', 'sex', 'age', 'marital', 'is_parent', 'education_level',
       'emplnw', 'disabled', 'party', 'ideology', 'race', 'income', 'hh1',
       'number_in_hh', 'reg_voter', 'email_use', 'mobile_phone', 'home_int',
       'broadband', 'smartphone', 'educ2_ordinal', 'inc_ordinal'],
      dtype='object')

In [127]:
#convert remaining categorical columns to dummy values (1,0)

columns_to_shift = ['sex','income','marital', 'is_parent', 'education_level', 'emplnw', 'disabled', 'party', 'race','ideology', 'number_in_hh', 'reg_voter', 'email_use', 'mobile_phone', 'home_int', 'broadband', 'smartphone']

totalcol_origin = pd.get_dummies(totalcol, columns=columns_to_shift)

In [128]:
#Divide dataset assign dataframe to variable X (features)

X = pd.DataFrame(totalcol_origin)
X.head()

Unnamed: 0.1,Unnamed: 0,age,hh1,educ2_ordinal,inc_ordinal,sex_Female,sex_Male,income_$100K-$150K,income_$10K-$20K,income_$150K+,...,broadband_Cell Phone Only,broadband_Dial-Up,broadband_Higher Speed,broadband_None,broadband_Refused,broadband_Unknown,smartphone_No,smartphone_Refused,smartphone_Unknown,smartphone_Yes
0,0,30,1,4,2,0,1,0,1,0,...,0,0,1,0,0,0,0,0,0,1
1,1,77,1,6,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,2,45,4,6,2,0,1,0,1,0,...,0,0,1,0,0,0,0,0,0,1
3,3,23,4,6,7,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,4,25,1,6,2,0,1,0,1,0,...,0,0,1,0,0,0,0,0,0,1


In [129]:
#Train/Test Split

train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=0.7, test_size = 0.30)

# Train and Test dataset size details
print("Train_x Shape :: ", train_x.shape)
print("Train_y Shape :: ", train_y.shape)
print("Test_x Shape :: ", test_x.shape)
print("Test_y Shape :: ", test_y.shape)
 

Train_x Shape ::  (1402, 109)
Train_y Shape ::  (1402,)
Test_x Shape ::  (602, 109)
Test_y Shape ::  (602,)


In [130]:
#Do a grid search to find best combination of parameters

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
                 'n_estimators': [5, 10, 15, 20],
                 'max_leaf_nodes': [40, 50, 60, 70],
             }
clf = RandomForestClassifier(max_features=60)

grid_clf = GridSearchCV(clf, param_grid, cv=10)
grid_clf.fit(train_x, train_y)

print("Grid Params:", grid_clf.best_params_)
print("Best Estimator:", grid_clf.best_estimator_)

Grid Params: {'max_leaf_nodes': 40, 'n_estimators': 20}
Best Estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=60, max_leaf_nodes=40,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [131]:
#run the model with folding and best parameters
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(n_estimators = 20, max_features=40, max_leaf_nodes=60, random_state=0)

# 10-Fold Cross validation
print(np.mean(cross_val_score(clf, train_x, train_y, cv=10)))



0.8245557717959372


In [104]:
#test models

clf.fit(train_x, train_y)
predictions = clf.predict(test_x)

# Train and Test Accuracy
print("Train Accuracy :: ", accuracy_score(train_y, clf.predict(train_x)))
print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
print("AUC_ROC:", multiclass_roc_auc_score(test_y, predictions))
pd.crosstab(test_y, predictions, rownames=['True'], colnames=['Predicted'], margins=True)

Train Accuracy ::  0.9065620542082738
Test Accuracy  ::  0.8205980066445183
AUC_ROC: 0.5649202303407911


Predicted,No,Yes,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,18,89,107
Yes,19,476,495
All,37,565,602


In [105]:
from sklearn.ensemble import AdaBoostClassifier

adclf = AdaBoostClassifier(n_estimators=18, learning_rate=1, random_state=0)

# Train model
model = adclf.fit(train_x, train_y)

adpredictions = adclf.predict(test_x)

# Train and Test Accuracy
print("Train Accuracy :: ", accuracy_score(train_y, adclf.predict(train_x)))
print("Test Accuracy  :: ", accuracy_score(test_y, adpredictions))
print("AUC_ROC:", multiclass_roc_auc_score(test_y, adpredictions))
pd.crosstab(test_y, adpredictions, rownames=['True'], colnames=['Predicted'], margins=True)

Train Accuracy ::  0.8373751783166904
Test Accuracy  ::  0.8222591362126246
AUC_ROC: 0.5769187199093742


Predicted,No,Yes,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,21,86,107
Yes,21,474,495
All,42,560,602


In [106]:
from sklearn.ensemble import GradientBoostingClassifier

gbclf = GradientBoostingClassifier(random_state=0)

gbclf.fit(train_x, train_y)

gbpredictions = gbclf.predict(test_x)

# Train and Test Accuracy
print("Train Accuracy :: ", accuracy_score(train_y, gbclf.predict(train_x)))
print("Test Accuracy  :: ", accuracy_score(test_y, gbpredictions))
print("AUC_ROC:", multiclass_roc_auc_score(test_y, gbpredictions))
pd.crosstab(test_y, gbpredictions, rownames=['True'], colnames=['Predicted'], margins=True)

Train Accuracy ::  0.8873038516405135
Test Accuracy  ::  0.8222591362126246
AUC_ROC: 0.5622675351647314


Predicted,No,Yes,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,17,90,107
Yes,17,478,495
All,34,568,602


In [107]:
from sklearn.neighbors import KNeighborsClassifier

knclf = KNeighborsClassifier(n_neighbors = 11)

knclf.fit(train_x,train_y)

knpredictions = knclf.predict(test_x)

# Train and Test Accuracy
print("Train Accuracy :: ", accuracy_score(train_y, knclf.predict(train_x)))
print("Test Accuracy  :: ", accuracy_score(test_y, knpredictions))
print("AUC_ROC:", multiclass_roc_auc_score(test_y, knpredictions))
pd.crosstab(test_y, knpredictions, rownames=['True'], colnames=['Predicted'], margins=True)

Train Accuracy ::  0.8231098430813124
Test Accuracy  ::  0.8189368770764119
AUC_ROC: 0.5199565750967621


Predicted,No,Yes,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,6,101,107
Yes,8,487,495
All,14,588,602
