# Classification for Prediction

<p> Here, we want to classify cheaters. Today's data set reports the results of an old survey of married people. Each row corresponds to one person, and the columns report:
<ul>
<li><b>age, children</b>: age and number of children
<li><b>religious</b>: the person's religiousness
<li><b>educ</b>: the person's education level 
<li><b>occupation</b>: a code that identifies the person's occupation
<li><b>rate_marriage</b>: how the person rates his or her marriage
<li><b>yrs_married</b>: length of the marriage, in years
<li><b>affairs</b>: time spent, in hours/week, in extra-marital affairs 
</ul></p>
<p>That is all we need to know about the columns' meaning.</p>

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
%pylab inline

import sklearn as sk

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Load the survey data; the first CSV column becomes the DataFrame index.
df = pd.read_csv('affairs.csv', index_col=0)

# Clean Data

Make a binary attribute to indicate cheaters

In [4]:
# Binary target: 1.0 when any time was spent on affairs, 0.0 otherwise.
df['affairsBin'] = (df['affairs'] > 0).astype(float)

In [5]:
df.occupation.unique()

array([2, 3, 5, 4, 1, 6], dtype=int64)

Dummy variable for occupation

In [6]:
# One-hot encode the occupation code: the single column is replaced by one
# indicator column per observed value (codes 1 through 6).
df = pd.get_dummies(df,columns=['occupation'])

Make X and Y

In [7]:
# Feature matrix: every column except the raw target (affairs) and its
# binarised version (affairsBin), so the label cannot leak into the inputs.
X = df.drop(['affairsBin','affairs'],axis=1)

In [8]:
# Target vector: the 0.0/1.0 cheater indicator.
Y = df.affairsBin

# Hold-out sample

Split the data into 70% training and 30% test

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3,random_state = 0)


### Train on the training set

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters and a fixed seed,
# fitted on the training split only.
cl = RandomForestClassifier(random_state = 0)
cl.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [12]:
len(X_train)

4456

In [13]:
len(X_test)

1910

### predict on the test set

In [14]:
# Hard 0.0/1.0 class predictions for the held-out rows.
y_pred = cl.predict(X_test)

In [16]:
y_pred[:20]

array([ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  1.,  1.,  0.])

In [17]:
# Keep only the second predict_proba column, i.e. the estimated probability of
# the positive class (affairsBin == 1.0); these scores are what the AUC uses.
y_pred_proba = cl.predict_proba(X_test)[:,1]

In [18]:
y_pred_proba[:20]

array([ 1.        ,  0.        ,  0.2       ,  0.8       ,  0.165     ,
        0.1       ,  0.45      ,  0.        ,  0.9       ,  0.3       ,
        0.03      ,  0.        ,  0.13457949,  0.        ,  0.82833333,
        0.2       ,  0.1025641 ,  0.8       ,  0.525     ,  0.        ])

Why we need AUC:

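Every classification threshold yields a different confusion matrix, and therefore a different sensitivity and specificity. AUC summarizes the classifier's performance across all thresholds in a single number, so it does not depend on picking one particular threshold.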

### collect scores

#### Confusion matrix

In [21]:
from sklearn.metrics import confusion_matrix

In [22]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true labels and columns are the predicted labels.
confusion_matrix(Y_test,y_pred)

array([[1033,  270],
       [ 362,  245]], dtype=int64)

#### Accuracy

In [26]:
# Accuracy: fraction of test rows whose predicted label equals the true label.
(y_pred == Y_test).mean()

0.66910994764397902

#### Precision

In [29]:
import sklearn.metrics as met

In [30]:
# Precision: of the rows predicted as cheaters, the fraction that truly are.
met.precision_score(Y_test,y_pred)

0.47572815533980584

In [31]:
# Share of positives in the test set: the precision that labelling every row
# as a cheater would achieve, useful as a baseline for the score above.
Y_test.mean()

0.31780104712041884

#### Recall

In [32]:
# Recall: of the true cheaters, the fraction the classifier identified.
met.recall_score(Y_test,y_pred)

0.40362438220757824

#### AUC score

In [33]:
# AUC is computed from the predicted probabilities, not the hard labels,
# because it evaluates the ranking across all thresholds.
met.roc_auc_score(Y_test,y_pred_proba)

0.66339748217584316

# Cross-validation

In [34]:
nfolds = 10

In [35]:
from sklearn.model_selection import KFold

In [36]:
# Shuffled k-fold splitter; the fixed seed keeps the fold assignment repeatable.
kf = KFold(n_splits=nfolds,random_state=0,shuffle=True)

In [40]:
# Mean ROC AUC of the random forest over the folds defined by kf.
# The package was imported as `sk`; the bare name `sklearn` is never bound.
avgCV_AUC = sk.model_selection.cross_val_score(cl,X,Y,cv=kf,n_jobs=-1,scoring='roc_auc').mean()

# Which classifier obtains the highest performance?

Here is a list of classifiers

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# LogisticRegression is imported explicitly instead of being reached through
# sk.linear_model, which only works if that submodule happens to be loaded.
from sklearn.linear_model import LogisticRegression

# Candidate models, with default hyperparameters apart from n_jobs.
# NOTE(review): DecisionTreeClassifier appears twice while the imported
# BaggingClassifier and KNeighborsClassifier are unused -- confirm the intent.
clfs = [
    DecisionTreeClassifier(),
    RandomForestClassifier(n_jobs=-1),
    GaussianNB(),
    LogisticRegression(n_jobs=-1),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis(),
    MLPClassifier(),
    SVC(),
]

Let's find the best one in terms of average AUC

In [42]:
# Score every candidate by its mean cross-validated ROC AUC and keep the best.
# NOTE: the loop variable reuses the name `cl`, so the forest fitted earlier
# is no longer reachable under that name once this cell has run.
maxAUC = -1
bestCL = ""
for cl in clfs:
    # `sk` is the alias the package was imported under; `sklearn` itself is unbound.
    auc = sk.model_selection.cross_val_score(cl,X,Y,cv=kf,n_jobs=-1,scoring='roc_auc').mean()
    print (str(cl) + ' ' + str(auc))
    if auc > maxAUC:
        bestCL = cl
        maxAUC = auc
print ('Best is... ' + str(bestCL) + ' ' + str(maxAUC))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') 0.602785642139
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 0.670491857704
GaussianNB(priors=None) 0.71089027613
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
        