In [34]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
import statsmodels.api as sm

In [35]:
# This dataset you have seen, but this time we will properly split the data from the actual
# dataset into two, and fit the model on the train dataset, and test on the test dataset.
# Then we will iterate through class thresholds, to see which threshold gives the best confusion
# matrix. The first steps have been done for you 
# (creating dummies, joining to df, creating y series and features only dataframe 
# but please be familiar with these first steps! 

df = pd.read_csv("../../assets/admissions.csv").dropna()
dummies = pd.get_dummies( df["prestige"], prefix = "prestige" )


join = df[ df.columns[0:3] ].join(dummies)
join["intercept"] = 1

print join.head(), join.shape

joinColumns = join.columns
y = join.admit
X = join[ joinColumns[1:3] + joinColumns[4:] ]

   admit    gre   gpa  prestige_1.0  prestige_2.0  prestige_3.0  prestige_4.0  \
0      0  380.0  3.61           0.0           0.0           1.0           0.0   
1      1  660.0  3.67           0.0           0.0           1.0           0.0   
2      1  800.0  4.00           1.0           0.0           0.0           0.0   
3      1  640.0  3.19           0.0           0.0           0.0           1.0   
4      0  520.0  2.93           0.0           0.0           0.0           1.0   

   intercept  
0          1  
1          1  
2          1  
3          1  
4          1   (397, 8)




In [36]:
import seaborn as sns
%matplotlib

#sns.lmplot('prestige_3.0', 'admit', join, logistic=True)
#sns.heatmap(join.corr())
join.corr()

Using matplotlib backend: MacOSX


<seaborn.axisgrid.FacetGrid at 0x11d50a810>

In [37]:
# split data randomly into datasets, 70% train, 30% test using test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [38]:
# fit the model using statsmodels.api.sm

logit = sm.Logit(y_train, X_train)
result = logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.540877
         Iterations 6


0,1,2,3
Dep. Variable:,admit,No. Observations:,277.0
Model:,Logit,Df Residuals:,271.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 30 Jun 2016",Pseudo R-squ.:,0.1002
Time:,12:35:43,Log-Likelihood:,-149.82
converged:,True,LL-Null:,-166.5
,,LLR p-value:,3.202e-06

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
gpa,1.0557,0.431,2.451,0.014,0.211 1.900
gre,0.0031,0.001,2.257,0.024,0.000 0.006
intercept,-5.8192,1.521,-3.827,0.000,-8.800 -2.839
prestige_2.0,-0.1663,0.397,-0.419,0.675,-0.944 0.612
prestige_3.0,-1.1084,0.439,-2.527,0.011,-1.968 -0.249
prestige_4.0,-1.1220,0.502,-2.234,0.025,-2.106 -0.138


In [39]:
# odds ratios only
print np.exp(result.params)

params = result.params
conf = result.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
print np.exp(conf)

gpa             2.873924
gre             1.003144
intercept       0.002970
prestige_2.0    0.846806
prestige_3.0    0.330093
prestige_4.0    0.325640
dtype: float64
                  2.5%     97.5%        OR
gpa           1.235445  6.685396  2.873924
gre           1.000413  1.005882  1.003144
intercept     0.000151  0.058510  0.002970
prestige_2.0  0.388925  1.843751  0.846806
prestige_3.0  0.139742  0.779731  0.330093
prestige_4.0  0.121679  0.871484  0.325640


In [40]:
# add y_test as a new column in X_test, and then make another dataframe called dfTrain
# and set it equal to X_test, after X_test has the new y_test column

X_test["actualAdmit"] = y_test
X_test.head()
dfTrain = X_test

print dfTrain.columns

# create a new column in dfTrain that is the predicted admitance value using the result logit model
# note you will need a dataframe with only the features (including intercept)
# note the dummy column has already been removed

dfTrain['predictedAdmit'] = result.predict( dfTrain[ dfTrain.columns[0:6] ] )

Index([u'gpa', u'gre', u'intercept', u'prestige_2.0', u'prestige_3.0',
       u'prestige_4.0', u'actualAdmit'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [41]:
# create a function call scale predictor, that will take two parameters called "prob" and "threshold"
# the function will check if the probability is greater than or equal to the threshold, 
# return 1, else return 0

def scalePredictor(prob, threshold):
    if( prob >= threshold ):
        return 1
    return 0

# create a while loop, starting at i = 0.30 and ending i <= 0.50, in increments of 0.01. 
# In this while loop you will create a new predictionAdmit_Threshold column in every iteration
# this column will be populated by using scalePredictor each time
# after the new column is populated, print out a confusion matrix (use crosstab (within the loop!) )
# note the first parameter in crosstab will always be dfTrain['actualAdmit'] while the second
# parameter will be the new column in that iteration
# interpret each iteration, and decide on the best threshold in each iteration.

i = 0.30
while i <= 0.50:
    
    dfTrain[ 'predictedAdmit_{}'.format(i) ] = dfTrain.predictedAdmit.apply(
        lambda prob: scalePredictor(prob,i) )
    
    print pd.crosstab(
        dfTrain['actualAdmit'],
        dfTrain['predictedAdmit_{}'.format(i)], 
        rownames=['admit']
    )
    i += 0.01


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


predictedAdmit_0.3   0   1
admit                     
0                   39  35
1                   17  29
predictedAdmit_0.31   0   1
admit                      
0                    42  32
1                    18  28
predictedAdmit_0.32   0   1
admit                      
0                    44  30
1                    19  27
predictedAdmit_0.33   0   1
admit                      
0                    44  30
1                    20  26
predictedAdmit_0.34   0   1
admit                      
0                    46  28
1                    21  25
predictedAdmit_0.35   0   1
admit                      
0                    48  26
1                    21  25
predictedAdmit_0.36   0   1
admit                      
0                    52  22
1                    21  25
predictedAdmit_0.37   0   1
admit                      
0                    54  20
1                    23  23
predictedAdmit_0.38   0   1
admit                      
0                    56  18
1                    24 