In [13]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.cross_validation import cross_val_score
#returns record array where columns are accesible using column names
dta = sm.datasets.fair.load_pandas().data
# add "affair" column: 1 represents having affairs, 0 represents not
dta['affair'] = (dta.affairs > 0).astype(int)
y, X = \
dmatrices('affair ~ rate_marriage + age + yrs_married + children + religious + educ + C(occupation) + C(occupation_husb)',dta, return_type="dataframe")
X = X.rename(columns = {'C(occupation)[T.2.0]':'occ_2',
'C(occupation)[T.3.0]':'occ_3',
'C(occupation)[T.4.0]':'occ_4',
'C(occupation)[T.5.0]':'occ_5',
'C(occupation)[T.6.0]':'occ_6',
'C(occupation_husb)[T.2.0]':'occ_husb_2',
'C(occupation_husb)[T.3.0]':'occ_husb_3',
'C(occupation_husb)[T.4.0]':'occ_husb_4',
'C(occupation_husb)[T.5.0]':'occ_husb_5',
'C(occupation_husb)[T.6.0]':'occ_husb_6'})
y = np.ravel(y)
print(X.shape)
print(y.shape)

#Split into train, test
test_size = X.shape[0] // 7
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = test_size, random_state = 0)

#Create model, fit model and make predictions
lgr = LogisticRegression()
model = lgr.fit(train_x, train_y)
pred_y = model.predict(test_x)

#Evaluate performance of the model
print("Cross Validation Score with 1/5 fold cross validation is: ", cross_val_score(lgr, X, y, cv = 5).mean())

target_names = ['No affair', 'Had affair']
print(classification_report(test_y, pred_y, target_names=target_names))

precision, recall, threshold = precision_recall_curve(test_y, pred_y)
print("precision: {}, recall: {}, threshold: {}".format(precision  ,recall, threshold))
print("average_precision_score: ", average_precision_score(test_y, pred_y))      


(6366, 17)
(6366,)
Cross Validation Score with 1/5 fold cross validation is:  0.7249513738732067
             precision    recall  f1-score   support

  No affair       0.75      0.90      0.82       627
 Had affair       0.62      0.35      0.45       282

avg / total       0.71      0.73      0.71       909

precision: [0.31023102 0.62025316 1.        ], recall: [1.         0.34751773 0.        ], threshold: [0. 1.]
average_precision_score:  0.41796921410428145
