In [2]:
#Classification Problem: To predict the classification for each woman - Having an affair or not

#Import the required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

#scikit-learn 0.18 moved these helpers to `model_selection` and 0.20 removed
#`cross_validation` entirely; fall back to the old module only on legacy installs.
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    from sklearn.cross_validation import cross_val_score, train_test_split

#Fetch the "fair" dataset bundled with statsmodels and keep its pandas DataFrame
fair_dataset = sm.datasets.fair.load_pandas()
dta = fair_dataset.data

#Preview the first five rows
dta.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


In [7]:
#Derive the binary target from the 'affairs' column: 1 when it is positive, 0 otherwise
dta['affair'] = (dta['affairs'] > 0).astype(int)

#Let patsy build the outcome frame (y) and the predictor frame (X) as DataFrames.
#The formula adds an intercept column, and C(...) expands the two categorical
#occupation fields into dummy variables.
y, X = dmatrices(
    'affair ~ rate_marriage + age + yrs_married + children + religious + educ + C(occupation) + C(occupation_husb)',
    dta,
    return_type="dataframe",
)

#Shorten the generated dummy column names for levels 2..6 of both occupation fields,
#e.g. 'C(occupation)[T.2.0]' -> 'occ_2' and 'C(occupation_husb)[T.2.0]' -> 'occ_husb_2'
dummy_names = {
    'C(%s)[T.%d.0]' % (column, level): '%s_%d' % (prefix, level)
    for column, prefix in (('occupation', 'occ'), ('occupation_husb', 'occ_husb'))
    for level in range(2, 7)
}
X = X.rename(columns=dummy_names)

In [62]:
#Collapse the outcome into a flat 1-D array of labels
y = np.asarray(y).ravel()

#Create the logistic regression classifier and train it on every row of X and y
model = LogisticRegression().fit(X, y)

#Accuracy measured on the same rows the model was trained on (no hold-out yet)
full_data_accuracy = model.score(X, y)
print("The accuracy of the model on entire data set is: ", full_data_accuracy)

The accuracy of the model on entire data set is:  0.72588752749


In [35]:
#Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

#Refit the existing `model` object on the training rows only (this replaces its
#earlier fit in place), then label the held-out rows
predictions = model.fit(X_train, y_train).predict(X_test)
print("The predicted valuse for the model are: ", predictions)

#Class probabilities for the held-out rows; column 1 is used below as the score for class 1
probs = model.predict_proba(X_test)

#Evaluation metrics on the held-out rows
accuracy = metrics.accuracy_score(y_test, predictions)
roc_auc = metrics.roc_auc_score(y_test, probs[:, 1])
print("The accuracy score is: %.2f" % accuracy)
print("The roc auc score is: %.2f" % roc_auc)

The predicted valuse for the model are:  [ 1.  0.  0. ...,  1.  0.  1.]
The accuracy score is: 0.74
The roc auc score is: 0.75


In [59]:
#Score a fresh, unfitted logistic regression with 10-fold cross-validation on accuracy
scores = cross_val_score(LogisticRegression(), X, y, cv=10, scoring='accuracy')

#Show the per-fold accuracies together with their mean
(scores, scores.mean())


(array([ 0.72100313,  0.70219436,  0.73824451,  0.70597484,  0.70597484,
         0.72955975,  0.7327044 ,  0.70440252,  0.75157233,  0.75      ]),
 0.7241630685514876)

In [56]:
#Predicting the probability of an affair for a woman who is not present in the dataset.
#She's a 25-year-old teacher who graduated college, has been married for 3 years, has 1 child,
#rates herself as strongly religious, rates her marriage as fair, and her husband is a farmer.
#The row is described by column name instead of a positional array, so it cannot drift out of
#step with the column order of X, and the model receives the same feature names it was fitted with.
profile = {
    'Intercept': 1,
    'occ_4': 1,          #her occupation dummy (teacher)
    'occ_husb_2': 1,     #husband's occupation dummy (farmer)
    'rate_marriage': 3,  #fair
    'age': 25,
    'yrs_married': 3,
    'children': 1,
    'religious': 4,      #strongly religious
    'educ': 16,          #college graduate
}

#reindex silently discards names it does not know, so reject typos explicitly
unknown_columns = set(profile) - set(X.columns)
if unknown_columns:
    raise KeyError("Unknown feature column(s): %s" % sorted(unknown_columns))

#Align to the design-matrix columns; every dummy column not listed above becomes 0
new_woman = pd.DataFrame([profile]).reindex(columns=X.columns, fill_value=0)

pred_prob = model.predict_proba(new_woman)

#Row 0 is the single woman; column 1 is reported as the probability of an affair
print("The predicted probability of an affair is: %.2f" %(pred_prob[0][1]))


The predicted probability of an affair is: 0.22


In [60]:
#Predicting whether a woman not present in dataset will have an affair or not - 0: Affair = No; 1: Affair = Yes
#She's a 25-year-old teacher who graduated college, has been married for 3 years, has 1 child,
#rates herself as strongly religious, rates her marriage as fair, and her husband is a farmer.
#The row is described by column name instead of a positional array, so it cannot drift out of
#step with the column order of X, and the model receives the same feature names it was fitted with.
profile = {
    'Intercept': 1,
    'occ_4': 1,          #her occupation dummy (teacher)
    'occ_husb_2': 1,     #husband's occupation dummy (farmer)
    'rate_marriage': 3,  #fair
    'age': 25,
    'yrs_married': 3,
    'children': 1,
    'religious': 4,      #strongly religious
    'educ': 16,          #college graduate
}

#reindex silently discards names it does not know, so reject typos explicitly
unknown_columns = set(profile) - set(X.columns)
if unknown_columns:
    raise KeyError("Unknown feature column(s): %s" % sorted(unknown_columns))

#Align to the design-matrix columns; every dummy column not listed above becomes 0
new_woman = pd.DataFrame([profile]).reindex(columns=X.columns, fill_value=0)

pred_1 = model.predict(new_woman)

#predict returns a 1-element array; format its scalar entry rather than the array itself,
#because implicit array-to-scalar conversion is deprecated in recent NumPy releases
print("The predicted value of an affair for given woman is: %.0f" % pred_1[0])

The predicted value of an affair for given woman is: 0
