In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score

#loading data
data = sm.datasets.fair.load_pandas().data

# add "affair" column: 1 represents having affairs, 0 represents not
data['affair'] = (dta.affairs > 0).astype(int)

#Prepare Data for Logistic Regression
#To prepare the data, I want to add an intercept column as well as dummy variables for occupation
# and occupation_husb, since I'm treating them as categorial variables. 
#The dmatrices function from the patsy module can do that using formula language.
y, X = dmatrices('affair ~ rate_marriage + age + yrs_married + children + \
religious + educ + C(occupation) + C(occupation_husb)',
data, return_type="dataframe")

#rename the columns
X = X.rename(columns = {'C(occupation)[T.2.0]':'occ_2',
'C(occupation)[T.3.0]':'occ_3',
'C(occupation)[T.4.0]':'occ_4',
'C(occupation)[T.5.0]':'occ_5',
'C(occupation)[T.6.0]':'occ_6',
'C(occupation_husb)[T.2.0]':'occ_husb_2',
'C(occupation_husb)[T.3.0]':'occ_husb_3',
'C(occupation_husb)[T.4.0]':'occ_husb_4',
'C(occupation_husb)[T.5.0]':'occ_husb_5',
'C(occupation_husb)[T.6.0]':'occ_husb_6'})
y = np.ravel(y)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


# Fitting Logistic Regression to the Training set
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

# check the accuracy on the training set
print("accuracy of the training set: ",end=" ")
print(classifier.score(X, y))

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
print("confusion matraix:")
conf_m = confusion_matrix(y_test, y_pred)
print(cm)

# generate evaluation metrics
from sklearn.metrics import r2_score

print("accuracy of model:",end=" ")
print(metrics.accuracy_score(y_test, y_pred))
print("classification report: ")
print(metrics.classification_report(y_test, y_pred))

print("the coeficients of features\n ",classifier.coef_)



accuracy of the training set:  0.7260446120012567
confusion matraix:
[[993 107]
 [316 176]]
accuracy of model: 0.7342964824120602
classification report: 
             precision    recall  f1-score   support

        0.0       0.76      0.90      0.82      1100
        1.0       0.62      0.36      0.45       492

avg / total       0.72      0.73      0.71      1592

the coeficients of features
  [[ 1.36544156  0.34345773  0.62862746  0.35430877  0.9664653   0.96382034
   0.09092041  0.1621336   0.05792179  0.04599074  0.03476078 -0.69261696
  -0.05349213  0.10635748  0.00655275 -0.36616779  0.00906973]]
