In [1]:
import numpy as np 
import pandas as pd 
import statsmodels.api as sm
import matplotlib.pyplot as plt 
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression # logistic-regression classifier from scikit-learn
from sklearn.model_selection import train_test_split # splits data into train and test subsets
from sklearn import metrics # metrics used to check the performance of our model
from sklearn.model_selection import cross_val_score # helper for cross-validating a model
data = sm.datasets.fair.load_pandas().data # load the `fair` dataset shipped with statsmodels as a pandas DataFrame

  from pandas.core import datetools


In [2]:
# Preview the first rows of the freshly loaded frame to check its columns.
data.head() 

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


In [3]:
# Binary target: 1 when the `affairs` measure is strictly positive, 0 otherwise.
data['affair'] = data['affairs'].gt(0).astype(int)
data.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs,affair
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111,1
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769,1
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4,1
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273,1
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666,1


In [4]:
# The occupation codes are stored as floats; cast them to int so the dummy
# column names come out as `occ_2` rather than `occ_2.0`.
data['occupation'] = data['occupation'].astype(int)
data['occupation_husb'] = data['occupation_husb'].astype(int)

# One-hot encode both categorical columns. NOTE: get_dummies is called without
# `drop_first`, so a variable with n levels yields n indicator columns (not
# n-1) -- no reference level is removed here.
# `dtype=int` pins the 0/1 integer encoding; without it recent pandas
# releases return boolean columns instead.
data_occup = pd.get_dummies(data['occupation'], prefix='occ', dtype=int)
data_occup_husb = pd.get_dummies(data['occupation_husb'], prefix='occ_hus', dtype=int)

print(data_occup.head())
print(100*"-")
print(data_occup_husb.head())

   occ_1  occ_2  occ_3  occ_4  occ_5  occ_6
0      0      1      0      0      0      0
1      0      0      1      0      0      0
2      0      0      1      0      0      0
3      0      0      0      0      1      0
4      0      0      1      0      0      0
----------------------------------------------------------------------------------------------------
   occ_hus_1  occ_hus_2  occ_hus_3  occ_hus_4  occ_hus_5  occ_hus_6
0          0          0          0          0          1          0
1          0          0          0          1          0          0
2          0          0          0          0          1          0
3          0          0          0          0          1          0
4          0          0          0          1          0          0


In [5]:
# Append the occupation indicator columns to the main frame.
# Any copies of those columns already present in `data` are dropped first, so
# re-running this cell does not pile up duplicate dummy columns.
dummy_frames = [data_occup, data_occup_husb]
dummy_cols = [col for frame in dummy_frames for col in frame.columns]
data = pd.concat(
    [data.drop(columns=dummy_cols, errors='ignore')] + dummy_frames,
    axis=1,
)
data.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs,affair,...,occ_3,occ_4,occ_5,occ_6,occ_hus_1,occ_hus_2,occ_hus_3,occ_hus_4,occ_hus_5,occ_hus_6
0,3.0,32.0,9.0,3.0,3.0,17.0,2,5,0.111111,1,...,0,0,0,0,0,0,0,0,1,0
1,3.0,27.0,13.0,3.0,1.0,14.0,3,4,3.230769,1,...,1,0,0,0,0,0,0,1,0,0
2,4.0,22.0,2.5,0.0,1.0,16.0,3,5,1.4,1,...,1,0,0,0,0,0,0,0,1,0
3,4.0,37.0,16.5,4.0,3.0,16.0,5,5,0.727273,1,...,0,0,1,0,0,0,0,0,1,0
4,5.0,27.0,9.0,1.0,1.0,14.0,3,4,4.666666,1,...,1,0,0,0,0,0,0,1,0,0


In [6]:
# Remove the raw occupation codes now that their indicator columns exist.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'`
# keeps the cell re-runnable: a second execution is a no-op rather than a
# KeyError on the already-removed columns.
data = data.drop(columns=['occupation', 'occupation_husb'], errors='ignore')
print(data.columns.values) # remaining column names
print(100*'-')
data.head()

['rate_marriage' 'age' 'yrs_married' 'children' 'religious' 'educ'
 'affairs' 'affair' 'occ_1' 'occ_2' 'occ_3' 'occ_4' 'occ_5' 'occ_6'
 'occ_hus_1' 'occ_hus_2' 'occ_hus_3' 'occ_hus_4' 'occ_hus_5' 'occ_hus_6']
----------------------------------------------------------------------------------------------------


Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,affairs,affair,occ_1,occ_2,occ_3,occ_4,occ_5,occ_6,occ_hus_1,occ_hus_2,occ_hus_3,occ_hus_4,occ_hus_5,occ_hus_6
0,3.0,32.0,9.0,3.0,3.0,17.0,0.111111,1,0,1,0,0,0,0,0,0,0,0,1,0
1,3.0,27.0,13.0,3.0,1.0,14.0,3.230769,1,0,0,1,0,0,0,0,0,0,1,0,0
2,4.0,22.0,2.5,0.0,1.0,16.0,1.4,1,0,0,1,0,0,0,0,0,0,0,1,0
3,4.0,37.0,16.5,4.0,3.0,16.0,0.727273,1,0,0,0,0,1,0,0,0,0,0,1,0
4,5.0,27.0,9.0,1.0,1.0,14.0,4.666666,1,0,0,1,0,0,0,0,0,0,1,0,0


In [7]:
# Remove the continuous `affairs` column from the frame.
# Reassigning with `errors='ignore'` (instead of `inplace=True`) means
# re-running this cell does not raise a KeyError once the column is gone.
data = data.drop(columns=['affairs'], errors='ignore')
print(data.columns.values)
data.head()

['rate_marriage' 'age' 'yrs_married' 'children' 'religious' 'educ' 'affair'
 'occ_1' 'occ_2' 'occ_3' 'occ_4' 'occ_5' 'occ_6' 'occ_hus_1' 'occ_hus_2'
 'occ_hus_3' 'occ_hus_4' 'occ_hus_5' 'occ_hus_6']


Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,affair,occ_1,occ_2,occ_3,occ_4,occ_5,occ_6,occ_hus_1,occ_hus_2,occ_hus_3,occ_hus_4,occ_hus_5,occ_hus_6
0,3.0,32.0,9.0,3.0,3.0,17.0,1,0,1,0,0,0,0,0,0,0,0,1,0
1,3.0,27.0,13.0,3.0,1.0,14.0,1,0,0,1,0,0,0,0,0,0,1,0,0
2,4.0,22.0,2.5,0.0,1.0,16.0,1,0,0,1,0,0,0,0,0,0,0,1,0
3,4.0,37.0,16.5,4.0,3.0,16.0,1,0,0,0,0,1,0,0,0,0,0,1,0
4,5.0,27.0,9.0,1.0,1.0,14.0,1,0,0,1,0,0,0,0,0,0,1,0,0


In [8]:
# Column groups that make up the design matrix.
base_cols = ['rate_marriage', 'age', 'yrs_married', 'children', 'religious', 'educ']
wife_occ_cols = ['occ_1', 'occ_2', 'occ_3', 'occ_4', 'occ_5', 'occ_6']
husb_occ_cols = ['occ_hus_1', 'occ_hus_2', 'occ_hus_3',
                 'occ_hus_4', 'occ_hus_5', 'occ_hus_6']

# Independent variables (x) and the single-column dependent variable (y).
x = data[base_cols + wife_occ_cols + husb_occ_cols]
y = data[['affair']]

print(x.head())  # first rows of the features
print(y.head())  # first rows of the target

   rate_marriage   age  yrs_married  children  religious  educ  occ_1  occ_2  \
0            3.0  32.0          9.0       3.0        3.0  17.0      0      1   
1            3.0  27.0         13.0       3.0        1.0  14.0      0      0   
2            4.0  22.0          2.5       0.0        1.0  16.0      0      0   
3            4.0  37.0         16.5       4.0        3.0  16.0      0      0   
4            5.0  27.0          9.0       1.0        1.0  14.0      0      0   

   occ_3  occ_4  occ_5  occ_6  occ_hus_1  occ_hus_2  occ_hus_3  occ_hus_4  \
0      0      0      0      0          0          0          0          0   
1      1      0      0      0          0          0          0          1   
2      1      0      0      0          0          0          0          0   
3      0      0      1      0          0          0          0          0   
4      1      0      0      0          0          0          0          1   

   occ_hus_5  occ_hus_6  
0          1          0  
1   

In [9]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

# Turn the one-column target frames into flat 1-D arrays.
y_train, y_test = y_train['affair'].values, y_test['affair'].values

# Report the shape of every piece of the split.
for label, split in (
    ("x_train: ", x_train),
    ("x_test: ", x_test),
    ("y_train: ", y_train),
    ("y_test: ", y_test),
):
    print(label, split.shape)

x_train:  (4774, 18)
x_test:  (1592, 18)
y_train:  (4774,)
y_test:  (1592,)


In [10]:
# Build the logistic-regression classifier.
# The solver is pinned to 'liblinear' -- the one shown in this cell's recorded
# output -- because the library default differs between scikit-learn releases
# (newer ones use 'lbfgs'), which would otherwise change the fitted model.
regressor = LogisticRegression(solver='liblinear')
regressor.fit(x_train, y_train) # fit on the training split; the fitted estimator is the cell's output

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Predict class labels for the held-out rows.
y_pred = regressor.predict(x_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("accuracy of our model is ", accuracy)

# Confusion matrix (rows: true class, columns: predicted class), shown as the cell's value.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

accuracy of our model is  0.719221105528


array([[948, 107],
       [340, 197]], dtype=int64)