# Binary Predictors (Logistic Regression)

### Import Library

In [77]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

### Load Data

In [78]:
# NOTE: absolute, machine-specific Windows path - edit it to point at your own copy of the CSV.
file_path = 'C:/Users/iolley2/Desktop/DS Contd/Logistic Regression/Examples/2.02.+Binary+predictors.csv'
# Read the CSV into a DataFrame and preview its first rows.
raw_data = pd.read_csv(file_path)
raw_data.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male


In [79]:
# Derive the working frame from raw_data without modifying it: DataFrame.assign
# returns a new frame in which both text columns are replaced by 0/1 codes.
data = raw_data.assign(
    Admitted=raw_data['Admitted'].map({'Yes':1,'No':0}),
    Gender=raw_data['Gender'].map({'Female':1,'Male':0}),
)
data.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0


### Variables

In [80]:
# Dependent variable: the admission outcome, encoded above as Yes=1 / No=0.
y = data['Admitted']
# Single binary predictor: Gender, encoded above as Female=1 / Male=0.
x1 = data['Gender']

### Regression

In [81]:
# sm.Logit does not add an intercept by itself, so prepend a column of ones.
x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
# Fit the model; statsmodels prints the optimisation report shown in the cell output.
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.572260
         Iterations 5


  x = pd.concat(x[::order], 1)


In [82]:
results_log.summary()

0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 04 Apr 2022",Pseudo R-squ.:,0.1659
Time:,13:18:29,Log-Likelihood:,-96.14
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,6.283e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6436,0.222,-2.901,0.004,-1.078,-0.209
Gender,2.0786,0.363,5.727,0.000,1.367,2.790


Interpreting 'b1' coefficients

The log of the odds ratio (the odds of a female being admitted relative to the odds of a male being admitted) is 2.08

In [83]:
# Odds ratio for Gender: exponentiate the fitted coefficient itself rather than a
# value retyped from the summary table, so the figure stays in sync with the model
# (expect a few more significant digits than the rounded 2.0786 produced).
np.exp(results_log.params['Gender'])

7.993270498536442

The odds of a female being admitted are 7.99 times the odds of a male being admitted

## New Regression

Including the SAT score alongside gender

In [84]:
# Same 0/1 admission outcome as in the first regression.
y2 = data['Admitted']
# Two predictors this time: the SAT score and the 0/1 Gender indicator.
x12 = data[['SAT','Gender']]

In [85]:
# Prepend the intercept column to the two-predictor design matrix.
x2 = sm.add_constant(x12)
reg_log2 = sm.Logit(y2,x2)
# Fit the SAT + Gender model; the optimisation report is printed in the cell output.
results_log2 = reg_log2.fit()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


  x = pd.concat(x[::order], 1)


In [86]:
results_log2.summary()

0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 04 Apr 2022",Pseudo R-squ.:,0.8249
Time:,13:18:30,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
SAT,0.0406,0.010,4.129,0.000,0.021,0.060
Gender,1.9449,0.846,2.299,0.022,0.287,3.603


The log-likelihood is higher (-20.18 versus -96.14 for the gender-only model), which indicates that this model fits the data better

Gender is still significant

In [87]:
# Odds ratio for Gender at a fixed SAT score: read the coefficient from the fitted
# model instead of retyping it, so the value cannot drift from the regression
# (expect a few more significant digits than the rounded 1.9449 produced).
np.exp(results_log2.params['Gender'])

6.992932526814459

Given the same SAT score, the odds of a female being admitted are about 7 times the odds of a male being admitted (this is a ratio of odds, not of probabilities)

It could mean that some of the courses students are being admitted to are mostly taken by females

### Accuracy

Predicted values

In [88]:
# Print floats in arrays with two decimals. This changes NumPy's print settings for
# the rest of the session, so later array outputs are formatted the same way.
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})
# Called without arguments, predict() returns the fitted values for the training data.
results_log2.predict()

array([0.00, 1.00, 1.00, 0.23, 0.02, 0.99, 1.00, 1.00, 1.00, 0.01, 1.00,
       1.00, 0.76, 0.00, 0.60, 1.00, 0.11, 0.12, 0.51, 1.00, 1.00, 1.00,
       0.00, 0.01, 0.97, 1.00, 0.48, 0.99, 1.00, 0.99, 0.00, 0.83, 0.25,
       1.00, 1.00, 1.00, 0.31, 1.00, 0.23, 0.00, 0.02, 0.45, 1.00, 0.00,
       0.99, 0.00, 0.99, 0.00, 0.00, 0.01, 0.00, 1.00, 0.92, 0.02, 1.00,
       0.00, 0.37, 0.98, 0.12, 1.00, 0.00, 0.78, 1.00, 1.00, 0.98, 0.00,
       0.00, 0.00, 1.00, 0.00, 0.78, 0.12, 0.00, 0.99, 1.00, 1.00, 0.00,
       0.30, 1.00, 1.00, 0.00, 1.00, 1.00, 0.85, 1.00, 1.00, 0.00, 1.00,
       1.00, 0.89, 0.83, 0.00, 0.98, 0.97, 0.00, 1.00, 1.00, 0.03, 0.99,
       0.96, 1.00, 0.00, 1.00, 0.01, 0.01, 1.00, 1.00, 1.00, 0.00, 0.00,
       0.02, 0.33, 0.00, 1.00, 0.09, 0.00, 0.97, 0.00, 0.75, 1.00, 1.00,
       0.01, 0.01, 0.00, 1.00, 0.00, 0.99, 0.57, 0.54, 0.87, 0.83, 0.00,
       1.00, 0.00, 0.00, 0.00, 1.00, 0.04, 0.00, 0.01, 1.00, 0.99, 0.52,
       1.00, 1.00, 0.05, 0.00, 0.00, 0.00, 0.68, 1.

###### The values represent predicted probabilities of admission

Actual values

In [89]:
np.array(data['Admitted'])

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0], dtype=int64)

In [90]:
results_log2.pred_table()

array([[69.00, 5.00],
       [4.00, 90.00]])

In [91]:
# Label the confusion matrix directly in the constructor:
# rows are the actual classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    results_log2.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69.0,5.0
Actual 1,4.0,90.0


For 69 observations, the model predicted 0 when the outcome was 0

For 5 observations, the model predicted 1 when the outcome was 0

For 4 observations, the model predicted 0 when the outcome was 1

For 90 observations, the model predicted 1 when the outcome was 1

In [92]:
# Training accuracy: correctly classified observations (the diagonal of the
# confusion matrix) divided by the total number of observations.
cm = np.array(cm_df)
accuracy_train = np.trace(cm)/cm.sum()
accuracy_train

0.9464285714285714

#### Testing and its accuracy

In [93]:
# NOTE: absolute, machine-specific Windows path - edit it to point at your own copy of the test CSV.
file_path2 = 'C:/Users/iolley2/Desktop/DS Contd/Logistic Regression/Examples/2.03.+Test+dataset.csv'
# Read the held-out test set and preview its first rows.
test = pd.read_csv(file_path2)
test.head()

Unnamed: 0,SAT,Admitted,Gender
0,1323,No,Male
1,1725,Yes,Female
2,1762,Yes,Female
3,1777,Yes,Male
4,1665,No,Male


In [94]:
# Rebuild `test` from the CSV before encoding. Mapping the columns in place is not
# re-runnable: on a second execution the values are already 0/1, match no key of the
# mapping, and every entry silently becomes NaN. Re-reading keeps the cell idempotent.
test = pd.read_csv(file_path2).assign(
    Admitted=lambda df: df['Admitted'].map({'Yes':1,'No':0}),
    Gender=lambda df: df['Gender'].map({'Female':1,'Male':0}),
)

In [95]:
x2

Unnamed: 0,const,SAT,Gender
0,1.0,1363,0
1,1.0,1792,1
2,1.0,1954,1
3,1.0,1653,0
4,1.0,1593,0
...,...,...,...
163,1.0,1722,1
164,1.0,1750,0
165,1.0,1555,0
166,1.0,1524,0


In [96]:
# Ground-truth labels for the held-out observations.
test_actual = test['Admitted']
# Select the predictors by name, in the same order used to fit the model (SAT, Gender),
# instead of keeping "everything except Admitted". As far as I can tell, predict() on a
# non-formula Logit model uses the columns positionally, so an extra or reordered column
# in the test file would otherwise be multiplied by the wrong coefficient.
test_data = test[['SAT','Gender']]
# has_constant='add' forces the intercept column even when a test column happens to be
# constant; the default ('skip') would silently leave the intercept out in that case.
test_data = sm.add_constant(test_data, has_constant='add')
test_data.head()

  x = pd.concat(x[::order], 1)


Unnamed: 0,const,SAT,Gender
0,1.0,1323,0
1,1.0,1725,1
2,1.0,1762,1
3,1.0,1777,0
4,1.0,1665,0


In [97]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Return the 2x2 confusion matrix and accuracy of a binary classifier.

    Parameters
    ----------
    data : array-like
        Design matrix, passed unchanged to ``model.predict``.
    actual_values : array-like
        Observed outcomes, expected to be coded 0 / 1.
    model : object
        Anything exposing ``predict(data)`` that returns one value in [0, 1]
        per observation (e.g. a fitted statsmodels Logit result).
    threshold : float, default 0.5
        Cut-off on the predicted value: predictions below it count as class 0,
        predictions at or above it count as class 1. Must lie strictly
        between 0 and 1.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual class is ``i``
        and whose predicted class is ``j``.
    accuracy : float
        Share of observations on the diagonal of ``cm``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    pred_values = model.predict(data)
    # Two bins per axis. The actual axis is always split at 0.5 (0 vs 1); the
    # prediction axis is split at `threshold`. The last bin is closed on the
    # right, so a value of exactly 1 lands in class 1.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]
    # Correct classifications sit on the diagonal.
    accuracy = (cm[0,0]+cm[1,1])/cm.sum()
    return cm, accuracy

In [98]:
# Evaluate the SAT + Gender model on the held-out test set; the result is a
# (confusion matrix, accuracy) tuple.
cm2 = confusion_matrix(test_data,test_actual,results_log2)
cm2

(array([[5.00, 1.00],
        [1.00, 12.00]]),
 0.8947368421052632)

In [99]:
# cm2[0] is the confusion matrix part of the tuple; label it in the constructor
# (rows are the actual classes, columns are the predicted classes).
cm_df2 = pd.DataFrame(
    cm2[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df2

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5.0,1.0
Actual 1,1.0,12.0


In [100]:
# Misclassification rate = off-diagonal (wrong) predictions / all test observations.
# The counts are read from the test confusion matrix (cm2[0]) instead of being typed
# in by hand, so the figure follows the data if the test set or the model changes.
print('Missclassification Rate:',str((cm2[0][0,1]+cm2[0][1,0])/cm2[0].sum()))

Missclassification Rate: 0.10526315789473684


The misclassification rate is also equal to 1 - accuracy (1 - 0.8947 ≈ 0.1053)