In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [16]:
# Load the customer-behaviour dataset; the path is relative, so the CSV must sit
# in the notebook's working directory.
df = pd.read_csv('Customer_Behaviour.csv')
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [17]:
# Column dtypes and non-null counts: shows which columns are non-numeric and
# whether any values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [18]:
# Number of missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [19]:
# Encode Gender as an integer indicator: Female -> 0, Male -> 1.
# The explicit astype makes the integer dtype independent of `replace`'s silent
# object->int downcasting (deprecated in recent pandas), and raises instead of
# silently leaving an unexpected label (or a missing value) in the column.
# Values that are already 0/1 pass through unchanged, so the cell can be re-run.
GENDER_CODES = {'Female': 0, 'Male': 1}
df['Gender'] = df['Gender'].replace(GENDER_CODES).astype('int64')
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0


In [28]:
# Summary statistics for the numeric columns.
# User ID is only an identifier (it is dropped before modelling), so its
# statistics carry no meaning.
df.describe()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0,400.0
mean,15691540.0,0.49,37.655,69742.5,0.3575
std,71658.32,0.500526,10.482877,34096.960282,0.479864
min,15566690.0,0.0,18.0,15000.0,0.0
25%,15626760.0,0.0,29.75,43000.0,0.0
50%,15694340.0,0.0,37.0,70000.0,0.0
75%,15750360.0,1.0,46.0,88000.0,1.0
max,15815240.0,1.0,60.0,150000.0,1.0


In [29]:
# Feature matrix as a NumPy array: every column except the identifier and the
# target. With the columns listed by df.info(), that leaves
# Gender, Age, EstimatedSalary (in that order).
x = df.drop(columns=['User ID', 'Purchased']).to_numpy()

# Target vector: the Purchased flag.
y = df['Purchased'].to_numpy()

In [30]:
# Standardise each feature column and wrap the result in a DataFrame.
# No column names are passed, so the columns are labelled positionally (0, 1, 2)
# in the same order as the columns of `x`.
x_norm = pd.DataFrame(StandardScaler().fit_transform(x))
x_norm.head()

Unnamed: 0,0,1,2
0,1.020204,-1.781797,-1.490046
1,1.020204,-0.253587,-1.460681
2,-0.980196,-1.113206,-0.78529
3,-0.980196,-1.017692,-0.374182
4,1.020204,-1.781797,0.183751


In [31]:
# Logistic regression (statsmodels) of the target on all standardised features.
# add_constant supplies the intercept column, reported as `const`.
# Note: `summary` holds the fitted results object; the printable table comes
# from its .summary() method.
summary = sm.Logit(
    endog=y,
    exog=sm.add_constant(x_norm),
).fit()
print(summary.summary())

Optimization terminated successfully.
         Current function value: 0.344804
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  400
Model:                          Logit   Df Residuals:                      396
Method:                           MLE   Df Model:                            3
Date:                Fri, 16 Apr 2021   Pseudo R-squ.:                  0.4711
Time:                        19:58:10   Log-Likelihood:                -137.92
converged:                       True   LL-Null:                       -260.79
Covariance Type:            nonrobust   LLR p-value:                 5.488e-53
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.1555      0.171     -6.753      0.000      -1.491      -0.820
0              0.1669      0.

In [32]:
# Reduced design matrix: the standardised features without column 0.
# NOTE(review): column 0 is the first feature of `x`; presumably it is dropped
# because of its result in the preceding Logit fit -- confirm and state the reason.
ds = x_norm.drop(0, axis='columns')
ds

Unnamed: 0,1,2
0,-1.781797,-1.490046
1,-0.253587,-1.460681
2,-1.113206,-0.785290
3,-1.017692,-0.374182
4,-1.781797,0.183751
...,...,...
395,0.797057,-0.844019
396,1.274623,-1.372587
397,1.179110,-1.460681
398,-0.158074,-1.078938


In [41]:
# Refit the statsmodels logistic regression on the reduced feature set `ds`
# (intercept added via add_constant) and print the results table.
# This rebinds `summary`, replacing the results of the earlier full-feature fit.
summary = sm.Logit(
    endog=y,
    exog=sm.add_constant(ds),
).fit()
print(summary.summary())

Optimization terminated successfully.
         Current function value: 0.346314
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  400
Model:                          Logit   Df Residuals:                      397
Method:                           MLE   Df Model:                            2
Date:                Fri, 16 Apr 2021   Pseudo R-squ.:                  0.4688
Time:                        20:13:34   Log-Likelihood:                -138.53
converged:                       True   LL-Null:                       -260.79
Covariance Type:            nonrobust   LLR p-value:                 7.995e-54
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.1381      0.169     -6.730      0.000      -1.470      -0.807
1              2.4446      0.

In [37]:
# Hold out half of the rows for evaluation; the fixed random_state makes the
# split repeatable.
# NOTE(review): `ds` comes from features that were standardised on the full
# dataset before this split, so test-set statistics influence the training
# features. Consider splitting first and fitting the scaler on the training
# half only.
x_train, x_test, y_train, y_test = train_test_split(
    ds,
    y,
    test_size=0.5,
    random_state=0,
)

# Fit scikit-learn's logistic regression with its default settings.
# Both names are kept: later cells predict through `lr`.
lr = LogisticRegression()
LogReg = lr.fit(x_train, y_train)

# Fitted intercept term.
LogReg.intercept_

array([-1.2626125])

In [38]:
# Hard class predictions for the held-out rows.
pred = lr.predict(x_test)

# Column 1 of predict_proba: the probability of the second class in lr.classes_
# (presumably Purchased == 1 -- confirm against lr.classes_).
y_pred_proba = lr.predict_proba(x_test)[:, 1]

# Build the result table on an explicit copy of the test features.
# pd.DataFrame(x_test) defaults to copy=False for DataFrame input, so adding
# columns to it can also add them to x_test on older pandas, which makes the
# predict calls above fail when this cell is re-run.
df_prob = pd.DataFrame(x_test, copy=True)
df_prob['Probability Value'] = y_pred_proba
# Despite its name, this column holds the predicted class label, not a
# probability; the name is kept because a later cell reads it.
df_prob['Probability'] = pred

df_prob

Unnamed: 0,1,2,Probability Value,Probability
132,-0.731153,0.506764,0.076017,0
309,0.032952,-0.579736,0.158290,0
341,-0.253587,0.154386,0.153318,0
196,-0.731153,0.271845,0.063358,0
246,-0.253587,-0.579736,0.089436,0
...,...,...,...,...
217,0.223978,0.242480,0.365208,0
156,0.797057,-0.315452,0.569872,1
212,2.038728,-0.814655,0.935838,1
376,0.797057,0.125021,0.656643,1


In [39]:
# How many test rows were assigned to each class. The 'Probability' column
# holds the labels returned by lr.predict, not probabilities.
print(df_prob['Probability'].value_counts())

0    142
1     58
Name: Probability, dtype: int64


In [40]:
# Per-class precision, recall and F1 of the hard predictions against the
# held-out labels.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.81      0.93      0.87       123
           1       0.86      0.65      0.74        77

    accuracy                           0.82       200
   macro avg       0.84      0.79      0.80       200
weighted avg       0.83      0.82      0.82       200

