In [155]:
import warnings
warnings.simplefilter('ignore')
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
import numpy as np

df=pd.read_csv("C:\\Users\\Neel\\Desktop\\Greyatom Codes\\Datasets\\Social_Network_Ads.csv")
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [156]:
df["Purchased"].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [157]:
df["Gender"]=df["Gender"].astype("category")
df["Gender"]=df["Gender"].cat.codes

### Logistic Regression without handling class imbalance

In [158]:
X=df.drop(["User ID","Purchased"],1)
y=df["Purchased"]

In [159]:
scaler=MinMaxScaler()

In [160]:
X=pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [161]:
X.head()

Unnamed: 0,Gender,Age,EstimatedSalary
0,1.0,0.02381,0.02963
1,1.0,0.404762,0.037037
2,0.0,0.190476,0.207407
3,0.0,0.214286,0.311111
4,1.0,0.02381,0.451852


In [162]:
X_train,X_test,y_train,y_test=tts(X,y,test_size=0.3,random_state=42)

In [163]:
logreg_model=LogisticRegression()

In [164]:
logreg_model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [165]:
y_pred=logreg_model.predict(X_test)

In [166]:
accuracy_score(y_test,y_pred)

0.85

In [167]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.99      0.89        73
           1       0.97      0.64      0.77        47

    accuracy                           0.85       120
   macro avg       0.89      0.81      0.83       120
weighted avg       0.87      0.85      0.84       120



### Logistic Regression using SMOTE

In [168]:
from imblearn.over_sampling import SMOTE

In [169]:
sm=SMOTE(random_state=42)

In [170]:
df=df.drop(["User ID"],1)

In [171]:
new_df=pd.DataFrame(scaler.fit_transform(df),columns=df.columns)

In [172]:
new_df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1.0,0.02381,0.02963,0.0
1,1.0,0.404762,0.037037,0.0
2,0.0,0.190476,0.207407,0.0
3,0.0,0.214286,0.311111,0.0
4,1.0,0.02381,0.451852,0.0


In [173]:
X=new_df.drop(["Purchased"],1)
y=df["Purchased"]

In [174]:
X_res,y_res=sm.fit_sample(X,y.ravel())

In [175]:
X_train,X_test,y_train,y_test=tts(X_res,y_res,test_size=0.3,random_state=42)

In [176]:
logreg=LogisticRegression()

In [177]:
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [178]:
y_pred=logreg.predict(X_test)

In [179]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82        75
           1       0.84      0.80      0.82        80

    accuracy                           0.82       155
   macro avg       0.82      0.82      0.82       155
weighted avg       0.82      0.82      0.82       155



### Logistic Regression Using GridSearchCV

In [180]:
credit=pd.read_csv("C:\\Users\\Neel\\Desktop\\Greyatom Codes\\Datasets\\creditcard.csv")
credit.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [181]:
credit.shape

(284807, 31)

In [182]:
credit["Class"].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [183]:
credit["Time"]= sscaler.fit_transform(np.array(credit["Time"]).reshape(-1,1))
credit["Amount"]= sscaler.fit_transform(np.array(credit["Amount"]).reshape(-1,1))

In [184]:
credit.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,-1.050379e-14,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,3.202236e-16,0.001727
std,1.000002,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,1.000002,0.041527
min,-1.996583,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,-0.3532294,0.0
25%,-0.855212,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,-0.3308401,0.0
50%,-0.2131453,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,-0.2652715,0.0
75%,0.9372174,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,-0.04471707,0.0
max,1.642058,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,102.3622,1.0


In [185]:
log_reg=LogisticRegression()

In [186]:
#Considering 1000 random samples of Class 0 and 10 random samples of Class 1. To reduce the time of GridSearchCV
zeros=credit.sample(n=1000,random_state=42)
ones=credit[credit["Class"]==1].sample(n=10,random_state=42)

In [187]:
credit=pd.concat([zeros,ones])

In [188]:
credit.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
43428,-1.122574,-16.526507,8.584972,-18.649853,9.505594,-13.793819,-2.832404,-16.701694,7.517344,-8.507059,...,1.190739,-1.12767,-2.358579,0.673461,-1.4137,-0.462762,-2.018575,-1.042804,1.102834,1
49906,-1.064538,0.339812,-2.743745,-0.13407,-1.385729,-1.451413,1.015887,-0.524379,0.22406,0.899746,...,-0.213436,-0.942525,-0.526819,-1.156992,0.311211,-0.746647,0.040996,0.102038,1.726255,0
29474,-1.249364,1.39959,-0.590701,0.168619,-1.02995,-0.539806,0.040444,-0.712567,0.002299,-0.971747,...,0.102398,0.168269,-0.166639,-0.81025,0.505083,-0.23234,0.011409,0.004634,-0.229289,0
276481,1.52268,-0.432071,1.647895,-1.669361,-0.349504,0.785785,-0.630647,0.27699,0.586025,-0.484715,...,0.358932,0.873663,-0.178642,-0.017171,-0.207392,-0.157756,-0.237386,0.001934,-0.347232,0
278846,1.551109,2.01416,-0.137394,-1.015839,0.327269,-0.182179,-0.956571,0.043241,-0.160746,0.363241,...,-0.238644,-0.6164,0.347045,0.061561,-0.360196,0.17473,-0.078043,-0.070571,-0.349671,0


In [189]:
X=credit.drop(["Class"],1)
y=credit["Class"]

In [190]:
X_train,X_val,y_train,y_val=tts(X,y,test_size=0.2,random_state=42)

In [191]:
X=X_train
y=y_train

In [192]:
params={"C":np.arange(0.001,10,0.2),"penalty":["l1","l2"]}

In [193]:
log_reg_cv=GridSearchCV(log_reg,param_grid=params,cv=5)

In [194]:
log_reg_cv.fit(X,y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.000e-03...
       4.801e+00, 5.001e+00, 5.201e+00, 5.401e+00, 5.601e+00, 5.801e+00,
       6.001e+00, 6.201e+00, 6.401e+00, 6.601e+00, 6.801e+00, 7.001e+00,
       7.201e+00, 7.401e+00, 7.601e+00, 7.801e+00, 8.001e+00, 8.201e+00,
       8.401e+00, 8.601e+00, 8.801e+00, 9.001e+00, 9.201e+00, 9.401e+00,
 

In [195]:
log_reg_cv.best_params_

{'C': 0.201, 'penalty': 'l2'}

In [196]:
best_model=log_reg_cv.best_estimator_

In [197]:
y_pred=best_model.predict(X_val)

In [198]:
print(classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       201
           1       0.00      0.00      0.00         1

    accuracy                           1.00       202
   macro avg       0.50      0.50      0.50       202
weighted avg       0.99      1.00      0.99       202



### Applying SMOTE to same problem to remove class imbalance

In [199]:
credit=pd.read_csv("C:\\Users\\Neel\\Desktop\\Greyatom Codes\\Datasets\\creditcard.csv")
credit.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [200]:
sm=SMOTE(random_state=0)

In [201]:
credit["Time"]= sscaler.fit_transform(np.array(credit["Time"]).reshape(-1,1))
credit["Amount"]= sscaler.fit_transform(np.array(credit["Amount"]).reshape(-1,1))

In [202]:
zeros=credit.sample(n=1000,random_state=42)
ones=credit[credit["Class"]==1].sample(n=10,random_state=42)

In [203]:
short_credit=pd.concat([zeros,ones])

In [204]:
X=short_credit.drop(["Class"],1)
y=short_credit["Class"]

In [205]:
X_res,y_res=sm.fit_sample(X,y)

In [206]:
pd.Series(y_res).value_counts()

1    998
0    998
dtype: int64

In [207]:
logistic=LogisticRegression()
X_train, X_test, y_train, y_test = tts(X_res,y_res, random_state = 42, test_size = 0.3)

In [208]:
params = {'C' : np.arange(0.001,10,0.2), 'penalty' : ['l1','l2']}  
new_log_reg_cv=GridSearchCV(logistic,params,cv=5)

In [209]:
new_log_reg_cv.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.000e-03...
       4.801e+00, 5.001e+00, 5.201e+00, 5.401e+00, 5.601e+00, 5.801e+00,
       6.001e+00, 6.201e+00, 6.401e+00, 6.601e+00, 6.801e+00, 7.001e+00,
       7.201e+00, 7.401e+00, 7.601e+00, 7.801e+00, 8.001e+00, 8.201e+00,
       8.401e+00, 8.601e+00, 8.801e+00, 9.001e+00, 9.201e+00, 9.401e+00,
 

In [210]:
best_new_model=new_log_reg_cv.best_estimator_

In [211]:
y_pred=best_new_model.predict(X_test)

In [212]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       301
           1       0.99      1.00      1.00       298

    accuracy                           1.00       599
   macro avg       1.00      1.00      1.00       599
weighted avg       1.00      1.00      1.00       599



In [214]:
accuracy_score(y_test,y_pred)

0.996661101836394