In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix ,f1_score, precision_score, recall_score, auc
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

In [73]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,13829,29,technician,single,tertiary,no,18254,no,no,cellular,11,may,2,-1,0,unknown,no
1,22677,26,services,single,secondary,no,512,yes,yes,unknown,5,jun,3,-1,0,unknown,no
2,10541,30,management,single,secondary,no,135,no,no,cellular,14,aug,2,-1,0,unknown,no
3,13689,41,technician,married,unknown,no,30,yes,no,cellular,10,jul,1,-1,0,unknown,no
4,11304,27,admin.,single,secondary,no,321,no,yes,unknown,2,sep,1,-1,0,unknown,no


In [74]:
df.shape

(12870, 17)

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12870 entries, 0 to 12869
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         12870 non-null  int64 
 1   age        12870 non-null  int64 
 2   job        12870 non-null  object
 3   marital    12870 non-null  object
 4   education  12870 non-null  object
 5   default    12870 non-null  object
 6   balance    12870 non-null  int64 
 7   housing    12870 non-null  object
 8   loan       12870 non-null  object
 9   contact    12870 non-null  object
 10  day        12870 non-null  int64 
 11  month      12870 non-null  object
 12  campaign   12870 non-null  int64 
 13  pdays      12870 non-null  int64 
 14  previous   12870 non-null  int64 
 15  poutcome   12870 non-null  object
 16  y          12870 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.7+ MB


In [76]:
df.isnull().sum()

ID           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [77]:
df.job.value_counts()

management       2858
blue-collar      2571
technician       2141
admin.           1464
services         1043
retired           770
self-employed     454
unemployed        414
entrepreneur      383
student           358
housemaid         334
unknown            80
Name: job, dtype: int64

In [78]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['job'] = le.fit_transform(df['job'])
df['marital'] = le.fit_transform(df['marital'])
df['education'] = le.fit_transform(df['education'])
df['default'] = le.fit_transform(df['default'])
df['housing'] = le.fit_transform(df['housing'])
df['loan'] = le.fit_transform(df['loan'])
df['contact'] = le.fit_transform(df['contact'])
df['month'] = le.fit_transform(df['month'])
df['poutcome'] = le.fit_transform(df['poutcome'])
df['y'] = le.fit_transform(df['y'])

In [79]:
df.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,13829,29,9,2,2,0,18254,0,0,0,11,8,2,-1,0,3,0
1,22677,26,7,2,1,0,512,1,1,2,5,6,3,-1,0,3,0
2,10541,30,4,2,1,0,135,0,0,0,14,1,2,-1,0,3,0
3,13689,41,9,1,3,0,30,1,0,0,10,5,1,-1,0,3,0
4,11304,27,0,2,1,0,321,0,1,2,2,11,1,-1,0,3,0


In [80]:
X = df.drop(columns=['y'])
y = df['y']

In [81]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [82]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression()

In [83]:
pred = lr.predict(x_train)
lr_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
print("Train Result:\n================================================")
print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{lr_report}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
print("_______________________________________________")
print(f'Recall Score: \n {recall_score(y_train, pred)}')
print("_______________________________________________")
print(f'Roc AUC Score: \n {roc_auc_score(y_train, pred)}')


Train Result:
Accuracy Score: 68.99%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.704018     0.506955  0.689866     0.605487      0.642705
recall        0.948598     0.117017  0.689866     0.532808      0.689866
f1-score      0.808210     0.190145  0.689866     0.499177      0.615909
support    6206.000000  2803.000000  0.689866  9009.000000   9009.000000
_______________________________________________
Confusion Matrix: 
 [[5887  319]
 [2475  328]]

_______________________________________________
Recall Score: 
 0.11701748127006778
_______________________________________________
Roc AUC Score: 
 0.5328078060555946


In [84]:
pred = lr.predict(x_test)
lr_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{lr_report}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")
print("_______________________________________________")
print(f'Recall Score: \n {recall_score(y_test, pred)}')
print("_______________________________________________")
print(f'Roc AUC Score: \n {roc_auc_score(y_test, pred)}')

Test Result:
Accuracy Score: 70.78%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.718585     0.566176  0.707848     0.642381      0.672637
recall        0.956248     0.132302  0.707848     0.544275      0.707848
f1-score      0.820554     0.214485  0.707848     0.517519      0.637838
support    2697.000000  1164.000000  0.707848  3861.000000   3861.000000
_______________________________________________
Confusion Matrix: 
 [[2579  118]
 [1010  154]]

_______________________________________________
Recall Score: 
 0.1323024054982818
_______________________________________________
Roc AUC Score: 
 0.5442750440542947


In [85]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaled_data = scaler.fit_transform(X)
scaled_data

array([[-0.7012989 , -1.06952791,  1.4095278 , ..., -0.44574006,
        -0.33614417,  0.48099775],
       [ 1.68023733, -1.33489437,  0.79491436, ..., -0.44574006,
        -0.33614417,  0.48099775],
       [-1.58630015, -0.98107242, -0.12700578, ..., -0.44574006,
        -0.33614417,  0.48099775],
       ...,
       [-0.33227921, -0.36188399,  1.71683451, ...,  1.21062851,
         0.6396482 , -2.5461128 ],
       [-0.71583359,  0.08039346, -0.12700578, ..., -0.44574006,
        -0.33614417,  0.48099775],
       [ 0.99656847, -1.51180535,  0.79491436, ..., -0.44574006,
        -0.33614417,  0.48099775]])

In [86]:
x_train,x_test,y_train,y_test = train_test_split(scaled_data,y,test_size=0.3,random_state=42)

In [87]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression()

In [88]:
pred = lr.predict(x_train)
lr_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
print("Train Result:\n================================================")
print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{lr_report}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
print("_______________________________________________")
print(f'Recall Score: \n {recall_score(y_train, pred)}')
print("_______________________________________________")
print(f'Roc AUC Score: \n {roc_auc_score(y_train, pred)}')


Train Result:
Accuracy Score: 72.64%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.738858     0.643463  0.726385     0.691161      0.709178
recall        0.932324     0.270425  0.726385     0.601374      0.726385
f1-score      0.824393     0.380809  0.726385     0.602601      0.686379
support    6206.000000  2803.000000  0.726385  9009.000000   9009.000000
_______________________________________________
Confusion Matrix: 
 [[5786  420]
 [2045  758]]

_______________________________________________
Recall Score: 
 0.27042454513021763
_______________________________________________
Roc AUC Score: 
 0.6013740514887311


In [89]:
pred = lr.predict(x_test)
lr_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{lr_report}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")
print("_______________________________________________")
print(f'Recall Score: \n {recall_score(y_test, pred)}')
print("_______________________________________________")
print(f'Roc AUC Score: \n {roc_auc_score(y_test, pred)}')

Test Result:
Accuracy Score: 73.61%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.748814     0.648262  0.736079     0.698538      0.718500
recall        0.936225     0.272337  0.736079     0.604281      0.736079
f1-score      0.832098     0.383545  0.736079     0.607821      0.696870
support    2697.000000  1164.000000  0.736079  3861.000000   3861.000000
_______________________________________________
Confusion Matrix: 
 [[2525  172]
 [ 847  317]]

_______________________________________________
Recall Score: 
 0.27233676975945015
_______________________________________________
Roc AUC Score: 
 0.6042811027143561


# GridSearchCV

In [90]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 2, 10, 100], 'penalty': ['l1', 'l2']}
scoring = {'Accuracy':'accuracy'}

gs = GridSearchCV(LogisticRegression(),return_train_score=True,param_grid=param_grid,scoring=scoring,cv=10,refit='Accuracy')
gs.fit(X,y)

results = gs.cv_results_

In [91]:
print('Best Parameters ',gs.best_estimator_)

Best Parameters  LogisticRegression(C=100)


In [92]:
print('Optimum Hiperparametreler: ',gs.best_params_)

Optimum Hiperparametreler:  {'C': 100, 'penalty': 'l2'}


In [93]:
print('Best Accuracy: ',gs.best_score_)

Best Accuracy:  0.6957264957264958
