In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('data/data.csv')


In [3]:
df.head()


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
X = df.iloc[:,:-1]


In [5]:
X.head()


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [6]:
y = df.iloc[:,[-1]]


In [7]:
y.head()

Unnamed: 0,default payment next month
0,1
1,1
2,0
3,0
4,0


In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=1)


In [9]:
X_train.shape


(22500, 23)

In [10]:
y_train.shape


(22500, 1)

In [11]:
X_test.shape

(7500, 23)

In [12]:
y_test.shape

(7500, 1)

In [13]:
numerical_pipeline = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

In [14]:
numerical_pipeline

In [15]:
preprocessor = ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,list(X_train.columns))
])


In [16]:
preprocessor

In [17]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train))#,columns=preprocessor.get_feature_names_out)


In [18]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,-0.132156,0.810010,-1.081119,0.862079,-1.032242,0.011356,0.107986,1.802923,0.185591,0.230396,...,0.239703,0.219205,0.219100,0.163152,-0.046952,-0.176613,-0.180321,-0.175818,-0.181735,-0.211992
1,-0.904811,0.810010,0.180364,0.862079,-1.358135,0.011356,0.107986,0.136225,0.185591,0.230396,...,-0.067668,-0.567484,-0.343920,-0.490297,-0.218003,-0.172025,-0.281587,-0.111057,-0.300922,-0.300573
2,-0.673015,-1.234552,0.180364,0.862079,-0.489089,0.011356,0.107986,0.136225,0.185591,0.230396,...,0.374575,0.537084,0.639536,0.592541,-0.157985,-0.109147,0.109839,-0.104581,-0.115849,-0.123410
3,-0.363953,0.810010,0.180364,0.862079,0.379956,0.011356,0.107986,1.802923,0.185591,0.230396,...,-0.328130,-0.287907,-0.239381,-0.205831,-0.193996,-0.221590,-0.238353,-0.208199,-0.214678,-0.211992
4,-0.132156,0.810010,-1.081119,2.779516,-0.597720,0.895496,-0.723811,1.802923,-0.666960,-0.648084,...,-0.589397,-0.639264,-0.662492,-0.627444,-0.338039,0.119741,-0.151305,-0.305341,-0.222584,-0.300573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22495,0.022375,0.810010,1.441847,-1.055357,0.488587,0.895496,1.771580,0.136225,0.185591,0.230396,...,-0.383078,-0.368321,-0.576513,-0.650450,-0.338039,-0.176613,-0.230461,-0.240580,-0.313506,1.865481
22496,1.258623,0.810010,-1.081119,-1.055357,0.597218,-0.872784,-0.723811,-1.530474,-1.519512,-1.526564,...,-0.684959,-0.673116,-0.662377,-0.650450,-0.338039,-0.266567,-0.296385,-0.305341,-0.313506,-0.300573
22497,-0.904811,-1.234552,0.180364,0.862079,-1.466765,0.011356,0.107986,0.136225,0.185591,0.230396,...,-0.165821,-0.364293,-0.333303,-0.318850,-0.235589,-0.207152,-0.256517,-0.259684,-0.260073,-0.248960
22498,-0.673015,-1.234552,-1.081119,0.862079,-0.054566,-0.872784,-0.723811,-0.697124,-0.666960,-0.648084,...,-0.584698,-0.646274,-0.639623,-0.316303,-0.098987,0.041661,-0.196628,-0.216100,1.000384,-0.157780


In [19]:
X_test = pd.DataFrame(preprocessor.transform(X_test))


In [20]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,1.104092,-1.234552,1.441847,-1.055357,-0.380458,0.011356,0.107986,0.136225,0.185591,0.230396,...,0.931033,0.644324,0.291688,-0.413136,0.159810,0.003296,-0.064257,-0.111057,-0.247620,-0.182465
1,-1.213873,0.810010,1.441847,-1.055357,1.466263,-0.872784,-0.723811,-1.530474,-0.666960,1.987357,...,-0.695910,-0.646727,-0.643586,-0.634867,-0.338039,-0.266567,-0.132270,-0.305341,-0.301515,-0.300573
2,-0.904811,-1.234552,0.180364,-1.055357,-0.814981,-0.872784,-0.723811,-0.697124,0.185591,-0.648084,...,-0.007714,0.044760,-0.640911,0.086594,-0.338039,1.813939,-0.168715,-0.221151,2.584603,-0.218724
3,-0.673015,0.810010,1.441847,-1.055357,1.792155,1.779636,1.771580,2.636272,2.743245,2.865837,...,-0.097843,-0.036857,-0.004923,0.029959,-0.115973,-0.194603,-0.203534,-0.305341,-0.208089,-0.206086
4,0.795030,-1.234552,-1.081119,0.862079,-0.163197,0.895496,1.771580,0.136225,0.185591,1.987357,...,-0.378612,-0.322602,-0.406601,-0.359852,-0.338039,-0.086658,-0.180321,-0.305341,-0.181735,-0.182465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,-1.059342,-1.234552,1.441847,0.862079,-1.032242,0.011356,0.107986,0.136225,0.185591,0.230396,...,-0.307355,-0.376878,-0.199190,-0.302228,-0.218003,-0.176613,-0.203534,0.342272,-0.172511,-0.300573
7496,-0.904811,0.810010,-1.081119,0.862079,-1.032242,0.011356,0.107986,0.136225,0.185591,0.230396,...,-0.408084,-0.359421,-0.326615,-0.300988,-0.242011,-0.190106,-0.191927,-0.248675,-0.247620,-0.023018
7497,1.722216,0.810010,-1.081119,0.862079,-0.271828,-1.756924,-1.555608,-1.530474,-1.519512,-1.526564,...,-0.684959,-0.673116,-0.636617,-0.576959,-0.277001,-0.266567,-0.296385,-0.204313,-0.024532,0.408079
7498,-0.286687,-1.234552,1.441847,2.779516,0.488587,1.779636,0.107986,0.136225,0.185591,0.230396,...,0.556913,0.669261,0.768316,0.816970,-0.127076,-0.126823,-0.006051,-0.097910,-0.085805,-0.064356


In [21]:
from sklearn.tree import  DecisionTreeClassifier
from sklearn.naive_bayes import  GaussianNB
from sklearn.linear_model import  LogisticRegression
from sklearn.ensemble import  RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [22]:
models = {
    'LogisticRegression':LogisticRegression(),
    'DecisionTree':DecisionTreeClassifier(),
    'SVM':SVC(),
    'NaiveBayes':GaussianNB(),
    'KNN':KNeighborsClassifier(),
    'RandomForest':RandomForestClassifier()
}


In [30]:
def evaluate_model(models,X_train,X_test,y_train,y_test):
    reports = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)

        confusion_met = confusion_matrix(y_test,y_pred)

        true_positive = confusion_met[0][0]
        false_positive = confusion_met[0][1]
        false_negative = confusion_met[1][0]
        true_negative = confusion_met[1][1]

        accuracy = (true_positive + true_negative) / (true_positive +false_positive + false_negative + true_negative)



        reports[list(models.keys())[i]] = accuracy
        
    print(reports)

    best_model_score = max(list(reports.values()))

    best_model_name = list(reports.keys())[list(reports.values()).index(best_model_score)]

    return (f'Best Model Name:{best_model_name},Best Model Score : {best_model_score}')


In [31]:
a = evaluate_model(models,X_train,X_test,y_train,y_test)



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)


{'LogisticRegression': 0.8074666666666667, 'DecisionTree': 0.7202666666666667, 'SVM': 0.8181333333333334, 'NaiveBayes': 0.7456, 'KNN': 0.7937333333333333, 'RandomForest': 0.8138666666666666}


In [25]:
a

'Best Model Name:SVM,Best Model Score : 0.8181333333333334'