In [319]:
import pandas as pd
import numpy as np
import sklearn
import random as rd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.combine import SMOTEENN

In [320]:
df = pd.read_csv('Creditcard_data.csv')

In [321]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [322]:
df.drop(['Time'], axis=1, inplace=True)

In [323]:
df.isna().sum().sum()

0

In [324]:
df.Class.value_counts()

0    763
1      9
Name: Class, dtype: int64

In [325]:
x = df.drop(['Class'], axis=1)
y = df.Class

In [326]:
smt = SMOTEENN(random_state=42)
x, y = smt.fit_resample(x, y)
print(y.value_counts())

1    707
0    604
Name: Class, dtype: int64


In [327]:
df_new = pd.concat([x, y], axis=1)
df_new.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
1,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
2,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,-0.099254,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
3,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,1.249376,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
4,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,-0.366846,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [328]:
# drop 3 rows randomly from df
df_new.drop(df_new.sample(3).index, inplace=True)
df_new.shape

(1308, 30)

# Simple Random Sampling

In [329]:
df_sample_1 = df_new.sample(frac=0.4, axis=0, replace=True, random_state=42) 
df_sample_1.Class.value_counts()

1    311
0    212
Name: Class, dtype: int64

In [330]:
x_sample_1 = df_sample_1.drop(['Class'], axis=1)
y_sample_1 = df_sample_1.Class

In [331]:
y_sample_1.value_counts()

1    311
0    212
Name: Class, dtype: int64

In [334]:
x_train, x_test, y_train, y_test = train_test_split(x_sample_1, y_sample_1, test_size=0.2, random_state=42, stratify=y_sample_1)

In [335]:
mm = MinMaxScaler()

x_train_scaled = pd.DataFrame(mm.fit_transform(x_train))
x_test_scaled = mm.transform(x_test)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
x_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.586687,0.555182,0.699443,0.578555,0.684773,0.141982,0.628662,0.735956,0.416045,0.253921,...,0.406216,0.230313,0.642044,0.632681,0.679739,0.707806,0.222018,0.371242,0.709957,0.00116
1,0.957734,0.616473,0.441962,0.603819,0.431748,0.152774,0.552854,0.727197,0.390734,0.288547,...,0.386207,0.171913,0.272429,0.700912,0.619613,0.669242,0.300516,0.40302,0.783293,0.002126
2,0.709503,0.634125,0.50251,0.530995,0.614395,0.265354,0.614293,0.752237,0.3954,0.299994,...,0.395488,0.189485,0.395813,0.686568,0.224119,0.096421,0.301201,0.442677,0.814686,0.001058
3,0.696974,0.720698,0.651749,0.534681,0.448942,0.098778,0.700837,0.727038,0.351133,0.279208,...,0.426822,0.176144,0.313789,0.66682,0.748427,0.503486,0.291452,0.499537,0.829353,0.001376
4,0.816722,0.625695,0.548609,0.59313,0.495913,0.084418,0.643704,0.711574,0.400992,0.267956,...,0.381074,0.189878,0.390423,0.663843,0.748699,0.711438,0.242436,0.388802,0.756118,0.001245


In [336]:
#knn
knn_params = {
    'n_neighbors':range(1,15),
    'weights':['uniform', 'distance']
}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, knn_params, cv=cv)
clf.fit(x_train_scaled, y_train)

print('Best Parameters', clf.best_params_)
y_pred = clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters {'n_neighbors': 2, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       1.00      0.91      0.95        43
           1       0.94      1.00      0.97        62

    accuracy                           0.96       105
   macro avg       0.97      0.95      0.96       105
weighted avg       0.96      0.96      0.96       105



In [337]:
lr_params = {
    'C':np.logspace(-2,2,7), 
    'penalty':['l1', 'l2', 'elasticnet', None]
}

lr = LogisticRegression()
log = GridSearchCV(lr, lr_params, cv=cv)
log.fit(x_train_scaled, y_train)

print('Best Parameters ', log.best_params_)
y_pred = log.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters  {'C': 0.01, 'penalty': None}
              precision    recall  f1-score   support

           0       1.00      0.84      0.91        43
           1       0.90      1.00      0.95        62

    accuracy                           0.93       105
   macro avg       0.95      0.92      0.93       105
weighted avg       0.94      0.93      0.93       105



420 fits failed out of a total of 840.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
210 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/linear_model/_logistic

In [338]:
svc_params = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':range(2,5)
}

svc = SVC()
svm = GridSearchCV(svc, svc_params, cv=cv)
svm.fit(x_train_scaled, y_train)

print('Best Parameters ', svm.best_params_)
y_pred = svm.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'degree': 4, 'kernel': 'poly'}
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        43
           1       0.93      1.00      0.96        62

    accuracy                           0.95       105
   macro avg       0.96      0.94      0.95       105
weighted avg       0.96      0.95      0.95       105



In [339]:
rfc_params = {
    'max_depth':range(3,15),
    'criterion':['gini', 'entropy', 'log_loss'],
}

rfc = RandomForestClassifier()
rfc_clf = GridSearchCV(rfc, rfc_params, cv=cv)
rfc_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', rfc_clf.best_params_)
y_pred = rfc_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'entropy', 'max_depth': 10}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        62

    accuracy                           1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105



In [340]:
dt_params = {
    'max_depth':range(2,15),
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

dt = DecisionTreeClassifier()
dt_clf = GridSearchCV(dt, dt_params, cv=cv)
dt_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', dt_clf.best_params_)
y_pred = dt_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'gini', 'max_depth': 6, 'splitter': 'best'}
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        43
           1       0.97      1.00      0.98        62

    accuracy                           0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



# Systematic Sampling

In [341]:
def systematic_sampling(df_func, step):
 
    indexes = np.arange(0, len(df_func), step=step)
    systematic_sample = df_func.iloc[indexes]
    return systematic_sample

In [219]:
df_sample_2 = systematic_sampling(df_new, 3)
df_sample_2.Class.value_counts()

1    235
0    201
Name: Class, dtype: int64

In [220]:
x_sample_2 = df_sample_2.drop(['Class'], axis=1)
y_sample_2 = df_sample_2.Class

In [221]:
y_sample_2.value_counts()

1    235
0    201
Name: Class, dtype: int64

In [342]:
x_train, x_test, y_train, y_test = train_test_split(x_sample_2, y_sample_2, test_size=0.2, random_state=42, stratify=y_sample_2)

In [343]:
mm = MinMaxScaler()

x_train_scaled = pd.DataFrame(mm.fit_transform(x_train))
x_test_scaled = mm.transform(x_test)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
x_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.760536,0.677072,0.354227,0.527243,0.940501,0.7361,0.592223,0.903964,0.268696,0.265829,...,0.247657,0.524421,0.599429,0.456833,0.959151,0.245906,0.159799,0.545809,0.833687,0.008748
1,0.322873,0.521252,0.734495,0.884474,0.0,1.0,0.904444,0.809207,0.418196,0.303292,...,0.247266,0.472054,0.731565,0.291133,0.163183,0.753094,0.420323,0.854622,0.723091,0.82108
2,0.908323,0.645558,0.538307,0.607285,0.508021,0.101133,0.554474,0.826008,0.335701,0.216417,...,0.20525,0.485376,0.275505,0.505658,0.633806,0.540034,0.297615,0.532648,0.870088,0.001088
3,0.427833,0.411388,0.739283,0.621502,0.782368,0.300632,0.416749,0.902318,0.399486,0.217313,...,0.296297,0.542924,0.702083,0.548209,0.235142,0.274354,0.403683,0.53398,0.848293,0.001131
4,0.024379,0.143152,0.653479,0.801395,0.720811,0.070314,0.576087,0.835855,0.303695,0.16709,...,0.674937,0.600501,0.6621,1.0,0.505634,0.666691,0.24205,0.385573,0.861668,0.460772


In [344]:
#knn
knn_params = {
    'n_neighbors':range(1,15),
    'weights':['uniform', 'distance']
}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, knn_params, cv=cv)
clf.fit(x_train_scaled, y_train)

print('Best Parameters', clf.best_params_)
y_pred = clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters {'n_neighbors': 2, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        41
           1       0.94      1.00      0.97        47

    accuracy                           0.97        88
   macro avg       0.97      0.96      0.97        88
weighted avg       0.97      0.97      0.97        88



In [345]:
lr_params = {
    'C':np.logspace(-2,2,7), 
    'penalty':['l1', 'l2', 'elasticnet', None]
}

lr = LogisticRegression()
log = GridSearchCV(lr, lr_params, cv=cv)
log.fit(x_train_scaled, y_train)

print('Best Parameters ', log.best_params_)
y_pred = log.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters  {'C': 0.01, 'penalty': None}
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        41
           1       0.94      1.00      0.97        47

    accuracy                           0.97        88
   macro avg       0.97      0.96      0.97        88
weighted avg       0.97      0.97      0.97        88



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
420 fits failed out of a total of 840.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the f

In [346]:
svc_params = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':range(2,5)
}

svc = SVC()
svm = GridSearchCV(svc, svc_params, cv=cv)
svm.fit(x_train_scaled, y_train)

print('Best Parameters ', svm.best_params_)
y_pred = svm.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'degree': 4, 'kernel': 'poly'}
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        41
           1       0.90      1.00      0.95        47

    accuracy                           0.94        88
   macro avg       0.95      0.94      0.94        88
weighted avg       0.95      0.94      0.94        88



In [347]:
rfc_params = {
    'max_depth':range(3,15),
    'criterion':['gini', 'entropy', 'log_loss'],
}

rfc = RandomForestClassifier()
rfc_clf = GridSearchCV(rfc, rfc_params, cv=cv)
rfc_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', rfc_clf.best_params_)
y_pred = rfc_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'entropy', 'max_depth': 6}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        47

    accuracy                           1.00        88
   macro avg       1.00      1.00      1.00        88
weighted avg       1.00      1.00      1.00        88



In [348]:
dt_params = {
    'max_depth':range(2,15),
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

dt = DecisionTreeClassifier()
dt_clf = GridSearchCV(dt, dt_params, cv=cv)
dt_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', dt_clf.best_params_)
y_pred = dt_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'gini', 'max_depth': 6, 'splitter': 'best'}
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        41
           1       0.96      1.00      0.98        47

    accuracy                           0.98        88
   macro avg       0.98      0.98      0.98        88
weighted avg       0.98      0.98      0.98        88



# Stratified Sampling

In [355]:
df_stratified = df_new.groupby('Class', group_keys=False).apply(lambda x: x.sample(frac=0.4))
display(df_stratified)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
153,1.353283,-0.694900,0.970826,-0.426559,-1.417263,-0.505710,-0.932069,-0.083957,-0.224208,0.446748,...,0.284341,0.993434,-0.164893,0.474257,0.623848,-0.006750,0.046026,0.022043,15.930000,0
296,-0.586190,0.490033,0.564065,-1.228364,2.860178,3.363731,0.263669,0.703249,-0.648518,-0.338281,...,-0.295026,-0.965887,-0.207535,0.989880,0.257821,0.114458,-0.159428,-0.158921,1.980000,0
239,1.081027,-0.139455,0.483881,0.642057,-0.186845,0.538283,-0.302749,0.315920,0.277328,-0.102329,...,-0.124039,-0.190064,0.057896,-0.269354,0.253835,0.311886,0.001591,-0.003468,17.240000,0
352,-0.386633,0.953379,1.851726,1.623108,-0.603151,0.204916,0.438894,-0.313317,0.359461,0.585055,...,0.141432,0.996964,-0.205808,0.481312,-0.224335,-0.105636,-0.104202,-0.070726,55.760000,0
234,-0.342871,-0.199546,1.976353,-0.003495,-1.170366,0.883501,-0.151879,0.160106,0.137973,-0.060122,...,-0.313443,0.086207,0.109600,-0.098951,-0.943009,-0.618657,0.253306,0.240271,99.820000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1230,-1.122530,0.315317,1.608206,-0.018686,0.955408,-0.754849,0.732690,-0.085285,-0.102393,-0.452815,...,0.007361,0.167803,-0.258905,0.235956,0.216908,-0.287316,-0.162189,-0.187613,1.000000,1
737,-1.208060,0.198625,0.971822,-0.430746,0.892026,-0.132735,0.277057,0.140299,-0.019431,-0.088719,...,-0.164676,-0.247632,-0.140997,-0.567279,-0.643631,0.192042,-0.176237,-0.203880,0.997072,1
772,-1.511728,-1.777445,1.992169,0.898448,2.075696,0.513471,-1.542990,0.635418,0.703824,-0.284789,...,0.292536,0.823631,0.263231,-0.976568,-0.495024,0.675858,-0.040628,-0.076493,1.379797,1
1176,-1.486835,0.077763,1.112666,-0.543139,0.811960,-0.357256,0.253752,0.111746,0.024447,-0.084590,...,-0.188632,-0.245913,-0.229084,-0.188960,-0.288703,0.228076,-0.318069,-0.348708,1.142126,1


In [356]:
x_sample_3 = df_stratified.drop(['Class'], axis=1)
y_sample_3 = df_stratified.Class

In [357]:
y_sample_3.value_counts()

1    282
0    241
Name: Class, dtype: int64

In [358]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_sample_3 = label_encoder.fit_transform(y_sample_3)

In [365]:
x_train, x_test, y_train, y_test = train_test_split(x_sample_3, y_sample_3, test_size=0.2, random_state=42, stratify=y_sample_3)

In [366]:
mm = MinMaxScaler()

x_train_scaled = pd.DataFrame(mm.fit_transform(x_train))
x_test_scaled = mm.transform(x_test)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
x_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.875056,0.474827,0.539289,0.451508,0.691024,0.253982,0.408965,0.77391,0.385651,0.270119,...,0.286517,0.229109,0.388503,0.313607,0.496767,0.603997,0.249635,0.248718,0.355573,0.002634
1,0.817243,0.471534,0.558997,0.427374,0.705134,0.248947,0.417278,0.774971,0.391813,0.271632,...,0.288824,0.230342,0.405784,0.287857,0.501644,0.575998,0.25359,0.235306,0.334143,0.002385
2,0.927931,0.481604,0.519926,0.474869,0.682866,0.265213,0.405221,0.773362,0.380397,0.268595,...,0.284356,0.229392,0.374922,0.346055,0.458556,0.578912,0.24385,0.270795,0.388347,0.002758
3,0.720878,0.484748,0.668423,0.110537,0.625066,0.106531,0.455215,0.751013,0.24872,0.310988,...,0.331215,0.308302,0.6951,0.196905,0.615426,0.50058,0.113978,0.373859,0.45455,0.004295
4,0.923436,0.488737,0.553927,0.511843,0.636801,0.087384,0.440904,0.732644,0.423225,0.252626,...,0.280485,0.221726,0.341331,0.339687,0.731208,0.663216,0.224571,0.258034,0.381486,0.002761


In [367]:
#knn
knn_params = {
    'n_neighbors':range(3,15),
    'weights':['uniform', 'distance']
}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, knn_params, cv=cv)
clf.fit(x_train_scaled, y_train)

print('Best Parameters', clf.best_params_)
y_pred = clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters {'n_neighbors': 4, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       1.00      0.88      0.93        48
           1       0.90      1.00      0.95        57

    accuracy                           0.94       105
   macro avg       0.95      0.94      0.94       105
weighted avg       0.95      0.94      0.94       105



In [368]:
lr_params = {
    'C':np.logspace(-2,2,7), 
    'penalty':['l1', 'l2', 'elasticnet', None]
}

lr = LogisticRegression()
log = GridSearchCV(lr, lr_params, cv=cv)
log.fit(x_train_scaled, y_train)

print('Best Parameters ', log.best_params_)
y_pred = log.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters  {'C': 0.01, 'penalty': None}
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        48
           1       0.95      1.00      0.97        57

    accuracy                           0.97       105
   macro avg       0.97      0.97      0.97       105
weighted avg       0.97      0.97      0.97       105



420 fits failed out of a total of 840.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
210 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/linear_model/_logistic

In [369]:
svc_params = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':range(2,5)
}

svc = SVC()
svm = GridSearchCV(svc, svc_params, cv=cv)
svm.fit(x_train_scaled, y_train)

print('Best Parameters ', svm.best_params_)
y_pred = svm.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'degree': 4, 'kernel': 'poly'}
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        48
           1       0.93      1.00      0.97        57

    accuracy                           0.96       105
   macro avg       0.97      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105



In [370]:
rfc_params = {
    'max_depth':range(3,15),
    'criterion':['gini', 'entropy', 'log_loss'],
}

rfc = RandomForestClassifier()
rfc_clf = GridSearchCV(rfc, rfc_params, cv=cv)
rfc_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', rfc_clf.best_params_)
y_pred = rfc_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

In [None]:
dt_params = {
    'max_depth':range(2,15),
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

dt = DecisionTreeClassifier()
dt_clf = GridSearchCV(dt, dt_params, cv=cv)
dt_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', dt_clf.best_params_)
y_pred = dt_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'log_loss', 'max_depth': 13, 'splitter': 'best'}
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       139
           1       0.99      1.00      1.00       123

    accuracy                           1.00       262
   macro avg       1.00      1.00      1.00       262
weighted avg       1.00      1.00      1.00       262



# Cluster Sampling

In [None]:
def sample_cluster(dataframe, clusters, state = None):
    
    print('define variables')
    length = len(dataframe)
    print(f'  - length: {length}')
    element_max = length / clusters
    print(f'  - elements by cluster: {element_max}')
    
    cluster_list = []
    cluster_id = 0
    element_count = 0
    
    print('define clusters')
    for _ in df.iterrows():
        cluster_list.append(cluster_id)
        element_count += 1
        if element_count > (element_max - 1):
            element_count = 0
            cluster_id += 1
    
    dataframe['cluster'] = cluster_list
    print(' - cluster list')
    print(dataframe['cluster'].value_counts())
    print('')
    rd.seed(state)
    cluster_selected = rd.randint(0, clusters - 1)
    print('cluster selected:',cluster_selected)
    dataframe_clustered = dataframe[dataframe['cluster'] == cluster_selected]
    print('cluster size:',dataframe_clustered.shape[0],'\n')
    return dataframe_clustered

In [None]:
df_1 = df_new.sample(frac = 1)
df_1.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
179,-0.546215,0.878084,1.523176,-0.242853,0.323931,0.371852,0.446595,0.370077,-0.631359,-0.22907,...,-0.120936,-0.268618,0.036313,-0.312671,-0.378051,0.11366,0.292121,0.10373,6.45,0
382,-0.424666,0.554293,1.374234,-0.447451,0.288248,-1.082536,0.969035,-0.347446,-0.109678,-0.258587,...,-0.023296,0.103502,0.024513,0.425008,-0.227746,0.182264,0.07672,-0.098667,41.64,0
126,1.000234,-0.295227,1.308259,1.257753,-0.967532,0.460245,-0.78185,0.424415,0.790908,-0.054855,...,0.03273,0.179644,0.028786,0.203489,0.234992,-0.408459,0.072615,0.028892,35.97,0
346,1.202007,0.108213,0.591724,0.566079,-0.641573,-0.816974,-0.127403,0.004375,-0.026148,0.142131,...,-0.207683,-0.703844,0.16535,0.493736,0.114822,0.068947,-0.04135,0.008041,1.79,0
1147,-1.458732,0.080556,1.104362,-0.530468,0.79983,-0.364564,0.25204,0.108522,0.024557,-0.08662,...,-0.189646,-0.251927,-0.225423,-0.183544,-0.283531,0.226709,-0.315048,-0.344817,1.157992,1


In [None]:
df_sample_4 = sample_cluster(df_1, 4, 42)
df_sample_4.Class.value_counts()

define variables
  - length: 1308
  - elements by cluster: 327.0
define clusters
 - cluster list
0    327
1    327
2    327
3    327
Name: cluster, dtype: int64

cluster selected: 0
cluster size: 327 



1    173
0    154
Name: Class, dtype: int64

In [None]:
df_sample_4.drop(['cluster'], axis=1, inplace=True)
df_sample_4.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample_4.drop(['cluster'], axis=1, inplace=True)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
179,-0.546215,0.878084,1.523176,-0.242853,0.323931,0.371852,0.446595,0.370077,-0.631359,-0.22907,...,-0.120936,-0.268618,0.036313,-0.312671,-0.378051,0.11366,0.292121,0.10373,6.45,0
382,-0.424666,0.554293,1.374234,-0.447451,0.288248,-1.082536,0.969035,-0.347446,-0.109678,-0.258587,...,-0.023296,0.103502,0.024513,0.425008,-0.227746,0.182264,0.07672,-0.098667,41.64,0
126,1.000234,-0.295227,1.308259,1.257753,-0.967532,0.460245,-0.78185,0.424415,0.790908,-0.054855,...,0.03273,0.179644,0.028786,0.203489,0.234992,-0.408459,0.072615,0.028892,35.97,0
346,1.202007,0.108213,0.591724,0.566079,-0.641573,-0.816974,-0.127403,0.004375,-0.026148,0.142131,...,-0.207683,-0.703844,0.16535,0.493736,0.114822,0.068947,-0.04135,0.008041,1.79,0
1147,-1.458732,0.080556,1.104362,-0.530468,0.79983,-0.364564,0.25204,0.108522,0.024557,-0.08662,...,-0.189646,-0.251927,-0.225423,-0.183544,-0.283531,0.226709,-0.315048,-0.344817,1.157992,1


In [None]:
x_sample_4 = df_sample_4.drop(['Class'], axis=1)
y_sample_4 = df_sample_4.Class

In [None]:
y_sample_4.value_counts()

1    173
0    154
Name: Class, dtype: int64

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_sample_4, y_sample_4, test_size=0.2, random_state=42, stratify=y_sample_4)

In [None]:
mm = MinMaxScaler()

x_train_scaled = pd.DataFrame(mm.fit_transform(x_train))
x_test_scaled = mm.transform(x_test)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
x_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.133474,0.676077,0.58147,0.651642,0.422481,0.715741,0.194736,0.584736,0.485675,0.323487,...,0.143243,0.681342,0.123947,0.241204,0.004699,0.678788,0.20094,0.415467,0.7258,0.009208
1,0.492422,0.343814,0.810098,0.102075,0.475724,0.232539,0.495792,0.626536,0.667876,0.058118,...,0.211213,0.413455,0.963369,0.154358,0.52646,0.580464,0.034631,0.519362,0.576003,0.083224
2,0.947607,0.33668,0.545567,0.565112,0.342339,0.363537,0.289979,0.657161,0.286957,0.420519,...,0.088527,0.236015,0.40428,0.160866,0.292353,0.766373,0.217478,0.53593,0.64957,0.064208
3,0.318441,0.469341,0.853564,0.527957,0.397398,0.341545,0.410628,0.659933,0.443603,0.348181,...,0.128861,0.358416,0.776113,0.341009,0.62938,0.450188,0.34514,0.363967,0.775866,0.040984
4,0.972764,0.320578,0.583335,0.433951,0.322008,0.346539,0.248024,0.638461,0.33521,0.366869,...,0.244479,0.422357,0.915732,0.138585,0.320253,0.784981,0.283691,0.539352,0.646293,0.027309


In [None]:
#knn
knn_params = {
    'n_neighbors':range(3,15),
    'weights':['uniform', 'distance']
}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, knn_params, cv=cv)
clf.fit(x_train_scaled, y_train)

print('Best Parameters', clf.best_params_)
y_pred = clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters {'n_neighbors': 4, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       1.00      0.89      0.94        28
           1       0.93      1.00      0.96        38

    accuracy                           0.95        66
   macro avg       0.96      0.95      0.95        66
weighted avg       0.96      0.95      0.95        66



In [None]:
lr_params = {
    'C':np.logspace(-2,2,7), 
    'penalty':['l1', 'l2', 'elasticnet', None]
}

lr = LogisticRegression()
log = GridSearchCV(lr, lr_params, cv=cv)
log.fit(x_train_scaled, y_train)

print('Best Parameters ', log.best_params_)
y_pred = log.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters  {'C': 0.01, 'penalty': None}
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        28
           1       0.95      1.00      0.97        38

    accuracy                           0.97        66
   macro avg       0.97      0.96      0.97        66
weighted avg       0.97      0.97      0.97        66



420 fits failed out of a total of 840.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
210 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/linear_model/_logistic

In [None]:
svc_params = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':range(2,5)
}

svc = SVC()
svm = GridSearchCV(svc, svc_params, cv=cv)
svm.fit(x_train_scaled, y_train)

print('Best Parameters ', svm.best_params_)
y_pred = svm.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'degree': 4, 'kernel': 'poly'}
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        28
           1       0.95      1.00      0.97        38

    accuracy                           0.97        66
   macro avg       0.97      0.96      0.97        66
weighted avg       0.97      0.97      0.97        66



In [None]:
rfc_params = {
    'max_depth':range(3,15),
    'criterion':['gini', 'entropy', 'log_loss'],
}

rfc = RandomForestClassifier()
rfc_clf = GridSearchCV(rfc, rfc_params, cv=cv)
rfc_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', rfc_clf.best_params_)
y_pred = rfc_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'gini', 'max_depth': 11}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        38

    accuracy                           1.00        66
   macro avg       1.00      1.00      1.00        66
weighted avg       1.00      1.00      1.00        66



In [None]:
dt_params = {
    'max_depth':range(2,15),
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

dt = DecisionTreeClassifier()
dt_clf = GridSearchCV(dt, dt_params, cv=cv)
dt_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', dt_clf.best_params_)
y_pred = dt_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'gini', 'max_depth': 12, 'splitter': 'best'}
              precision    recall  f1-score   support

           0       0.96      0.86      0.91        28
           1       0.90      0.97      0.94        38

    accuracy                           0.92        66
   macro avg       0.93      0.92      0.92        66
weighted avg       0.93      0.92      0.92        66

