In [1]:
import pandas as pd
import numpy as np
import sklearn
import random as rd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.combine import SMOTEENN

In [2]:
df = pd.read_csv('Creditcard_data.csv')

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
df.drop(['Time'], axis=1, inplace=True)

In [5]:
df.isna().sum().sum()

0

In [6]:
df.Class.value_counts()

0    763
1      9
Name: Class, dtype: int64

In [7]:
x = df.drop(['Class'], axis=1)
y = df.Class

In [8]:
smt = SMOTEENN(random_state=42)
x, y = smt.fit_resample(x, y)
print(y.value_counts())

1    707
0    604
Name: Class, dtype: int64


In [9]:
df_new = pd.concat([x, y], axis=1)
df_new.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
1,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
2,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,-0.099254,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
3,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,1.249376,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
4,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,-0.366846,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [10]:
# drop 3 rows randomly from df
df_new.drop(df_new.sample(3).index, inplace=True)
df_new.shape

(1308, 30)

# Simple Random Sampling

In [11]:
df_sample_1 = df_new.sample(frac=0.4, axis=0, replace=True, random_state=42) 
df_sample_1.Class.value_counts()

1    311
0    212
Name: Class, dtype: int64

In [12]:
x_sample_1 = df_sample_1.drop(['Class'], axis=1)
y_sample_1 = df_sample_1.Class

In [13]:
y_sample_1.value_counts()

1    311
0    212
Name: Class, dtype: int64

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x_sample_1, y_sample_1, test_size=0.2, random_state=42, stratify=y_sample_1)

In [15]:
mm = MinMaxScaler()

x_train_scaled = pd.DataFrame(mm.fit_transform(x_train))
x_test_scaled = mm.transform(x_test)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
x_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.615313,0.436111,0.726514,0.578555,0.703345,0.134225,0.494412,0.735956,0.445687,0.243536,...,0.378532,0.227777,0.651948,0.311178,0.67892,0.683965,0.222018,0.189363,0.410777,0.00116
1,0.960612,0.484257,0.492225,0.603819,0.511035,0.144427,0.434792,0.727197,0.418572,0.276746,...,0.361179,0.17002,0.29256,0.425207,0.618866,0.646699,0.300516,0.218196,0.464726,0.002126
2,0.68408,0.390228,0.624051,0.592677,0.726607,0.287941,0.340824,0.7803,0.477874,0.278781,...,0.415772,0.217987,0.568043,0.461587,0.194932,0.161933,0.356687,0.256587,0.487996,0.001245
3,0.717947,0.566128,0.683116,0.534681,0.524104,0.093381,0.551174,0.727038,0.37615,0.267789,...,0.396403,0.174204,0.332776,0.368232,0.747525,0.486526,0.291452,0.305769,0.498608,0.001376
4,0.829386,0.491501,0.589266,0.59313,0.559803,0.079806,0.506241,0.711574,0.429562,0.256997,...,0.356727,0.187787,0.407289,0.363257,0.747797,0.687475,0.242436,0.205296,0.444734,0.001245


In [16]:
#knn
knn_params = {
    'n_neighbors':range(1,15),
    'weights':['uniform', 'distance']
}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, knn_params, cv=cv)
clf.fit(x_train_scaled, y_train)

print('Best Parameters', clf.best_params_)
y_pred = clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters {'n_neighbors': 1, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       1.00      0.91      0.95        43
           1       0.94      1.00      0.97        62

    accuracy                           0.96       105
   macro avg       0.97      0.95      0.96       105
weighted avg       0.96      0.96      0.96       105



In [17]:
lr_params = {
    'C':np.logspace(-2,2,7), 
    'penalty':['l1', 'l2', 'elasticnet', None]
}

lr = LogisticRegression()
log = GridSearchCV(lr, lr_params, cv=cv)
log.fit(x_train_scaled, y_train)

print('Best Parameters ', log.best_params_)
y_pred = log.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters  {'C': 0.01, 'penalty': None}
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        43
           1       0.93      1.00      0.96        62

    accuracy                           0.95       105
   macro avg       0.96      0.94      0.95       105
weighted avg       0.96      0.95      0.95       105



In [18]:
svc_params = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':range(2,5)
}

svc = SVC()
svm = GridSearchCV(svc, svc_params, cv=cv)
svm.fit(x_train_scaled, y_train)

print('Best Parameters ', svm.best_params_)
y_pred = svm.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'degree': 3, 'kernel': 'poly'}
              precision    recall  f1-score   support

           0       1.00      0.91      0.95        43
           1       0.94      1.00      0.97        62

    accuracy                           0.96       105
   macro avg       0.97      0.95      0.96       105
weighted avg       0.96      0.96      0.96       105



In [19]:
rfc_params = {
    'max_depth':range(3,15),
    'criterion':['gini', 'entropy', 'log_loss'],
}

rfc = RandomForestClassifier()
rfc_clf = GridSearchCV(rfc, rfc_params, cv=cv)
rfc_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', rfc_clf.best_params_)
y_pred = rfc_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'log_loss', 'max_depth': 13}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        62

    accuracy                           1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105



In [20]:
dt_params = {
    'max_depth':range(2,15),
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

dt = DecisionTreeClassifier()
dt_clf = GridSearchCV(dt, dt_params, cv=cv)
dt_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', dt_clf.best_params_)
y_pred = dt_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'log_loss', 'max_depth': 7, 'splitter': 'best'}
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        43
           1       0.97      1.00      0.98        62

    accuracy                           0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



# Systematic Sampling

In [21]:
def systematic_sampling(df_func, step):
 
    indexes = np.arange(0, len(df_func), step=step)
    systematic_sample = df_func.iloc[indexes]
    return systematic_sample

In [22]:
df_sample_2 = systematic_sampling(df_new, 3)
df_sample_2.Class.value_counts()

1    235
0    201
Name: Class, dtype: int64

In [23]:
x_sample_2 = df_sample_2.drop(['Class'], axis=1)
y_sample_2 = df_sample_2.Class

In [24]:
y_sample_2.value_counts()

1    235
0    201
Name: Class, dtype: int64

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x_sample_2, y_sample_2, test_size=0.2, random_state=42, stratify=y_sample_2)

In [26]:
mm = MinMaxScaler()

x_train_scaled = pd.DataFrame(mm.fit_transform(x_train))
x_test_scaled = mm.transform(x_test)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
x_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.836558,0.469852,0.193732,0.39421,1.0,0.918914,0.514322,0.856659,0.349405,0.335,...,0.303269,0.178414,0.650904,0.277497,0.985774,0.252436,0.091453,0.497163,0.424317,0.013648
1,0.935695,0.311596,0.485101,0.325167,0.260928,0.175853,0.353223,0.786402,0.738585,0.17344,...,0.255093,0.199846,0.80245,0.197652,0.789939,0.821084,0.083492,0.506833,0.456799,0.094262
2,0.956556,0.427103,0.38543,0.476686,0.475379,0.242006,0.423354,0.786884,0.402144,0.281872,...,0.255079,0.134037,0.38145,0.321383,0.529396,0.638816,0.390783,0.486286,0.449308,0.003675
3,0.783048,0.435131,0.54072,0.451697,0.573656,0.174616,0.510764,0.771962,0.407236,0.258727,...,0.248685,0.160401,0.539953,0.22848,0.648134,0.687536,0.236648,0.475775,0.421332,0.002401
4,0.768528,0.456541,0.445751,0.415311,0.622508,0.317758,0.484161,0.799088,0.420188,0.286514,...,0.262485,0.14965,0.46933,0.32731,0.168156,0.032553,0.374422,0.526253,0.506909,0.001354


In [27]:
#knn
knn_params = {
    'n_neighbors':range(1,15),
    'weights':['uniform', 'distance']
}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, knn_params, cv=cv)
clf.fit(x_train_scaled, y_train)

print('Best Parameters', clf.best_params_)
y_pred = clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters {'n_neighbors': 2, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        41
           1       0.96      1.00      0.98        47

    accuracy                           0.98        88
   macro avg       0.98      0.98      0.98        88
weighted avg       0.98      0.98      0.98        88



In [28]:
lr_params = {
    'C':np.logspace(-2,2,7), 
    'penalty':['l1', 'l2', 'elasticnet', None]
}

lr = LogisticRegression()
log = GridSearchCV(lr, lr_params, cv=cv)
log.fit(x_train_scaled, y_train)

print('Best Parameters ', log.best_params_)
y_pred = log.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters  {'C': 0.01, 'penalty': None}
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        41
           1       0.89      1.00      0.94        47

    accuracy                           0.93        88
   macro avg       0.94      0.93      0.93        88
weighted avg       0.94      0.93      0.93        88



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [29]:
svc_params = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':range(2,5)
}

svc = SVC()
svm = GridSearchCV(svc, svc_params, cv=cv)
svm.fit(x_train_scaled, y_train)

print('Best Parameters ', svm.best_params_)
y_pred = svm.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'degree': 3, 'kernel': 'poly'}
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        41
           1       0.94      1.00      0.97        47

    accuracy                           0.97        88
   macro avg       0.97      0.96      0.97        88
weighted avg       0.97      0.97      0.97        88



In [30]:
rfc_params = {
    'max_depth':range(3,15),
    'criterion':['gini', 'entropy', 'log_loss'],
}

rfc = RandomForestClassifier()
rfc_clf = GridSearchCV(rfc, rfc_params, cv=cv)
rfc_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', rfc_clf.best_params_)
y_pred = rfc_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'gini', 'max_depth': 9}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        47

    accuracy                           1.00        88
   macro avg       1.00      1.00      1.00        88
weighted avg       1.00      1.00      1.00        88



In [31]:
dt_params = {
    'max_depth':range(2,15),
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

dt = DecisionTreeClassifier()
dt_clf = GridSearchCV(dt, dt_params, cv=cv)
dt_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', dt_clf.best_params_)
y_pred = dt_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'gini', 'max_depth': 8, 'splitter': 'best'}
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        41
           1       0.98      1.00      0.99        47

    accuracy                           0.99        88
   macro avg       0.99      0.99      0.99        88
weighted avg       0.99      0.99      0.99        88



# Stratified Sampling

In [32]:
df_stratified = df_new.groupby('Class', group_keys=False).apply(lambda x: x.sample(frac=0.4))
display(df_stratified)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
337,1.182238,-0.147518,1.248518,0.862034,-1.076728,-0.299690,-0.614840,0.064575,0.907341,-0.244554,...,-0.090194,-0.057346,0.030121,0.438310,0.259808,0.315896,0.017962,0.027611,8.580000,0
478,-0.386726,0.172565,0.732583,-1.434662,0.033065,-1.043657,0.720715,-0.150763,0.422942,-1.225085,...,0.143196,0.402446,-0.020879,0.121543,-0.194311,0.114302,0.114048,0.135687,55.000000,0
376,1.135629,-0.173986,0.730692,0.711558,-0.854209,-0.494951,-0.296620,0.110536,0.490269,-0.019984,...,-0.243930,-0.682700,0.096611,0.519267,0.201128,0.259544,-0.039581,0.007195,23.880000,0
592,1.166360,0.005061,0.497768,0.798920,-0.365524,-0.233421,-0.074210,-0.008325,0.437687,-0.247289,...,-0.195728,-0.365798,0.030729,0.123133,0.381749,0.296735,-0.007175,0.011905,18.560000,0
510,-0.239505,-3.940241,-0.147576,-0.671347,-2.239256,0.908178,-0.377398,0.157943,-1.595928,0.987881,...,0.076296,-1.132178,-0.486820,-0.302911,-0.304121,-0.469811,-0.077517,0.151745,834.840000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,-0.468372,0.468345,1.149384,0.151595,0.900549,-0.372026,0.611564,-0.020689,-0.133366,-0.376419,...,-0.023201,-0.023074,-0.070250,-0.417549,-0.425973,-0.210926,0.049863,0.037605,0.995410,1
727,0.366813,0.206066,0.453816,0.137104,0.302315,-0.141324,0.019142,0.100736,-0.176906,-0.138660,...,-0.212760,-0.512925,-0.000959,-0.311025,0.025777,0.158111,-0.103623,-0.097411,2.214193,1
998,0.809912,0.425902,0.358763,0.475344,0.078428,-0.572401,0.165707,-0.073890,-0.024436,-0.229884,...,-0.227765,-0.662899,0.136720,-0.302186,-0.388616,0.087437,0.072735,0.105954,2.049665,1
1132,-1.137531,1.429704,-0.979134,2.909645,-0.468093,-1.308395,-1.671436,0.864531,-1.853831,-1.954165,...,0.252646,-0.293555,-0.270125,0.327743,0.102630,0.150381,0.167914,-0.085949,0.424476,1


In [33]:
x_sample_3 = df_stratified.drop(['Class'], axis=1)
y_sample_3 = df_stratified.Class

In [34]:
y_sample_3.value_counts()

1    282
0    241
Name: Class, dtype: int64

In [35]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_sample_3 = label_encoder.fit_transform(y_sample_3)

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x_sample_3, y_sample_3, test_size=0.2, random_state=42, stratify=y_sample_3)

In [37]:
mm = MinMaxScaler()

x_train_scaled = pd.DataFrame(mm.fit_transform(x_train))
x_test_scaled = mm.transform(x_test)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
x_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.465225,0.893013,0.284475,0.969499,0.564767,0.052561,0.030439,0.943527,0.026386,0.016798,...,0.339335,0.306273,0.504482,0.535213,0.733877,0.601763,0.285303,0.638515,0.808977,8.8e-05
1,0.733675,0.6777,0.678538,0.565819,0.685866,0.135049,0.47272,0.754402,0.326012,0.198539,...,0.29799,0.234931,0.518178,0.600139,0.751938,0.71935,0.172005,0.560784,0.824983,0.000953
2,0.749185,0.652114,0.595674,0.532046,0.650191,0.181488,0.403748,0.771334,0.339821,0.225057,...,0.310882,0.206836,0.368785,0.619734,0.643159,0.559649,0.283379,0.537417,0.797858,0.001009
3,0.6719,0.812039,0.716905,0.49148,0.69009,0.23682,0.595513,0.673075,0.428595,0.345645,...,0.460839,0.163665,0.366615,0.606674,0.623978,0.500623,0.243411,0.564081,0.656408,0.01401
4,0.568633,0.507923,0.781491,0.592502,0.797979,0.253629,0.332063,0.805694,0.373351,0.199395,...,0.35142,0.270667,0.73973,0.624198,0.580798,0.63985,0.259509,0.552743,0.804636,0.001049


In [38]:
#knn
knn_params = {
    'n_neighbors':range(3,15),
    'weights':['uniform', 'distance']
}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, knn_params, cv=cv)
clf.fit(x_train_scaled, y_train)

print('Best Parameters', clf.best_params_)
y_pred = clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters {'n_neighbors': 4, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        48
           1       0.92      1.00      0.96        57

    accuracy                           0.95       105
   macro avg       0.96      0.95      0.95       105
weighted avg       0.96      0.95      0.95       105



In [39]:
lr_params = {
    'C':np.logspace(-2,2,7), 
    'penalty':['l1', 'l2', 'elasticnet', None]
}

lr = LogisticRegression()
log = GridSearchCV(lr, lr_params, cv=cv)
log.fit(x_train_scaled, y_train)

print('Best Parameters ', log.best_params_)
y_pred = log.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters  {'C': 0.01, 'penalty': None}
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        48
           1       0.93      1.00      0.97        57

    accuracy                           0.96       105
   macro avg       0.97      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
420 fits failed out of a total of 840.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the f

In [40]:
svc_params = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':range(2,5)
}

svc = SVC()
svm = GridSearchCV(svc, svc_params, cv=cv)
svm.fit(x_train_scaled, y_train)

print('Best Parameters ', svm.best_params_)
y_pred = svm.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'degree': 4, 'kernel': 'poly'}
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        48
           1       0.97      1.00      0.98        57

    accuracy                           0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



In [41]:
rfc_params = {
    'max_depth':range(3,15),
    'criterion':['gini', 'entropy', 'log_loss'],
}

rfc = RandomForestClassifier()
rfc_clf = GridSearchCV(rfc, rfc_params, cv=cv)
rfc_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', rfc_clf.best_params_)
y_pred = rfc_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'gini', 'max_depth': 12}
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        48
           1       0.98      1.00      0.99        57

    accuracy                           0.99       105
   macro avg       0.99      0.99      0.99       105
weighted avg       0.99      0.99      0.99       105



In [42]:
dt_params = {
    'max_depth':range(2,15),
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

dt = DecisionTreeClassifier()
dt_clf = GridSearchCV(dt, dt_params, cv=cv)
dt_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', dt_clf.best_params_)
y_pred = dt_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'gini', 'max_depth': 7, 'splitter': 'best'}
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        48
           1       0.98      1.00      0.99        57

    accuracy                           0.99       105
   macro avg       0.99      0.99      0.99       105
weighted avg       0.99      0.99      0.99       105



# Cluster Sampling

In [43]:
def sample_cluster(dataframe, clusters, state = None):
    
    print('define variables')
    length = len(dataframe)
    print(f'  - length: {length}')
    element_max = length / clusters
    print(f'  - elements by cluster: {element_max}')
    
    cluster_list = []
    cluster_id = 0
    element_count = 0
    
    print('define clusters')
    for _ in df.iterrows():
        cluster_list.append(cluster_id)
        element_count += 1
        if element_count > (element_max - 1):
            element_count = 0
            cluster_id += 1
    
    dataframe['cluster'] = cluster_list
    print(' - cluster list')
    print(dataframe['cluster'].value_counts())
    print('')
    rd.seed(state)
    cluster_selected = rd.randint(0, clusters - 1)
    print('cluster selected:',cluster_selected)
    dataframe_clustered = dataframe[dataframe['cluster'] == cluster_selected]
    print('cluster size:',dataframe_clustered.shape[0],'\n')
    return dataframe_clustered

In [44]:
df_1 = df_new.sample(frac = 1)
df_1.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
255,-1.142321,0.626405,2.526917,2.827973,0.619263,0.897473,0.536278,-0.060163,-0.813749,1.426859,...,-0.309746,-0.269173,0.177396,-0.019578,0.048651,0.068831,-0.246503,-0.230837,10.62,0
589,0.186118,-0.159358,-1.389222,-2.433996,1.753378,3.593082,-1.582165,-1.903514,-1.314805,-0.327551,...,-1.084401,0.656716,-0.139815,1.036164,0.804413,-0.24209,0.103552,0.27618,48.0,0
211,-0.278288,0.866214,-0.30062,-1.117345,2.542242,3.270646,0.117003,0.934686,-0.503871,-0.445162,...,-0.320542,-1.005672,-0.055449,0.93579,-0.007564,0.088109,0.248043,0.090837,0.89,0
1251,1.255579,0.361536,0.30236,0.68276,-0.344218,-1.035476,0.088649,-0.200703,0.005687,-0.282145,...,-0.284872,-0.814745,0.126806,0.320939,0.219367,0.095415,-0.02176,0.030417,1.335484,1
169,-2.420413,1.947885,0.553646,0.983069,-0.281518,2.408958,-1.401613,-0.188299,0.675878,0.158497,...,1.213826,-1.23862,0.006927,-1.724222,0.239603,-0.313703,-0.188281,0.119831,6.0,0


In [None]:
df_sample_4 = sample_cluster(df_1, 4, 42)
df_sample_4.Class.value_counts()

In [None]:
df_sample_4.drop(['cluster'], axis=1, inplace=True)
df_sample_4.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample_4.drop(['cluster'], axis=1, inplace=True)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
179,-0.546215,0.878084,1.523176,-0.242853,0.323931,0.371852,0.446595,0.370077,-0.631359,-0.22907,...,-0.120936,-0.268618,0.036313,-0.312671,-0.378051,0.11366,0.292121,0.10373,6.45,0
382,-0.424666,0.554293,1.374234,-0.447451,0.288248,-1.082536,0.969035,-0.347446,-0.109678,-0.258587,...,-0.023296,0.103502,0.024513,0.425008,-0.227746,0.182264,0.07672,-0.098667,41.64,0
126,1.000234,-0.295227,1.308259,1.257753,-0.967532,0.460245,-0.78185,0.424415,0.790908,-0.054855,...,0.03273,0.179644,0.028786,0.203489,0.234992,-0.408459,0.072615,0.028892,35.97,0
346,1.202007,0.108213,0.591724,0.566079,-0.641573,-0.816974,-0.127403,0.004375,-0.026148,0.142131,...,-0.207683,-0.703844,0.16535,0.493736,0.114822,0.068947,-0.04135,0.008041,1.79,0
1147,-1.458732,0.080556,1.104362,-0.530468,0.79983,-0.364564,0.25204,0.108522,0.024557,-0.08662,...,-0.189646,-0.251927,-0.225423,-0.183544,-0.283531,0.226709,-0.315048,-0.344817,1.157992,1


In [None]:
x_sample_4 = df_sample_4.drop(['Class'], axis=1)
y_sample_4 = df_sample_4.Class

In [None]:
y_sample_4.value_counts()

1    173
0    154
Name: Class, dtype: int64

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_sample_4, y_sample_4, test_size=0.2, random_state=42, stratify=y_sample_4)

In [None]:
mm = MinMaxScaler()

x_train_scaled = pd.DataFrame(mm.fit_transform(x_train))
x_test_scaled = mm.transform(x_test)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
x_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.133474,0.676077,0.58147,0.651642,0.422481,0.715741,0.194736,0.584736,0.485675,0.323487,...,0.143243,0.681342,0.123947,0.241204,0.004699,0.678788,0.20094,0.415467,0.7258,0.009208
1,0.492422,0.343814,0.810098,0.102075,0.475724,0.232539,0.495792,0.626536,0.667876,0.058118,...,0.211213,0.413455,0.963369,0.154358,0.52646,0.580464,0.034631,0.519362,0.576003,0.083224
2,0.947607,0.33668,0.545567,0.565112,0.342339,0.363537,0.289979,0.657161,0.286957,0.420519,...,0.088527,0.236015,0.40428,0.160866,0.292353,0.766373,0.217478,0.53593,0.64957,0.064208
3,0.318441,0.469341,0.853564,0.527957,0.397398,0.341545,0.410628,0.659933,0.443603,0.348181,...,0.128861,0.358416,0.776113,0.341009,0.62938,0.450188,0.34514,0.363967,0.775866,0.040984
4,0.972764,0.320578,0.583335,0.433951,0.322008,0.346539,0.248024,0.638461,0.33521,0.366869,...,0.244479,0.422357,0.915732,0.138585,0.320253,0.784981,0.283691,0.539352,0.646293,0.027309


In [None]:
#knn
knn_params = {
    'n_neighbors':range(3,15),
    'weights':['uniform', 'distance']
}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, knn_params, cv=cv)
clf.fit(x_train_scaled, y_train)

print('Best Parameters', clf.best_params_)
y_pred = clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters {'n_neighbors': 4, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       1.00      0.89      0.94        28
           1       0.93      1.00      0.96        38

    accuracy                           0.95        66
   macro avg       0.96      0.95      0.95        66
weighted avg       0.96      0.95      0.95        66



In [None]:
lr_params = {
    'C':np.logspace(-2,2,7), 
    'penalty':['l1', 'l2', 'elasticnet', None]
}

lr = LogisticRegression()
log = GridSearchCV(lr, lr_params, cv=cv)
log.fit(x_train_scaled, y_train)

print('Best Parameters ', log.best_params_)
y_pred = log.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters  {'C': 0.01, 'penalty': None}
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        28
           1       0.95      1.00      0.97        38

    accuracy                           0.97        66
   macro avg       0.97      0.96      0.97        66
weighted avg       0.97      0.97      0.97        66



420 fits failed out of a total of 840.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
210 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/nitanshjain/.local/share/virtualenvs/Pred_Analytics-YCssu9nt/lib/python3.10/site-packages/sklearn/linear_model/_logistic

In [None]:
svc_params = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':range(2,5)
}

svc = SVC()
svm = GridSearchCV(svc, svc_params, cv=cv)
svm.fit(x_train_scaled, y_train)

print('Best Parameters ', svm.best_params_)
y_pred = svm.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'degree': 4, 'kernel': 'poly'}
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        28
           1       0.95      1.00      0.97        38

    accuracy                           0.97        66
   macro avg       0.97      0.96      0.97        66
weighted avg       0.97      0.97      0.97        66



In [None]:
rfc_params = {
    'max_depth':range(3,15),
    'criterion':['gini', 'entropy', 'log_loss'],
}

rfc = RandomForestClassifier()
rfc_clf = GridSearchCV(rfc, rfc_params, cv=cv)
rfc_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', rfc_clf.best_params_)
y_pred = rfc_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'gini', 'max_depth': 11}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        38

    accuracy                           1.00        66
   macro avg       1.00      1.00      1.00        66
weighted avg       1.00      1.00      1.00        66



In [None]:
dt_params = {
    'max_depth':range(2,15),
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

dt = DecisionTreeClassifier()
dt_clf = GridSearchCV(dt, dt_params, cv=cv)
dt_clf.fit(x_train_scaled, y_train)

print('Best Parameters ', dt_clf.best_params_)
y_pred = dt_clf.predict(x_test_scaled)
print(classification_report(y_test, y_pred))

Best Parameters  {'criterion': 'gini', 'max_depth': 12, 'splitter': 'best'}
              precision    recall  f1-score   support

           0       0.96      0.86      0.91        28
           1       0.90      0.97      0.94        38

    accuracy                           0.92        66
   macro avg       0.93      0.92      0.92        66
weighted avg       0.93      0.92      0.92        66



# Snowball Sampling

In [None]:
# Select the initial sample
initial_sample = df_new[df_new['Class'] == True]

# Create an empty list to store the snowball sample
snowball_sample = []

# Add the initial sample to the snowball sample
snowball_sample.extend(initial_sample)

# Define the maximum number of iterations
max_iterations = 10

# Start the snowball sampling
for i in range(max_iterations):
    # Select the referrals from the previous sample
    referrals = df_new[df_new['referral'].isin(snowball_sample['id'])]
    
    # Select a random sample of referrals
    new_sample = referrals.sample(n=10)
    
    # Add the new sample to the snowball sample
    snowball_sample.extend(new_sample)
    
# Convert the snowball sample to a dataframe
snowball_sample_df = pd.DataFrame(snowball_sample)

In [None]:
snowball_sample_df.head()