In [4]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/Creditcard_data.csv")

print(df.head())
print(df.shape)


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [2]:
print(df["Class"].value_counts())


Class
0    763
1      9
Name: count, dtype: int64


In [7]:
from imblearn.over_sampling import SMOTE

X = df.drop("Class", axis=1)
Y = df["Class"]

smote = SMOTE(random_state=42)
X_balanced, Y_balanced = smote.fit_resample(X, Y)

print("Balanced Class Counts:")
print(Y_balanced.value_counts())


Balanced Class Counts:
Class
0    763
1    763
Name: count, dtype: int64


In [9]:
from sklearn.model_selection import train_test_split

samples = []

for i in range(5):
    X_sample, _, Y_sample, _ = train_test_split(
        X_balanced, Y_balanced,
        train_size=0.3,
        random_state=i
    )
    samples.append((X_sample, Y_sample))

print("Created 5 samples.")

Created 5 samples.


In [10]:
from imblearn.over_sampling import RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

models = {
    "M1": LogisticRegression(),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(),
    "M4": KNeighborsClassifier(),
    "M5": SVC()
}

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

sampling_methods = {
    "Sampling1": RandomOverSampler(),
    "Sampling2": RandomUnderSampler(),
    "Sampling3": SMOTE(),
    "Sampling4": ADASYN(),
    "Sampling5": SMOTETomek()
}

results = {}

for model_name, model in models.items():
    results[model_name] = {}

    for samp_name, sampler in sampling_methods.items():

        X_res, y_res = sampler.fit_resample(X, Y)

        X_train, X_test, y_train, y_test = train_test_split(
            X_res, y_res, test_size=0.2, random_state=42
        )

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)

        results[model_name][samp_name] = acc

print(results)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'M1': {'Sampling1': 0.9150326797385621, 'Sampling2': 0.25, 'Sampling3': 0.8986928104575164, 'Sampling4': 0.9117647058823529, 'Sampling5': 0.919732441471572}, 'M2': {'Sampling1': 1.0, 'Sampling2': 0.5, 'Sampling3': 0.9705882352941176, 'Sampling4': 0.9673202614379085, 'Sampling5': 0.9832775919732442}, 'M3': {'Sampling1': 1.0, 'Sampling2': 0.0, 'Sampling3': 0.9934640522875817, 'Sampling4': 0.9934640522875817, 'Sampling5': 0.9966442953020134}, 'M4': {'Sampling1': 0.9869281045751634, 'Sampling2': 0.0, 'Sampling3': 0.8627450980392157, 'Sampling4': 0.8333333333333334, 'Sampling5': 0.87248322147651}, 'M5': {'Sampling1': 0.673202614379085, 'Sampling2': 0.0, 'Sampling3': 0.6699346405228758, 'Sampling4': 0.6830065359477124, 'Sampling5': 0.6621621621621622}}


In [14]:
for model in results:
    best_sampling = max(results[model], key=results[model].get)
    print(model, "best with", best_sampling)


M1 best with Sampling5
M2 best with Sampling1
M3 best with Sampling1
M4 best with Sampling1
M5 best with Sampling4


In [15]:
import pandas as pd

accuracy_table = pd.DataFrame(results)

accuracy_table = accuracy_table.T

print(accuracy_table)


    Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1   0.915033       0.25   0.898693   0.911765   0.919732
M2   1.000000       0.50   0.970588   0.967320   0.983278
M3   1.000000       0.00   0.993464   0.993464   0.996644
M4   0.986928       0.00   0.862745   0.833333   0.872483
M5   0.673203       0.00   0.669935   0.683007   0.662162


In [16]:
accuracy_table = accuracy_table * 100

accuracy_table = accuracy_table.round(2)

print(accuracy_table)


    Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1      91.50       25.0      89.87      91.18      91.97
M2     100.00       50.0      97.06      96.73      98.33
M3     100.00        0.0      99.35      99.35      99.66
M4      98.69        0.0      86.27      83.33      87.25
M5      67.32        0.0      66.99      68.30      66.22


In [17]:
from tabulate import tabulate

print(tabulate(accuracy_table, headers="keys", tablefmt="grid"))


+----+-------------+-------------+-------------+-------------+-------------+
|    |   Sampling1 |   Sampling2 |   Sampling3 |   Sampling4 |   Sampling5 |
| M1 |       91.5  |          25 |       89.87 |       91.18 |       91.97 |
+----+-------------+-------------+-------------+-------------+-------------+
| M2 |      100    |          50 |       97.06 |       96.73 |       98.33 |
+----+-------------+-------------+-------------+-------------+-------------+
| M3 |      100    |           0 |       99.35 |       99.35 |       99.66 |
+----+-------------+-------------+-------------+-------------+-------------+
| M4 |       98.69 |           0 |       86.27 |       83.33 |       87.25 |
+----+-------------+-------------+-------------+-------------+-------------+
| M5 |       67.32 |           0 |       66.99 |       68.3  |       66.22 |
+----+-------------+-------------+-------------+-------------+-------------+


In [18]:
accuracy_table.to_csv("Accuracy_Table.csv")
