In [None]:
#1.
import pandas as pd

# Load dataset
df = pd.read_csv("Creditcard_data.csv")

# Basic checks
print(df.head())
print(df.shape)
print(df['Class'].value_counts())


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [None]:
#2.
X = df.drop('Class', axis=1)
y = df['Class']


In [4]:
pip install imbalanced-learn


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [5]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_balanced, y_balanced = rus.fit_resample(X, y)


In [6]:
print(pd.Series(y_balanced).value_counts())


Class
0    9
1    9
Name: count, dtype: int64


In [7]:
balanced_df = pd.concat([X_balanced, y_balanced], axis=1)
print(balanced_df.shape)


(18, 31)


In [8]:
#3.
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)


In [9]:
sample1 = balanced_df.sample(frac=0.8, random_state=1)
sample2 = balanced_df.sample(frac=0.8, random_state=2)
sample3 = balanced_df.sample(frac=0.8, random_state=3)
sample4 = balanced_df.sample(frac=0.8, random_state=4)
sample5 = balanced_df.sample(frac=0.8, random_state=5)


In [10]:
print(sample1.shape)
print(sample2.shape)
print(sample3.shape)
print(sample4.shape)
print(sample5.shape)


(14, 31)
(14, 31)
(14, 31)
(14, 31)
(14, 31)


In [11]:
print(sample1['Class'].value_counts())


Class
0    8
1    6
Name: count, dtype: int64


In [21]:
#4.
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek

sampling_techniques = {
    "Sampling1": RandomUnderSampler(random_state=42),
    "Sampling2": RandomOverSampler(random_state=42),
    "Sampling3": SMOTE(random_state=42, k_neighbors=1),
    "Sampling4": SMOTEENN(random_state=42, smote=SMOTE(k_neighbors=1)),
    "Sampling5": SMOTETomek(random_state=42, smote=SMOTE(k_neighbors=1))
}


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(random_state=42),
    "M3": RandomForestClassifier(random_state=42),
    "M4": SVC(),
    "M5": KNeighborsClassifier(n_neighbors=3)

}


In [23]:
#5.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
samples = {
    "Sample1": sample1,
    "Sample2": sample2,
    "Sample3": sample3,
    "Sample4": sample4,
    "Sample5": sample5
}


In [24]:
results = pd.DataFrame(
    index=models.keys(),
    columns=sampling_techniques.keys()
)


In [27]:
for sample_name, sample_df in samples.items():
    
    X = sample_df.drop('Class', axis=1)
    y = sample_df['Class']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    
    for samp_name, sampler in sampling_techniques.items():
        
       X_res, y_res = sampler.fit_resample(X_train, y_train)

for model_name, model in models.items():
    try:
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results.loc[model_name, samp_name] = round(acc * 100, 2)
    except:
        # If model fails due to single class or any issue
        results.loc[model_name, samp_name] = 0


In [28]:
print(results)


   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1      60.0      60.0      60.0      80.0      60.0
M2      80.0      80.0      80.0      80.0      80.0
M3      60.0      60.0      80.0      80.0      40.0
M4      60.0      60.0      60.0      80.0      60.0
M5      60.0      60.0      60.0      80.0      60.0


In [None]:
# Model-wise Best Sampling Technique

# M1 (Logistic Regression) → Sampling4 (SMOTEENN) gives highest accuracy (80%)

# M2 (Decision Tree) → All sampling techniques perform equally well (80%)

# M3 (Random Forest) → Sampling3 (SMOTE) and Sampling4 (SMOTEENN) give highest accuracy (80%)

# M4 (SVM) → Sampling4 (SMOTEENN) gives highest accuracy (80%)

# M5 (KNN) → Sampling4 (SMOTEENN) gives highest accuracy (80%)

#  Overall best performing sampling technique: Sampling4 (SMOTEENN)