In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Models (M1-M5)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler



In [7]:
# Location of the credit-card CSV (raw file in a GitHub repository).
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
# Read the CSV directly from the URL; this cell needs network access.
df = pd.read_csv(url)

# Sanity check: print the frame dimensions, then preview the first rows
# (the bare last expression is what the notebook renders as a table).
print(df.shape)
df.head()


(772, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [8]:
# Separate the feature columns from the "Class" target before rebalancing.
X = df.drop("Class", axis=1)
y = df["Class"]

# Randomly oversample so the classes end up with equal row counts.
# NOTE(review): the resampling is applied to the full dataset, and the
# train/test split only happens later inside each drawn sample. Copies of
# the same minority row can therefore appear in both the train and the test
# partition, which likely inflates the reported accuracies -- confirm
# whether the split should happen before oversampling.
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y)

# Recombine resampled features and target into a single frame, which is the
# input handed to every sampling helper below.
balanced_df = pd.DataFrame(X_bal, columns=X.columns)
balanced_df["Class"] = y_bal

print("Balanced Class Distribution:\n", balanced_df["Class"].value_counts())


Balanced Class Distribution:
 Class
0    763
1    763
Name: count, dtype: int64


In [9]:
def simple_random_sampling(data, frac=0.3, seed=42):
    """Draw a simple random sample of rows from ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to sample from.
    frac : float, default 0.3
        Fraction of rows to keep.
    seed : int, default 42
        Passed to ``DataFrame.sample`` as ``random_state`` so the draw is
        repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled rows; the original index labels are kept (no reset).
    """
    sampling_options = {"frac": frac, "random_state": seed}
    return data.sample(**sampling_options)


In [10]:
def stratified_sampling(data, target_col="Class", frac=0.3, seed=42):
    """Sample the same fraction of rows from every class, then shuffle.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to sample from; must contain ``target_col``.
    target_col : str, default "Class"
        Column whose distinct values define the strata.
    frac : float, default 0.3
        Fraction of rows drawn from each stratum.
    seed : int, default 42
        ``random_state`` used for every per-class draw and for the final
        shuffle.

    Returns
    -------
    pandas.DataFrame
        The per-class samples concatenated, shuffled, and re-indexed from 0.
    """
    labels = data[target_col]
    # One draw per class, in order of first appearance of each label.
    per_class = [
        data[labels == cls].sample(frac=frac, random_state=seed)
        for cls in labels.unique()
    ]
    combined = pd.concat(per_class)
    # frac=1 returns every row in random order, so classes are interleaved.
    shuffled = combined.sample(frac=1, random_state=seed)
    return shuffled.reset_index(drop=True)


In [11]:
def systematic_sampling(data, step=3, start=None):
    """Take every ``step``-th row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to sample from; rows are taken in their current order.
    step : int, default 3
        Sampling interval. Must be non-zero.
    start : int or None, default None
        Positional offset of the first selected row. ``None`` keeps the
        previous behaviour (``data.iloc[::step]``); pass a random offset in
        ``[0, step)`` to avoid always including the first row.

    Returns
    -------
    pandas.DataFrame
        The selected rows, re-indexed from 0.

    Raises
    ------
    ValueError
        If ``step`` is 0.
    """
    if step == 0:
        raise ValueError("step must be a non-zero integer")
    return data.iloc[start::step].reset_index(drop=True)


In [12]:
def cluster_sampling(data, n_clusters=10, clusters_to_pick=3, seed=42):
    """Shuffle the rows, cut them into contiguous clusters, keep a few clusters.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to sample from.
    n_clusters : int, default 10
        Number of contiguous chunks the shuffled frame is cut into. Chunks
        have ``len(data) // n_clusters`` rows; the last one also takes the
        remainder.
    clusters_to_pick : int, default 3
        Number of distinct clusters to keep (drawn without replacement).
    seed : int, default 42
        Seeds both the row shuffle and the cluster choice.

    Returns
    -------
    pandas.DataFrame
        Rows of the chosen clusters, concatenated and re-indexed from 0.

    Notes
    -----
    The cluster choice uses a private ``RandomState`` rather than
    ``np.random.seed``, so calling this function does not reseed NumPy's
    global generator. ``RandomState(seed)`` produces the same stream the
    global generator would after ``np.random.seed(seed)``.
    """
    rng = np.random.RandomState(seed)

    shuffled = data.sample(frac=1, random_state=seed).reset_index(drop=True)
    cluster_size = len(shuffled) // n_clusters

    clusters = []
    for i in range(n_clusters):
        start = i * cluster_size
        # The final cluster absorbs any rows left over by the integer division.
        end = (i + 1) * cluster_size if i != n_clusters - 1 else len(shuffled)
        clusters.append(shuffled.iloc[start:end])

    chosen_idx = rng.choice(range(n_clusters), size=clusters_to_pick, replace=False)
    sampled = pd.concat([clusters[i] for i in chosen_idx])
    return sampled.reset_index(drop=True)


In [13]:
def bootstrap_sampling(data, n_samples=None, seed=42):
    """Draw a bootstrap sample (rows drawn with replacement) from ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to sample from.
    n_samples : int or None, default None
        Number of rows to draw. ``None`` draws ``len(data)`` rows.
    seed : int, default 42
        Passed to ``DataFrame.sample`` as ``random_state``.

    Returns
    -------
    pandas.DataFrame
        The drawn rows (duplicates possible), re-indexed from 0.

    Notes
    -----
    The draw is fully determined by ``random_state=seed``; the function does
    not reseed NumPy's global generator.
    """
    if n_samples is None:
        n_samples = len(data)
    return data.sample(n=n_samples, replace=True, random_state=seed).reset_index(drop=True)


In [14]:
# Draw one sample per technique from the balanced frame. The entries are
# added in a fixed order because that order becomes the column order of the
# results table built later.
samples = {}
samples["Sampling1_SimpleRandom"] = simple_random_sampling(balanced_df, frac=0.3, seed=42)
samples["Sampling2_Stratified"] = stratified_sampling(balanced_df, frac=0.3, seed=42)
samples["Sampling3_Systematic"] = systematic_sampling(balanced_df, step=3)
samples["Sampling4_Cluster"] = cluster_sampling(
    balanced_df, n_clusters=10, clusters_to_pick=3, seed=42
)
# Bootstrap size is 30% of the balanced frame, truncated to an integer.
samples["Sampling5_Bootstrap"] = bootstrap_sampling(
    balanced_df, n_samples=int(0.3*len(balanced_df)), seed=42
)

# Report the (rows, columns) shape of each sample.
for name, samp in samples.items():
    print(name, "->", samp.shape)


Sampling1_SimpleRandom -> (458, 31)
Sampling2_Stratified -> (458, 31)
Sampling3_Systematic -> (509, 31)
Sampling4_Cluster -> (456, 31)
Sampling5_Bootstrap -> (457, 31)


In [15]:
# Candidate classifiers (M1-M5), keyed by the row label used in the results
# table. The tree-based estimators get a fixed random_state: a decision tree
# permutes the candidate features at every split even with default settings,
# so an unseeded tree depends on the state of NumPy's global generator and
# can give different accuracies from one run to the next.
models = {
    "M1_LogisticRegression": LogisticRegression(max_iter=200),
    "M2_DecisionTree": DecisionTreeClassifier(random_state=42),
    "M3_NaiveBayes": GaussianNB(),
    "M4_RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "M5_SVM": SVC()
}


In [16]:
# Accuracy table in percent: one row per model, one column per sample.
# Allocated as float up front, so no dtype conversion is needed afterwards.
results = pd.DataFrame(index=models.keys(), columns=samples.keys(), dtype=float)

for model_name, model in models.items():
    for samp_name, samp_df in samples.items():
        # Features / target for this particular sample.
        ys = samp_df["Class"]
        Xs = samp_df.drop("Class", axis=1)

        # 80/20 hold-out inside the sample, keeping the class proportions.
        # NOTE(review): the samples come from data that was oversampled
        # beforehand, so identical rows may sit on both sides of this split.
        X_train, X_test, y_train, y_test = train_test_split(
            Xs, ys, test_size=0.2, random_state=42, stratify=ys
        )

        # Standardise using statistics from the training part only.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # fit() returns the estimator itself, so prediction can be chained.
        y_pred = model.fit(X_train, y_train).predict(X_test)

        # Store the hold-out accuracy as a percentage.
        acc = accuracy_score(y_test, y_pred) * 100
        results.at[model_name, samp_name] = acc

results.round(2)


Unnamed: 0,Sampling1_SimpleRandom,Sampling2_Stratified,Sampling3_Systematic,Sampling4_Cluster,Sampling5_Bootstrap
M1_LogisticRegression,90.22,86.96,89.22,94.57,95.65
M2_DecisionTree,94.57,97.83,95.1,100.0,98.91
M3_NaiveBayes,76.09,61.96,80.39,64.13,71.74
M4_RandomForest,100.0,98.91,100.0,100.0,100.0
M5_SVM,95.65,94.57,96.08,97.83,97.83


In [19]:
print("Best Sampling Technique for Each Model:\n")

# Row-wise winners: the highest accuracy per model and the column (sampling
# technique) where it occurs. idxmax reports the first column on ties.
best_accuracy_each_model = results.max(axis=1)
best_sampling_each_model = results.idxmax(axis=1)

for m in models:
    winner = best_sampling_each_model[m]
    score = best_accuracy_each_model[m]
    print(f"{m} --> Best: {winner} | Accuracy: {score:.2f}%")


Best Sampling Technique for Each Model:

M1_LogisticRegression --> Best: Sampling5_Bootstrap | Accuracy: 95.65%
M2_DecisionTree --> Best: Sampling4_Cluster | Accuracy: 100.00%
M3_NaiveBayes --> Best: Sampling3_Systematic | Accuracy: 80.39%
M4_RandomForest --> Best: Sampling1_SimpleRandom | Accuracy: 100.00%
M5_SVM --> Best: Sampling4_Cluster | Accuracy: 97.83%


In [20]:
# Flatten the table into a Series indexed by (model, sampling) pairs so a
# single idxmax call yields both labels of the top cell.
# idxmax returns the first maximum it meets; the stacked order is row by
# row, so when several cells tie, the earliest model/sampling pair is the
# one reported.
stacked = results.stack()
best_model, best_sampling = stacked.idxmax()
best_acc = stacked.max()

print("\n Overall Best Combination:")
print("Model:", best_model)
print("Sampling:", best_sampling)
print("Accuracy:", round(best_acc, 2), "%")



 Overall Best Combination:
Model: M2_DecisionTree
Sampling: Sampling4_Cluster
Accuracy: 100.0 %
