1. **Dataset**

In [19]:
import pandas as pd

url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)

data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [20]:
data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,763
1,9


 2. **BALANCE THE DATASET USING SMOTE**

In [36]:
X = data.drop('Class', axis=1)
y = data['Class']

In [38]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

3. **CREATE FIVE SAMPLES**

In [40]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_scaled, y)

balanced_df = pd.DataFrame(X_balanced)
balanced_df['Class'] = y_balanced

1. Simple Random Sampling

In [41]:
sample1 = balanced_df.sample(frac=0.5, random_state=42)

2. Systematic Sampling

In [26]:
k = 5
sample2 = balanced_df.iloc[::k, :]

3. Stratified Sampling

In [27]:
from sklearn.model_selection import train_test_split

X_temp = balanced_df.drop('Class', axis=1)
y_temp = balanced_df['Class']

X_strat, _, y_strat, _ = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

sample3 = X_strat.copy()
sample3['Class'] = y_strat

4. Cluster Sampling

In [28]:
balanced_df['Cluster'] = balanced_df.index % 10

selected_clusters = np.random.choice(
    balanced_df['Cluster'].unique(), size=5, replace=False
)

sample4 = balanced_df[balanced_df['Cluster'].isin(selected_clusters)]
sample4 = sample4.drop('Cluster', axis=1)

5. Multistage Sampling

In [29]:
stage1_clusters = np.random.choice(
    balanced_df['Cluster'].unique(), size=6, replace=False
)

stage1_data = balanced_df[balanced_df['Cluster'].isin(stage1_clusters)]

sample5 = stage1_data.sample(frac=0.5, random_state=42)
sample5 = sample5.drop('Cluster', axis=1)

4. **APPLY FIVE MODELS ON FIVE SAMPLES**

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

models = {
    "M1_LogisticRegression": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(),
    "M3_RandomForest": RandomForestClassifier(n_estimators=100),
    "M4_SVM": SVC(),
    "M5_NaiveBayes": GaussianNB()
}

In [32]:
sampling_methods = {
    "SimpleRandom": sample1,
    "Systematic": sample2,
    "Stratified": sample3,
    "Cluster": sample4,
    "Multistage": sample5
}

In [33]:
from sklearn.metrics import accuracy_score

results = pd.DataFrame(index=models.keys(), columns=sampling_methods.keys())

for samp_name, samp_df in sampling_methods.items():

    X_samp = samp_df.drop('Class', axis=1)
    y_samp = samp_df['Class']

    X_train, X_test, y_train, y_test = train_test_split(
        X_samp, y_samp, test_size=0.3, random_state=42
    )

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds) * 100
        results.loc[model_name, samp_name] = round(acc, 2)


5. **RESULT & CONCLUSION**

In [34]:
results

Unnamed: 0,SimpleRandom,Systematic,Stratified,Cluster,Multistage
M1_LogisticRegression,88.21,78.26,90.83,94.32,91.3
M2_DecisionTree,95.63,89.13,97.38,96.07,93.48
M3_RandomForest,99.56,95.65,99.13,100.0,98.55
M4_SVM,95.2,90.22,97.38,98.69,94.93
M5_NaiveBayes,71.18,67.39,75.11,75.98,69.57


In [35]:
results = results.astype(float)
best_sampling = {}

for model in results.index:
    best_method = results.loc[model].idxmax()
    best_accuracy = results.loc[model].max()
    best_sampling[model] = (best_method, best_accuracy)

for model, (method, acc) in best_sampling.items():
    print(f"{model} gives highest accuracy with {method}: {acc}%")


M1_LogisticRegression gives highest accuracy with Cluster: 94.32%
M2_DecisionTree gives highest accuracy with Stratified: 97.38%
M3_RandomForest gives highest accuracy with Cluster: 100.0%
M4_SVM gives highest accuracy with Cluster: 98.69%
M5_NaiveBayes gives highest accuracy with Cluster: 75.98%
