In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

In [2]:
data_url = "Creditcard_data.csv"
dataset = pd.read_csv(data_url)

In [3]:
X = dataset.drop("Class", axis=1)
y = dataset["Class"]
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

In [4]:
def create_samples(X, y, sample_size):
    samples = []
    for _ in range(5):
        X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, random_state=np.random.randint(100))
        samples.append((X_sample, y_sample))
    return samples
sample_size = 0.2 
samples = create_samples(X_balanced, y_balanced, sample_size)

In [5]:
sampling_techniques = {
    "Sampling1": lambda X, y: (X, y),
    "Sampling2": RandomUnderSampler(random_state=42).fit_resample,
    "Sampling3": SMOTE(random_state=42).fit_resample,
    "Sampling4": SMOTEENN(random_state=42).fit_resample,
    "Sampling5": lambda X, y: (X.sample(frac=0.8, random_state=42), y.sample(frac=0.8, random_state=42)),
}

In [6]:
models = {
    "M1": RandomForestClassifier(random_state=42),
    "M2": LogisticRegression(max_iter=1000, random_state=42),
    "M3": DecisionTreeClassifier(random_state=42),
    "M4": SVC(random_state=42),
    "M5": KNeighborsClassifier(),
}

In [7]:
results = pd.DataFrame(index=models.keys(), columns=sampling_techniques.keys())
for sample_idx, (X_sample, y_sample) in enumerate(samples):
    for sampling_name, sampling_function in sampling_techniques.items():
        X_resampled, y_resampled = sampling_function(X_sample, y_sample)
        for model_name, model in models.items():
            X_train, X_test, y_train, y_test = train_test_split(
                X_resampled, y_resampled, test_size=0.3, random_state=42
            )
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            if pd.isna(results.loc[model_name, sampling_name]):
                results.loc[model_name, sampling_name] = 0
            results.loc[model_name, sampling_name] += accuracy / len(samples)

In [9]:
best_sampling = results.idxmax(axis=1)
best_sampling_with_accuracy = {model: (technique, results.loc[model, technique]) for model, technique in best_sampling.items()}
print("Best Sampling Technique and Accuracy for Each Model:")
for model, (technique, accuracy) in best_sampling_with_accuracy.items():
    print(f"{model}: {technique} with accuracy {accuracy:.2f}")

Best Sampling Technique and Accuracy for Each Model:
M1: Sampling1 with accuracy 0.99
M2: Sampling4 with accuracy 0.94
M3: Sampling3 with accuracy 0.95
M4: Sampling4 with accuracy 0.78
M5: Sampling4 with accuracy 0.93


In [10]:
results.to_csv("samplingmodelaccuracyresults.csv")