In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import numpy as np

In [3]:
url_path = "Creditcard_data.csv"

# Load dataset using pandas
data_frame = pd.read_csv(url_path)

# Separate features and target column
features = data_frame.drop(columns=["Class"])
target = data_frame["Class"]

# Apply SMOTE for balancing the dataset
smote_handler = SMOTE(random_state=42)
features_balanced, target_balanced = smote_handler.fit_resample(features, target)

# Define sampling functions
def random_sampling_method(features_set, target_set, sample_size):
    return resample(features_set, target_set, n_samples=sample_size, random_state=42)

def stratified_sampling_method(features_set, target_set, sample_size):
    from sklearn.model_selection import StratifiedShuffleSplit
    stratified_splitter = StratifiedShuffleSplit(n_splits=1, test_size=sample_size / len(target_set), random_state=42)
    for train_indices, _ in stratified_splitter.split(features_set, target_set):
        return features_set.iloc[train_indices], target_set.iloc[train_indices]

# Define sample sizes
sample_fractions = [int(len(features_balanced) * 0.1 * i) for i in range(1, 6)]

# Create sampled datasets
sampled_sets = {
    "Sample_1": random_sampling_method(features_balanced, target_balanced, sample_fractions[0]),
    "Sample_2": random_sampling_method(features_balanced, target_balanced, sample_fractions[1]),
    "Sample_3": stratified_sampling_method(features_balanced, target_balanced, sample_fractions[2]),
    "Sample_4": random_sampling_method(features_balanced, target_balanced, sample_fractions[3]),
    "Sample_5": stratified_sampling_method(features_balanced, target_balanced, sample_fractions[4]),
}

# Define machine learning models
ml_models = {
    "Logistic_Regression": LogisticRegression(),
    "Random_Forest": RandomForestClassifier(),
    "Decision_Tree": DecisionTreeClassifier(),
    "Gaussian_NB": GaussianNB(),
    "Support_Vector_Machine": SVC()
}

# Create a DataFrame to store results
results_table = pd.DataFrame(columns=["Sample", "Model", "Accuracy"])

# Train and evaluate models
for sample_key, (X_sample, y_sample) in sampled_sets.items():
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)
    for model_key, model_instance in ml_models.items():
        model_instance.fit(X_train, y_train)
        y_predictions = model_instance.predict(X_test)
        model_accuracy = accuracy_score(y_test, y_predictions)
        results_table = pd.concat([results_table, pd.DataFrame([{"Sample": sample_key, "Model": model_key, "Accuracy": model_accuracy}])], ignore_index=True)

# Print the results
print(results_table)

# Save the results to a CSV file
results_table.to_csv("results_summary.csv", index=False)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  results_table = pd.concat([results_table, pd.DataFrame([{"Sample": sample_key, "Model": model_key, "Accuracy": model_accuracy}])], ignore_index=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scal

      Sample                   Model  Accuracy
0   Sample_1     Logistic_Regression  0.903226
1   Sample_1           Random_Forest  0.967742
2   Sample_1           Decision_Tree  0.903226
3   Sample_1             Gaussian_NB  0.903226
4   Sample_1  Support_Vector_Machine  0.709677
5   Sample_2     Logistic_Regression  0.950820
6   Sample_2           Random_Forest  1.000000
7   Sample_2           Decision_Tree  0.967213
8   Sample_2             Gaussian_NB  0.803279
9   Sample_2  Support_Vector_Machine  0.639344
10  Sample_3     Logistic_Regression  0.920561
11  Sample_3           Random_Forest  1.000000
12  Sample_3           Decision_Tree  0.976636
13  Sample_3             Gaussian_NB  0.789720
14  Sample_3  Support_Vector_Machine  0.700935
15  Sample_4     Logistic_Regression  0.909836
16  Sample_4           Random_Forest  0.983607
17  Sample_4           Decision_Tree  0.967213
18  Sample_4             Gaussian_NB  0.770492
19  Sample_4  Support_Vector_Machine  0.598361
20  Sample_5 