<a href="https://colab.research.google.com/github/nimisha870/Attendance-1/blob/master/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


data = pd.read_csv("https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv")

print("Original Class Distribution:\n", data['Class'].value_counts())


minority_class = data[data['Class'] == 1]
majority_class = data[data['Class'] == 0]

minority_upsampled = resample(
    minority_class,
    replace=True,
    n_samples=len(majority_class),
    random_state=42
)

balanced_data = pd.concat([majority_class, minority_upsampled])
print("\nBalanced Class Distribution:\n", balanced_data['Class'].value_counts())

samples = {}

samples['Sampling1'] = balanced_data.sample(frac=0.5, random_state=42)


samples['Sampling2'], _ = train_test_split(
    balanced_data,
    test_size=0.5,
    stratify=balanced_data['Class'],
    random_state=42
)

clusters = balanced_data.groupby('Time', group_keys=False).apply(
    lambda x: x.sample(1, random_state=42)
)
samples['Sampling3'] = clusters

samples['Sampling4'] = pd.concat([
    majority_class.sample(frac=0.3, random_state=42),
    minority_upsampled.sample(frac=0.7, random_state=42)
])


samples['Sampling5'] = pd.concat([
    majority_class.sample(frac=0.3, random_state=42),
    minority_class.sample(frac=1.0, random_state=42)
])

models = {
    'M1': LogisticRegression(max_iter=1000),
    'M2': RandomForestClassifier(random_state=42),
    'M3': SVC(kernel='linear', random_state=42),
    'M4': DecisionTreeClassifier(random_state=42),
    'M5': KNeighborsClassifier()
}


def train_and_evaluate(X, y):
    results = {}
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[name] = accuracy

    return results

final_results = {}

for name, sample in samples.items():

    if len(sample['Class'].unique()) < 2:
        print(f"Skipping {name} because it contains only one class.")
        continue

    X_sample = sample.drop(columns=['Class'])
    y_sample = sample['Class']


    results = train_and_evaluate(X_sample, y_sample)
    final_results[name] = results

final_results_df = pd.DataFrame(final_results)
print("\nFinal Results:\n", final_results_df)

best_sampling_techniques = {}

for model in models.keys():

    model_accuracies = final_results_df.loc[model]


    best_sampling_techniques[model] = model_accuracies.idxmax(), model_accuracies.max()


print("\nBest Sampling Technique for Each Model:\n")
for model, (best_sampling, accuracy) in best_sampling_techniques.items():
    print(f"Model: {model}, Best Sampling Technique: {best_sampling}, Accuracy: {accuracy:.4f}")


final_results_df.to_csv("sampling_results.csv", index=True)
print("Results saved to 'sampling_results.csv'")

Original Class Distribution:
 Class
0    763
1      9
Name: count, dtype: int64

Balanced Class Distribution:
 Class
0    763
1    763
Name: count, dtype: int64


  clusters = balanced_data.groupby('Time', group_keys=False).apply(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.ht


Final Results:
     Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1   0.921397   0.908297      0.976   0.917031   0.958333
M2   1.000000   1.000000      0.976   0.995633   0.958333
M3   0.925764   0.917031      0.976   0.943231   0.944444
M4   0.982533   0.991266      0.936   0.973799   0.930556
M5   0.951965   0.943231      0.976   0.934498   0.958333

Best Sampling Technique for Each Model:

Model: M1, Best Sampling Technique: Sampling3, Accuracy: 0.9760
Model: M2, Best Sampling Technique: Sampling1, Accuracy: 1.0000
Model: M3, Best Sampling Technique: Sampling3, Accuracy: 0.9760
Model: M4, Best Sampling Technique: Sampling2, Accuracy: 0.9913
Model: M5, Best Sampling Technique: Sampling3, Accuracy: 0.9760
Results saved to 'sampling_results.csv'
