In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif

In [11]:
def load_data(train_path, test_path):
    """Read the training and test CSV files into DataFrames.

    Parameters
    ----------
    train_path, test_path
        Anything ``pd.read_csv`` accepts as its first argument (a path or an
        open file-like object).

    Returns
    -------
    tuple
        ``(train, test)`` -- the two frames, in that order.
    """
    # The training file is read first, then the test file.
    train_frame, test_frame = (
        pd.read_csv(source) for source in (train_path, test_path)
    )
    return train_frame, test_frame

In [12]:
def preprocess_data(train, test, target_column='is_fraud'):
    """Turn the raw train/test frames into scaled numeric feature matrices.

    Steps:
      1. Drop rows whose target value is missing.
      2. Split the target column off from the features.
      3. Keep only ``int64``/``float64`` feature columns that exist in both
         frames, in the training frame's column order.
      4. Drop every remaining feature column that has a missing value in
         *either* split.
      5. Standardize with a ``StandardScaler`` fitted on the training split
         only and applied to both splits.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw frames; both must contain ``target_column``.
    target_column : str, default 'is_fraud'
        Name of the label column.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` values are the
        scaler's outputs and the ``y_*`` values are the target Series.

    Raises
    ------
    ValueError
        If no usable numeric feature column is left after filtering.
    """
    train = train.dropna(subset=[target_column])
    test = test.dropna(subset=[target_column])

    y_train = train[target_column]
    y_test = test[target_column]

    numeric_dtypes = ['int64', 'float64']
    x_train = train.drop(columns=[target_column]).select_dtypes(include=numeric_dtypes)
    x_test = test.drop(columns=[target_column]).select_dtypes(include=numeric_dtypes)

    # Build ONE column list and apply it to both splits so they always have the
    # same features in the same order. Missing values are checked on both
    # splits: StandardScaler disregards NaN when fitting and keeps it when
    # transforming, so a gap that exists only in the test split would otherwise
    # flow straight through to the downstream selector and models.
    feature_columns = [
        col for col in x_train.columns
        if col in x_test.columns
        and not x_train[col].isnull().any()
        and not x_test[col].isnull().any()
    ]
    if not feature_columns:
        raise ValueError(
            "No numeric feature column without missing values is shared by the "
            "training and test data."
        )

    # Plain selection instead of in-place drops: the inputs are never mutated.
    x_train = x_train[feature_columns]
    x_test = x_test[feature_columns]

    # Fit on the training split only so no test-set statistics leak into scaling.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    return x_train, y_train, x_test, y_test

def feature_selection(x_train, y_train, k=10):
    """Select the ``k`` highest-scoring features of the training data.

    Features are scored with ``f_classif`` and filtered with ``SelectKBest``.

    Parameters
    ----------
    x_train
        Training feature matrix.
    y_train
        Training labels used for scoring.
    k : default 10
        Passed straight through to ``SelectKBest``.

    Returns
    -------
    tuple
        ``(x_train_selected, selector)`` -- the reduced training matrix and the
        fitted selector, which the caller can reuse to transform other data.
    """
    k_best = SelectKBest(f_classif, k=k)
    reduced_train = k_best.fit_transform(x_train, y_train)
    return reduced_train, k_best

In [13]:
def train_and_evaluate_model(model, x_train, y_train, x_test, y_test, model_name="Model"):
    """Fit ``model`` on the training data and report how it does on the test data.

    Prints the confusion matrix, the text classification report and the
    accuracy for the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    x_train, y_train
        Training features and labels.
    x_test, y_test
        Test features and labels.
    model_name : str, default "Model"
        Label used in the printed heading.

    Returns
    -------
    tuple
        ``(accuracy, report)`` where ``report`` is the classification report in
        dictionary form.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    print(f"\n{model_name} Results:")
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))

    # One accuracy value serves both the printout and the return value.
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy Score:", accuracy)

    report_dict = classification_report(y_test, predictions, output_dict=True)
    return accuracy, report_dict

def main(train_path='fraudTrain.csv', test_path='fraudTest.csv', k=10, random_state=42):
    """Run the whole pipeline: load, preprocess, select features, train, report.

    Parameters
    ----------
    train_path, test_path : default 'fraudTrain.csv' / 'fraudTest.csv'
        CSV locations handed to ``load_data``.
    k : default 10
        Number of features requested from ``feature_selection``.
    random_state : default 42
        Seed given to the estimators so repeated runs print the same numbers.

    Returns
    -------
    dict
        ``{model_name: {"accuracy": ..., "report": ...}}`` for every model
        trained, so the numbers can be inspected after the run instead of only
        being printed.
    """
    train, test = load_data(train_path, test_path)
    x_train, y_train, x_test, y_test = preprocess_data(train, test)

    # Fit the selector on the training split only, then apply the same column
    # choice to the test split.
    x_train_selected, selector = feature_selection(x_train, y_train, k=k)
    x_test_selected = selector.transform(x_test)

    # The tree-based models are randomized; without a fixed seed every
    # Restart-and-Run-All would print different results.
    models = {
        "Logistic Regression": LogisticRegression(random_state=random_state),
        "Random Forest": RandomForestClassifier(random_state=random_state),
        "Decision Tree": DecisionTreeClassifier(random_state=random_state),
    }

    results = {}
    for model_name, model in models.items():
        acc, report = train_and_evaluate_model(
            model, x_train_selected, y_train, x_test_selected, y_test, model_name
        )
        results[model_name] = {"accuracy": acc, "report": report}

    print("\nSummary of Results:")
    for model_name, result in results.items():
        print(f"{model_name} - Accuracy: {result['accuracy']}")
        print(f"Classification Report:\n{result['report']}")

    return results

# Entry point: run the pipeline when this code is executed directly rather
# than imported as a module.
if __name__ == "__main__":
    main()



Logistic Regression Results:
Confusion Matrix:
 [[553222    352]
 [  2145      0]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

Accuracy Score: 0.9955067219224104

Random Forest Results:
Confusion Matrix:
 [[553246    328]
 [  1959    186]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.36      0.09      0.14      2145

    accuracy                           1.00    555719
   macro avg       0.68      0.54      0.57    555719
weighted avg       0.99      1.00      0.99    555719

Accuracy Score: 0.9958846107475181

Decision Tree Results:
Confusion Matrix:
 [[508699  44875]
 [  12