In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def load_data(path="labeled_transactions.csv", sample_size=30000, random_state=42):
    """Load the labeled transactions CSV and return a model-ready DataFrame.

    The frame is randomly down-sampled to at most ``sample_size`` rows,
    the identifier/date columns are dropped, the target column
    ``Transaction_Value_Label`` is encoded as ``High -> 1`` / ``Low -> 0``,
    and the remaining categorical columns are one-hot encoded with the
    first level of each dropped.

    Args:
        path: CSV file to read. Defaults to the original hard-coded file.
        sample_size: Upper bound on the number of rows kept. When the file
            has fewer rows, all of them are kept (in shuffled order).
        random_state: Seed passed to ``DataFrame.sample`` so the sample is
            reproducible.

    Returns:
        A DataFrame containing the encoded features plus the integer
        ``Transaction_Value_Label`` column.

    Raises:
        ValueError: If the label column contains anything other than
            ``'High'`` or ``'Low'`` (including missing values).
    """
    df = pd.read_csv(path)

    # Never ask for more rows than the file actually has.
    n_rows = min(sample_size, len(df))
    df = df.sample(n=n_rows, random_state=random_state).reset_index(drop=True)

    # Drop non-informative columns
    df = df.drop(columns=['Transaction_ID', 'User_Name', 'Transaction_Date'])

    # Encode label. Series.map would silently turn unexpected labels into
    # NaN, which only surfaces later as an obscure estimator error, so
    # reject them here with an explicit message instead.
    label_map = {'High': 1, 'Low': 0}
    labels = df['Transaction_Value_Label']
    unexpected = labels[~labels.isin(list(label_map))]
    if not unexpected.empty:
        bad_values = sorted({str(value) for value in unexpected.unique()})
        raise ValueError(
            "Unexpected values in 'Transaction_Value_Label': "
            f"{bad_values}; expected one of {list(label_map)}"
        )
    df['Transaction_Value_Label'] = labels.map(label_map)

    # One-hot encode categorical features
    df = pd.get_dummies(df, drop_first=True)

    return df


# End-to-end training script: load the data, build scaled + PCA-reduced
# features, train a Random Forest and a KNN classifier, report hold-out and
# cross-validated accuracy, and persist the fitted models and transformers.
df = load_data()
print(df.shape)
X = df.drop("Transaction_Value_Label", axis=1)
y = df["Transaction_Value_Label"]

# Split BEFORE fitting any transformer so that statistics of the test rows
# (means, variances, principal axes) cannot leak into the training features.
# Stratify to keep the class ratio identical in both parts, matching the
# stratified cross-validation used further down.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature Scaling (fit on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# PCA for dimensionality reduction (optional, can be tuned)
pca = PCA(n_components=0.95)  # Retain 95% of the (training) variance
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Full-dataset projections through the train-fitted transformers. They are
# kept only so later cells that reference these names keep working; they are
# not used for any evaluation below.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

# Train Random Forest with more regularization (adjusted parameters)
rf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=20, min_samples_leaf=10, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)
print("Random Forest Accuracy:", rf_acc)
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, rf_pred))
print("Random Forest Report:\n", classification_report(y_test, rf_pred))

# Train KNN with a larger neighbourhood (smoother decision boundary)
knn = KNeighborsClassifier(n_neighbors=30)  # Increased n_neighbors to reduce overfitting
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)
print("KNN Accuracy:", knn_acc)
print("KNN Confusion Matrix:\n", confusion_matrix(y_test, knn_pred))
print("KNN Report:\n", classification_report(y_test, knn_pred))

# Cross-validation with Stratified K-Folds for a more robust performance
# estimate. Scaling and PCA live inside the pipeline so they are re-fit on
# the training part of every fold; running CV on pre-transformed features
# would leak each validation fold into the preprocessing.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# clone() gives unfitted copies with the same hyper-parameters, so the
# models fitted above (and saved below) are left untouched.
rf_pipeline = make_pipeline(StandardScaler(), PCA(n_components=0.95), clone(rf))
rf_cv = cross_val_score(rf_pipeline, X, y, cv=cv, scoring='accuracy')
print("Random Forest Cross-Validation Accuracy:", np.mean(rf_cv))

knn_pipeline = make_pipeline(StandardScaler(), PCA(n_components=0.95), clone(knn))
knn_cv = cross_val_score(knn_pipeline, X, y, cv=cv, scoring='accuracy')
print("KNN Cross-Validation Accuracy:", np.mean(knn_cv))

# Save models together with the transformers they were trained behind; new
# data must go through scaler.transform -> pca.transform before predict.
joblib.dump(rf, "rf_model.pkl")
joblib.dump(knn, "knn_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(pca, "pca.pkl")


(6239, 24)
Random Forest Accuracy: 0.9110576923076923
Random Forest Confusion Matrix:
 [[840  37]
 [ 74 297]]
Random Forest Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94       877
           1       0.89      0.80      0.84       371

    accuracy                           0.91      1248
   macro avg       0.90      0.88      0.89      1248
weighted avg       0.91      0.91      0.91      1248

KNN Accuracy: 0.7796474358974359
KNN Confusion Matrix:
 [[875   2]
 [273  98]]
KNN Report:
               precision    recall  f1-score   support

           0       0.76      1.00      0.86       877
           1       0.98      0.26      0.42       371

    accuracy                           0.78      1248
   macro avg       0.87      0.63      0.64      1248
weighted avg       0.83      0.78      0.73      1248

Random Forest Cross-Validation Accuracy: 0.9089600939691156
KNN Cross-Validation Accuracy: 0.7789711975407645


['pca.pkl']