In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc

# Read the processed fraud dataset from disk (path is relative to this notebook).
csv_path = '../data/processed/df_fraud_processed.csv'
df = pd.read_csv(csv_path)

# Preview the first rows as a sanity check on the load.
df.head()


In [None]:
# Name of the label column -- change this to match your dataset.
target_col = 'class'

# Columns excluded from the feature matrix (example list -- adjust as needed).
drop_cols = [
    'user_id',
    'signup_time',
    'purchase_time',
    'device_id',
    'ip_address',
]

# Features: every remaining column except the label, with missing values
# replaced by 0 (a deliberately simple fill; do any encoding here if needed).
X = df.drop(columns=[*drop_cols, target_col]).fillna(0)

# Labels.
y = df[target_col]


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions aligned across both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise features for Logistic Regression. The scaler is fitted on the
# training partition only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Logistic regression trained on the standardised features;
# class_weight='balanced' reweights classes to offset label imbalance.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
lr.fit(X_train_scaled, y_train)


In [None]:
# Random forest trained on the unscaled features, with the same
# class_weight='balanced' setting and seed as the logistic regression.
rf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
)
rf.fit(X_train, y_train)


In [None]:
def evaluate_model(model, X_test, y_test, scaled=False):
    """Print a confusion matrix, classification report and AUC-PR for a fitted classifier.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X_test
        Test features, passed to the model unchanged. The caller is
        responsible for supplying features in the form the model was trained
        on (for example, already scaled).
    y_test
        True labels for ``X_test``.
    scaled : bool, default False
        Kept only for backward compatibility with existing calls. It has no
        effect: this function never transforms ``X_test``.

    Returns
    -------
    None
        All results are printed.
    """
    # Column 1 of predict_proba is used as the positive-class score
    # (assumes a binary problem whose positive label is the second class -- TODO confirm).
    preds_proba = model.predict_proba(X_test)[:, 1]
    preds = model.predict(X_test)

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, preds))
    print("\nClassification Report:")
    print(classification_report(y_test, preds))

    # Area under the precision-recall curve, integrated over the curve points
    # with sklearn.metrics.auc.
    precision, recall, _ = precision_recall_curve(y_test, preds_proba)
    auc_pr = auc(recall, precision)
    print(f"AUC-PR: {auc_pr:.4f}")

# Evaluate each fitted model on the test features matching how it was trained:
# logistic regression on the scaled matrix, random forest on the raw one.
evaluation_runs = (
    ("Logistic Regression Performance:", lr, X_test_scaled, True),
    ("\nRandom Forest Performance:", rf, X_test, False),
)
for heading, fitted_model, test_features, is_scaled in evaluation_runs:
    print(heading)
    evaluate_model(fitted_model, test_features, y_test, scaled=is_scaled)
