# Fraud Detection Analysis and Modeling

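This notebook walks through an end-to-end fraud-detection workflow on the provided transaction dataset: loading the data, exploratory analysis, preprocessing, model comparison, and generating a submission file.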

## 1. Imports and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_recall_curve
from xgboost import XGBClassifier

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for all subsequent plots.
sns.set(style='whitegrid')

## 2. Data Loading

In [None]:
# Read the training and test CSVs (paths are relative to the working directory).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('training.csv', 'test.csv'))

# Report each frame's shape, then preview the first training rows.
print(f'Training shape: {train_df.shape}')
print(f'Test shape: {test_df.shape}')
train_df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Tally missing entries per column of the training frame
print(train_df.isna().sum())

In [None]:
# Class balance: bar chart of label counts, followed by the label proportions
fig, ax = plt.subplots()
sns.countplot(data=train_df, x='FraudResult', ax=ax)
ax.set_title('Class Distribution')
plt.show()

print(train_df['FraudResult'].value_counts(normalize=True))

In [None]:
# Amount histograms: all transactions (left) vs. rows with FraudResult == 1 (right)
fig, (ax_all, ax_fraud) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(train_df['Amount'], bins=50, kde=True, ax=ax_all)
ax_all.set_title('Distribution of Amount')

fraud_amounts = train_df.loc[train_df['FraudResult'] == 1, 'Amount']
sns.histplot(fraud_amounts, bins=50, kde=True, color='red', ax=ax_fraud)
ax_fraud.set_title('Distribution of Amount (Fraud Only)')
plt.show()

## 4. Preprocessing

In [None]:
# Convert TransactionStartTime to datetime so calendar features can be derived from it
train_df['TransactionStartTime'] = pd.to_datetime(train_df['TransactionStartTime'])
test_df['TransactionStartTime'] = pd.to_datetime(test_df['TransactionStartTime'])

# Extract calendar features on both frames so train and test end up with the same columns
for df in [train_df, test_df]:
    df['Hour'] = df['TransactionStartTime'].dt.hour
    df['Day'] = df['TransactionStartTime'].dt.day
    df['Month'] = df['TransactionStartTime'].dt.month
    df['Weekday'] = df['TransactionStartTime'].dt.weekday

# Drop columns that might not be useful or cause leakage
cols_to_drop = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'TransactionStartTime']
# Note: Keeping ProviderId, ProductId, etc. for encoding

X = train_df.drop(columns=['FraudResult'] + cols_to_drop)
y = train_df['FraudResult']
X_test_submit = test_df.drop(columns=cols_to_drop)

# Fail early if the test frame lacks any feature the models will be trained on.
missing_in_test = X.columns.difference(X_test_submit.columns)
assert missing_in_test.empty, f'Test set is missing feature columns: {list(missing_in_test)}'

# Identify categorical and numerical columns.
# Select by kind rather than by exact dtype: on pandas >= 2.0 the .dt field
# accessors above return int32, so an ['int64', 'float64'] filter would leave
# Hour/Day/Month/Weekday out of both lists and the ColumnTransformer would
# silently drop them. Using 'number' and its complement assigns every column
# to exactly one group.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

print(f'Categorical: {categorical_cols}')
print(f'Numerical: {numerical_cols}')

In [None]:
# Preprocessing Pipeline
# Each branch imputes before transforming so a missing value in either the
# training or the test data cannot reach an estimator that rejects NaN.
# On complete data the imputers leave the values unchanged.

# Numeric features: fill gaps with the column median, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with an explicit 'missing' level, then
# one-hot encode; categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

## 5. Modeling

In [None]:
# Split data (stratified so both partitions keep the same fraud rate)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Positive-class weight for XGBoost, computed from the training labels as
# (#negatives / #positives) instead of a hard-coded constant, so it follows
# the actual class ratio of the data. max(..., 1) avoids a division by zero.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
pos_weight = n_negative / max(n_positive, 1)
print(f'scale_pos_weight: {pos_weight:.1f}')

# Define models
# `use_label_encoder` is no longer passed to XGBClassifier: the argument is
# deprecated in current XGBoost releases.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    'XGBoost': XGBClassifier(eval_metric='logloss', scale_pos_weight=pos_weight, random_state=42)  # Handling imbalance
}

# Validation metrics per model name: {'F1': ..., 'AUC': ...}
results = {}

for name, model in models.items():
    # Fit preprocessing and classifier together so the scaler/encoder only
    # ever see the training partition.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    # Probability of the positive class, used for the ranking-based AUC metric
    y_prob = clf.predict_proba(X_val)[:, 1]

    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_prob)

    results[name] = {'F1': f1, 'AUC': auc}
    print(f'--- {name} ---')
    print(classification_report(y_val, y_pred))
    print(f'AUC: {auc:.4f}')

print(results)

## 6. Submission

In [None]:
# Select the model with the highest validation F1 and refit it on the full training data
best_model_name = max(results.items(), key=lambda entry: entry[1]['F1'])[0]
print(f'Best Model: {best_model_name}')

final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', models[best_model_name]),
]).fit(X, y)

# Predict labels for the test set
test_preds = final_model.predict(X_test_submit)

# Write each TransactionId alongside its predicted label to the submission file
submission = test_df[['TransactionId']].assign(FraudResult=test_preds)
submission.to_csv('submission.csv', index=False)
print('Submission saved to submission.csv')