In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Function to process chunks of data
def process_chunk(chunk):
    chunk['trans_date_trans_time'] = pd.to_datetime(chunk['trans_date_trans_time'])
    chunk['dob'] = pd.to_datetime(chunk['dob'])
    
    chunk['trans_year'] = chunk['trans_date_trans_time'].dt.year
    chunk['trans_month'] = chunk['trans_date_trans_time'].dt.month
    chunk['trans_day'] = chunk['trans_date_trans_time'].dt.day
    chunk['trans_hour'] = chunk['trans_date_trans_time'].dt.hour
    chunk['trans_minute'] = chunk['trans_date_trans_time'].dt.minute
    chunk['trans_second'] = chunk['trans_date_trans_time'].dt.second
    
    chunk['age'] = (chunk['trans_date_trans_time'] - chunk['dob']).dt.days // 365
    chunk['distance'] = np.sqrt((chunk['lat'] - chunk['merch_lat'])**2 + (chunk['long'] - chunk['merch_long'])**2)
    
    return chunk

# Feature selection
features = ['amt', 'gender', 'lat', 'long', 'city_pop', 'trans_year', 'trans_month', 'trans_day', 'trans_hour', 'trans_minute', 'trans_second', 'age', 'distance']

# Load and process data in chunks and sample 10%
def load_and_sample_data(file_path, chunksize=10000, sample_frac=0.1):
    chunk_list = []
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        chunk = process_chunk(chunk)
        chunk = chunk[features + ['is_fraud']]
        chunk_list.append(chunk.sample(frac=sample_frac, random_state=42))
    return pd.concat(chunk_list, axis=0)

# Load and process data with 10% sampling
train_data = load_and_sample_data('fraudTrain.csv')
test_data = load_and_sample_data('fraudTest.csv')

# Encoding categorical variables
train_data = pd.get_dummies(train_data, columns=['gender'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['gender'], drop_first=True)

# Splitting the data
X_train = train_data.drop('is_fraud', axis=1)
y_train = train_data['is_fraud']
X_test = test_data.drop('is_fraud', axis=1)
y_test = test_data['is_fraud']

# Standardizing the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [2]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr))

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
print("Decision Tree Performance:")
print(classification_report(y_test, y_pred_dt))

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))


Logistic Regression Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55333
           1       0.00      0.00      0.00       239

    accuracy                           1.00     55572
   macro avg       0.50      0.50      0.50     55572
weighted avg       0.99      1.00      0.99     55572

Decision Tree Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55333
           1       0.38      0.51      0.43       239

    accuracy                           0.99     55572
   macro avg       0.69      0.75      0.72     55572
weighted avg       1.00      0.99      0.99     55572

Random Forest Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55333
           1       0.91      0.30      0.45       239

    accuracy                           1.00     55572
   macro avg       0.95      0.65      0.

In [None]:
import joblib

# Save the best model (assuming Random Forest is the best based on performance)
joblib.dump(rf_model, 'fraud_detection_model.pkl')
