In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# 1. Load & Preprocess the Dataset
# Replace 'onlinefraud.csv' with the actual path if needed.
df = pd.read_csv('onlinefraud.csv')

# Inspect for missing values and handle them if necessary.
print("Missing values per column before imputation:")
print(df.isnull().sum())

# Missing-value handling, in three steps:
#   1. Drop rows that have no label. A row without 'isFraud' cannot be used
#      for supervised training, and forward-filling it would silently copy
#      the previous row's label onto it.
#   2. Forward-fill any remaining gaps in the other columns. DataFrame.ffill()
#      is the supported replacement for the deprecated
#      fillna(method='ffill').
#   3. Cast the label to int. A NaN in the column makes pandas read it as
#      float, so the classes would otherwise be reported as 0.0 / 1.0.
df = (
    df.dropna(subset=['isFraud'])
      .ffill()
      .astype({'isFraud': int})
)

# Convert the categorical 'type' column to integer codes using LabelEncoder.
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

# 2. Feature Engineering
# Add log(1 + amount) as a derived column next to the raw transaction amount.
df['log_amount'] = np.log1p(df['amount'])

# Name of the label column and the list of model input columns.
target = 'isFraud'
features = ['amount', 'type', 'log_amount']

# Label vector and feature matrix used by the steps below.
y = df[target]
X = df[features]

# 3. Split the Dataset into Training and Testing Sets
# The positive (fraud) class is a tiny fraction of the rows, so stratify on
# the label: this keeps the fraud ratio identical in the train and test
# partitions instead of leaving it to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# 4. Train a Decision Tree Classifier
# Baseline model with default hyperparameters; random_state fixes tie-breaking
# so the run is reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Evaluate initial model performance on the held-out test set.
y_pred = dt.predict(X_test)
print("Initial Model Evaluation:")
print(classification_report(y_test, y_pred))

# 5. Hyperparameter Tuning using GridSearchCV
# Candidate values for tree depth and the minimum sample counts required to
# split a node / form a leaf.
param_grid = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold cross-validated search, ranked by F1, using all cores.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_, sep="\n")

# Evaluate the tuned model on the held-out test set.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("Evaluation of Tuned Model:", classification_report(y_test, y_pred_best), sep="\n")


Missing values per column before imputation:
step              0
type              0
amount            0
nameOrig          1
oldbalanceOrg     1
newbalanceOrig    1
nameDest          1
oldbalanceDest    1
newbalanceDest    1
isFraud           1
isFlaggedFraud    1
dtype: int64


  df.fillna(method='ffill', inplace=True)


Initial Model Evaluation:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   1123964
         1.0       0.15      0.15      0.15       914

    accuracy                           1.00   1124878
   macro avg       0.57      0.58      0.57   1124878
weighted avg       1.00      1.00      1.00   1124878

Best parameters found:
{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Evaluation of Tuned Model:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   1123964
         1.0       0.72      0.17      0.28       914

    accuracy                           1.00   1124878
   macro avg       0.86      0.59      0.64   1124878
weighted avg       1.00      1.00      1.00   1124878

