In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [41]:
# Load the dataset
df = pd.read_csv("fraud_detection_updated.csv")

In [42]:
# Inspect the dataset
print(df.head())
print(df.info())
print(df.isnull().sum())

   TransactionID   Amount  MerchantID  TransactionType  Location  IsFraud  \
0              1  4189.27         688                1         7        0   
1              2  2659.71         109                1         1        0   
2              3   784.00         394                0         4        0   
3              4  3514.40         944                0         5        0   
4              5   369.07         475                0         6        0   

   Amount_per_Type  Transaction_Frequency  
0      2493.753424                      1  
1      2493.753424                      1  
2      2500.449450                      1  
3      2500.449450                      1  
4      2500.449450                      1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   TransactionID          100000 non-null  int64  
 1   

In [43]:
# Preprocessing
# Handle missing values (if any)
df.fillna(0, inplace=True)

In [44]:
# Convert categorical variables using Label Encoding
label_encoder = LabelEncoder()
df['TransactionType'] = label_encoder.fit_transform(df['TransactionType'])

In [45]:
# Feature Engineering
# Create new features (e.g., transaction amount per type)
df['Amount_per_Type'] = df.groupby('TransactionType')['Amount'].transform('mean')
df['Transaction_Frequency'] = df.groupby('TransactionID')['TransactionID'].transform('count')

In [46]:
# Split the dataset into features (X) and target (y)
X = df.drop(['TransactionID', 'IsFraud'], axis=1)
y = df['IsFraud']

In [47]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [48]:
# Drop irrelevant columns (e.g., Transaction ID or timestamps)
# Assuming there's a timestamp column, drop it
if 'Timestamp' in df.columns:
    df.drop('Timestamp', axis=1, inplace=True)

# Convert categorical variables using Label Encoding
label_encoder = LabelEncoder()
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

In [35]:
# Check if the 'TransactionDate' column exists
if 'TransactionDate' in df.columns:
    # Drop the 'TransactionDate' column
    df.drop('TransactionDate', axis=1, inplace=True)
    print("'TransactionDate' column has been dropped.")
else:
    print("'TransactionDate' column does not exist in the dataset.")

# Save the updated dataset to a new CSV file (or overwrite the existing file)
df.to_csv("fraud_detection_updated.csv", index=False)
print("Updated dataset saved to 'fraud_detection_updated.csv'.")

'TransactionDate' column has been dropped.
Updated dataset saved to 'fraud_detection_updated.csv'.


In [49]:
# Train a Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

In [50]:
# Evaluate the model
y_pred = dt_classifier.predict(X_test)

In [51]:
print("Decision Tree Classifier Performance:")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1-Score: {f1_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Decision Tree Classifier Performance:
Precision: 0.005763688760806916
Recall: 0.006369426751592357
F1-Score: 0.006051437216338881
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     29686
           1       0.01      0.01      0.01       314

    accuracy                           0.98     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.98      0.98      0.98     30000



In [52]:
# Hyperparameter Tuning using GridSearchCV
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

In [53]:
# Best parameters and model
print("Best Parameters:", grid_search.best_params_)
best_dt_classifier = grid_search.best_estimator_

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [54]:
# Evaluate the tuned model
y_pred_tuned = best_dt_classifier.predict(X_test)

In [55]:
print("Tuned Decision Tree Classifier Performance:")
print(f"Precision: {precision_score(y_test, y_pred_tuned)}")
print(f"Recall: {recall_score(y_test, y_pred_tuned)}")
print(f"F1-Score: {f1_score(y_test, y_pred_tuned)}")
print(classification_report(y_test, y_pred_tuned))

Tuned Decision Tree Classifier Performance:
Precision: 0.005763688760806916
Recall: 0.006369426751592357
F1-Score: 0.006051437216338881
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     29686
           1       0.01      0.01      0.01       314

    accuracy                           0.98     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.98      0.98      0.98     30000

