In [None]:
import pandas as pd

# Read the transaction records from the CSV file in the working directory
csv_path = 'fraudTest.csv'
data = pd.read_csv(csv_path)

# Preview the top of the frame (head() returns the first five rows by default)
preview = data.head()
print(preview)

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2020-06-21 12:14:25  2291163933867244   
1           1   2020-06-21 12:14:33  3573030041201292   
2           2   2020-06-21 12:14:53  3598215285024754   
3           3   2020-06-21 12:15:15  3591919803438423   
4           4   2020-06-21 12:15:17  3526826139003047   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F             3638 Marsh Union  ...  40.3207 

In [None]:
# Parse the raw transaction timestamp once and keep it as a standalone Series
ts = pd.to_datetime(data['trans_date_trans_time'])

# Derive calendar features from the parsed timestamp, then discard the
# original timestamp column so only the derived features remain.
data = data.assign(
    hour=ts.dt.hour,
    day_of_week=ts.dt.dayofweek,  # Monday=0 ... Sunday=6
    month=ts.dt.month,
).drop(columns='trans_date_trans_time')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Expand the low-cardinality text columns into indicator columns; the first
# level of each column is dropped and acts as the implicit baseline.
low_cardinality = ['category', 'gender']
data = pd.get_dummies(data, columns=low_cardinality, drop_first=True)

# High-cardinality columns (e.g., merchant) are mapped to integer codes instead.
# The fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder()
label_encoder.fit(data['merchant'])
data['merchant'] = label_encoder.transform(data['merchant'])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the transaction amount with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on every row of `data`, i.e. before any
# train/test split is made later in the notebook.
scaler = StandardScaler()
amount_frame = data[['amt']]
amt_scaled = scaler.fit_transform(amount_frame)
data['amt'] = amt_scaled

In [None]:
# Print column names to verify
print("Columns in the dataset:", data.columns)

# Drop identifier / free-text columns (only if they exist).
# 'first', 'last' and 'street' are raw strings in the loaded CSV; if they are
# left in the frame, X is not fully numeric and SMOTE cannot resample it when
# the notebook is run top to bottom on a fresh kernel.
drop_candidates = ['Unnamed: 0', 'first', 'last', 'street', 'city', 'state', 'zip',
                   'job', 'dob', 'trans_num']
columns_to_drop = [col for col in drop_candidates if col in data.columns]

if columns_to_drop:
    data = data.drop(columns=columns_to_drop)
    print("Dropped columns:", columns_to_drop)
else:
    print("No columns to drop.")

# One-hot encode the merchant codes plus any column that is still non-numeric.
# Indicator columns produced earlier (category_*, gender_M) are deliberately
# left alone: running get_dummies on them again only renames them (e.g.
# 'gender_M' -> 'gender_M_True') and makes this cell non-idempotent.
non_numeric = set(data.select_dtypes(exclude=['number', 'bool']).columns)
columns_to_encode = [
    col for col in data.columns
    if col != 'is_fraud' and (col == 'merchant' or col in non_numeric)
]

if columns_to_encode:
    data = pd.get_dummies(data, columns=columns_to_encode, drop_first=True)
    print("Encoded columns:", columns_to_encode)
else:
    print("No categorical columns to encode.")

# Check for missing values in the target column
print("Missing values in target column (y):", data['is_fraud'].isnull().sum())

# Separate features and target
X = data.drop(columns='is_fraud')
y = data['is_fraud']

# Ensure all features are numeric
print("Data types in X:")
print(X.dtypes)

# Apply SMOTE to balance the dataset
from imblearn.over_sampling import SMOTE

# NOTE: X_res / y_res contain synthetic minority rows generated from the whole
# dataset, so any hold-out set carved out of X_res is not independent of the
# rows used for training.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Check the shape of the resampled data
print("Resampled data shape:", X_res.shape, y_res.shape)

Columns in the dataset: Index(['cc_num', 'amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'hour',
       ...
       'merchant_684', 'merchant_685', 'merchant_686', 'merchant_687',
       'merchant_688', 'merchant_689', 'merchant_690', 'merchant_691',
       'merchant_692', 'gender_M_True'],
      dtype='object', length=718)
No columns to drop.
Encoded columns: ['category_food_dining', 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos', 'category_health_fitness', 'category_home', 'category_kids_pets', 'category_misc_net', 'category_misc_pos', 'category_personal_care', 'category_shopping_net', 'category_shopping_pos', 'category_travel']
Missing values in target column (y): 0
Data types in X:
cc_num                           int64
amt                            float64
lat                            float64
long                           float64
city_pop                         int64
                                ...   
c

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a forest on the resampled data purely to rank the features
model = RandomForestClassifier(random_state=42).fit(X_res, y_res)

# Pair every feature name with its importance score, strongest first
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values('Importance', ascending=False)

print(feature_importance)

          Feature  Importance
1             amt    0.176962
3            long    0.081167
4        city_pop    0.080698
2             lat    0.069857
7      merch_long    0.068304
..            ...         ...
660  merchant_650    0.000000
340  merchant_330    0.000000
290  merchant_280    0.000000
12     merchant_2    0.000000
10          month    0.000000

[717 rows x 2 columns]


In [None]:
from sklearn.model_selection import train_test_split

# Split the ORIGINAL (un-resampled) data into training and testing sets.
# Splitting after SMOTE would place synthetic rows, interpolated from
# training neighbours, into the test set and inflate every metric.
# stratify=y keeps the fraud ratio the same in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the training fold only; the test fold keeps the real class
# distribution so the evaluation reflects real-world performance.
from imblearn.over_sampling import SMOTE

X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build the classifier with a fixed seed so repeated runs give the same forest
forest_params = {'random_state': 42}
model = RandomForestClassifier(**forest_params)

# Fit on the training split
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Predict on the test set: hard labels for the report and confusion matrix
y_pred = model.predict(X_test)

# Probability of the positive class (column 1 of predict_proba, which follows
# the order of model.classes_). ROC AUC is a ranking metric and needs
# continuous scores; feeding it 0/1 predictions collapses the curve to a
# single threshold.
y_score = model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("AUC-ROC Score:", roc_auc_score(y_test, y_score))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3142
         1.0       1.00      1.00      1.00      3072

    accuracy                           1.00      6214
   macro avg       1.00      1.00      1.00      6214
weighted avg       1.00      1.00      1.00      6214

AUC-ROC Score: 1.0
Confusion Matrix:
[[3142    0]
 [   0 3072]]


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; every combination is evaluated
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Exhaustive search with 3-fold cross-validation, ranked by ROC AUC
grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=3)
grid_search = grid_search.fit(X_train, y_train)

# Report the winning combination
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
