# Importing Libraries

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


## Importing DataSet

In [44]:
# Load training dataset
train_data = pd.read_csv('Data/fraudTrain.csv')

# Load testing dataset
test_data = pd.read_csv('Data/fraudTest.csv')

In [45]:
train_data.info()

train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

# Preprocessing for training & testing data

In [46]:

X_train = train_data.drop(columns=['is_fraud'])  # Features
y_train = train_data['is_fraud']  # Target variable

X_test = test_data.drop(columns=['is_fraud'])  # Features
y_test = test_data['is_fraud']  # Target variable

# Data Preprocessing and Normalization

In [47]:
numeric_features = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']
categorical_features = ['merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job']


# Preprocessing Pipeline

In [48]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # 'ignore' unknown categories
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Model Selection

In [49]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Train model

In [50]:
model.fit(X_train, y_train)

# Make predictions

In [None]:
y_pred = model.predict(X_test)

# Evaluate model

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'

# Model Score

In [None]:

print("Model Performance:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"AUC-ROC: {roc_auc}")

Accuracy: 0.843
Precision: 0.7401574803149606
Recall: 0.43119266055045874
F1-score: 0.5449275362318841
ROC-AUC: 0.6944965860297051
