# **Step 1: Data Preprocessing**

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier,IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the card master table (card number, city, state, zipcode, credit limit).
# zipcode is read as text: ZIP codes are identifiers, and parsing them as
# integers drops leading zeros (e.g. Auburn, MA 01501 is shown as 1501).
ccfd_df = pd.read_csv(
    '/content/drive/MyDrive/UnifiedMentor_Projects/Credit_Card_Fraud_Detection/Credit Card Fraud Detection .csv',
    dtype={'zipcode': str},
)
# Re-pad in case the file itself stores the codes without leading zeros
# (assumes 5-digit US ZIP codes - TODO confirm against the data source).
ccfd_df['zipcode'] = ccfd_df['zipcode'].str.zfill(5)
ccfd_df.head()

Unnamed: 0,credit_card,city,state,zipcode,credit_card_limit
0,1280981422329509,Dallas,PA,18612,6000
1,9737219864179988,Houston,PA,15342,16000
2,4749889059323202,Auburn,MA,1501,14000
3,9591503562024072,Orlando,WV,26412,18000
4,2095640259001271,New York,NY,10001,20000


In [None]:
# (rows, columns) of the card table
ccfd_df.shape

(984, 5)

In [None]:
# Per-column count of missing values in the card table
ccfd_missing_mask = ccfd_df.isnull()
ccfd_missing_mask.sum()

credit_card          0
city                 0
state                0
zipcode              0
credit_card_limit    0
dtype: int64

In [None]:
# Load the transaction table (card number, timestamp, amount, coordinates)
transactions_path = '/content/drive/MyDrive/UnifiedMentor_Projects/Credit_Card_Fraud_Detection/transactions.csv'
tra_df = pd.read_csv(transactions_path)
tra_df.head()

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719


In [None]:
# (rows, columns) of the transaction table
tra_df.shape

(294588, 5)

In [None]:
# Per-column count of missing values in the transaction table
tra_missing_mask = tra_df.isnull()
tra_missing_mask.sum()

credit_card                  0
date                         0
transaction_dollar_amount    0
Long                         0
Lat                          0
dtype: int64

In [None]:
# Drop transactions that have any missing value. The result is rebound to the
# same name instead of using inplace=True, so the cell does not mutate a frame
# that an earlier cell already displayed.
tra_df = tra_df.dropna()

In [None]:
# Attach each card's attributes (city, state, zipcode, credit limit) to its
# transactions. how='inner' spells out the join that was previously implicit.
# validate='many_to_one' makes pandas raise a MergeError if a card number
# occurs more than once in ccfd_df, which would otherwise silently duplicate
# transaction rows.
mix_df = tra_df.merge(
    ccfd_df,
    on='credit_card',
    how='inner',
    validate='many_to_one',
)
mix_df.head()

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat,city,state,zipcode,credit_card_limit
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737,Houston,PA,15342,20000
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114,Houston,PA,15342,20000
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004,Houston,PA,15342,20000
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895,Houston,PA,15342,20000
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719,Houston,PA,15342,20000


In [None]:
# (rows, columns) of the merged frame; compare the row count with tra_df.shape
mix_df.shape

(294588, 9)

In [None]:
# Column dtypes, non-null counts and memory usage of the merged frame
mix_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294588 entries, 0 to 294587
Data columns (total 9 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   credit_card                294588 non-null  int64  
 1   date                       294588 non-null  object 
 2   transaction_dollar_amount  294588 non-null  float64
 3   Long                       294588 non-null  float64
 4   Lat                        294588 non-null  float64
 5   city                       294588 non-null  object 
 6   state                      294588 non-null  object 
 7   zipcode                    294588 non-null  int64  
 8   credit_card_limit          294588 non-null  int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 20.2+ MB


In [None]:
# Per-column count of missing values after the merge
mix_missing_mask = mix_df.isnull()
mix_missing_mask.sum()

credit_card                  0
date                         0
transaction_dollar_amount    0
Long                         0
Lat                          0
city                         0
state                        0
zipcode                      0
credit_card_limit            0
dtype: int64

In [None]:
# Columns used as model inputs for fraud detection
features = [
    'transaction_dollar_amount',
    'Long',
    'Lat',
    'credit_card_limit',
]

In [None]:
# Feature matrix only: the merged data has no target column, so anomaly
# detection on it is unsupervised.
X = mix_df.loc[:, features]

# **Step 2: Handling Imbalanced Data**

In [None]:
# Placeholder target for demonstration only (1 = fraud, 0 = not fraud).
# The labels are drawn uniformly at random with a fixed seed, one per merged
# row, and are independent of the feature values.
n_transactions = len(mix_df)
np.random.seed(42)
y = np.random.randint(0, 2, size=n_transactions)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Apply SMOTE to balance the training set.
# SMOTE selects interpolation partners by nearest-neighbour distance, so the
# features are standardised before resampling; on raw values credit_card_limit
# (thousands) would dominate the amount and coordinate columns.
# The resampled rows are then mapped back to original units and returned as a
# DataFrame, so later cells can keep scaling X_train_resampled themselves.
smote_scaler = StandardScaler().fit(X_train)
smote = SMOTE(random_state=42)
X_resampled_std, y_train_resampled = smote.fit_resample(
    smote_scaler.transform(X_train),
    y_train,
)
X_train_resampled = pd.DataFrame(
    smote_scaler.inverse_transform(X_resampled_std),
    columns=X_train.columns,
)

In [None]:
# Compare label counts before and after resampling
counts_before = pd.Series(y_train).value_counts()
counts_after = pd.Series(y_train_resampled).value_counts()
print("Class distribution before SMOTE:", counts_before)
print("Class distribution after SMOTE:", counts_after)

Class distribution before SMOTE: 1    103151
0    103060
Name: count, dtype: int64
Class distribution after SMOTE: 1    103151
0    103151
Name: count, dtype: int64


# **Step 3: Feature Scaling**

In [None]:
# Standardise the features: the statistics are learned from the resampled
# training data only and then reused unchanged on the test split.
scaler = StandardScaler().fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Preview the first five standardised training rows
scaled_preview = X_train_scaled[:5]
print("First few rows of scaled data:\n", scaled_preview)

First few rows of scaled data:
 [[-0.31531852  0.19906627  0.41310056  0.56399123]
 [-0.08967979 -0.01657919 -0.06149617  0.31277237]
 [-0.50766498 -1.41026545 -0.37817379 -1.19454081]
 [-0.47628423  0.0089939   0.08160397 -0.06405593]
 [-0.24666318 -0.19423435 -0.13224542 -1.32015024]]


# **Step 4: Model Training and Evaluation**

## Random Forest Classifier (Supervised)

In [None]:
# Random forest trained on the resampled, standardised training data
rfc = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,  # use all available cores
)
rfc.fit(X_train_scaled, y_train_resampled)

In [None]:
# Predicted class labels for the held-out test split, using the features
# standardised with the training-set scaler
y_pred_rfc = rfc.predict(X_test_scaled)

In [None]:
# Score the random forest on the held-out test split
rfc_report = classification_report(y_test, y_pred_rfc)
rfc_confusion = confusion_matrix(y_test, y_pred_rfc)
print("Random Forest Classifier:")
print(rfc_report)
print("Confusion Matrix:\n", rfc_confusion)

Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.50      0.45      0.47     44111
           1       0.50      0.55      0.53     44266

    accuracy                           0.50     88377
   macro avg       0.50      0.50      0.50     88377
weighted avg       0.50      0.50      0.50     88377

Confusion Matrix:
 [[19914 24197]
 [19861 24405]]


## Isolation Forest (Unsupervised Anomaly Detection)

Since I don’t have actual fraud labels, unsupervised methods such as Isolation Forest are a natural fit for detecting outliers or anomalies in my data.

In [None]:
# Standardise the full feature matrix for the unsupervised model.
# Note: this rebinds `scaler`, replacing the scaler fitted for the random forest.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Fit an Isolation Forest on the standardised features.
# contamination=0.01 is the assumed share of anomalous rows.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,
)
iso_forest.fit(X_scaled)


In [None]:
# Predict anomalies (fraudulent transactions) on the same rows the model was
# fitted on; the raw output is recoded to 0/1 in the following cell, which
# treats -1 as the anomaly marker
y_pred_iso = iso_forest.predict(X_scaled)

In [None]:
# Convert Isolation Forest output to binary labels (1 = anomaly/fraud, 0 = normal).
# The predictions are recomputed from the fitted model instead of being read
# from y_pred_iso, so re-running this cell is safe: remapping labels that were
# already converted to 0/1 would find no -1 values and mark every row as normal.
y_pred_iso = np.where(iso_forest.predict(X_scaled) == -1, 1, 0)

In [None]:
# Store the anomaly flag next to the original transaction columns
mix_df['is_fraud'] = y_pred_iso

# Show the first few flagged transactions
flagged_rows = mix_df.loc[mix_df['is_fraud'] == 1]
print("Anomalies detected (fraudulent transactions):")
print(flagged_rows.head())


Anomalies detected (fraudulent transactions):
          credit_card                 date  transaction_dollar_amount  \
14   1003715054175576  2015-09-18 21:44:21                     146.75   
128  1003715054175576  2015-10-25 00:11:25                      87.77   
141  1003715054175576  2015-09-25 20:51:42                      53.31   
306  1013870087888817  2015-10-06 00:34:58                      40.52   
484  1013870087888817  2015-08-07 17:47:29                      66.33   

           Long        Lat        city state  zipcode  credit_card_limit  \
14    69.475216  25.125749     Houston    PA    15342              20000   
128  102.032338 -32.343578     Houston    PA    15342              20000   
141   -7.373733 -11.785999     Houston    PA    15342              20000   
306  110.386103  52.238635  Washington    NH     3280              15000   
484   31.083820   0.500182  Washington    NH     3280              15000   

     is_fraud  
14          1  
128         1  
141       