<a href="https://colab.research.google.com/github/nik2043/Codsoft/blob/main/credit_card_fraud_detection_codsoft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 💼 Credit Card Fraud Detection (CodSoft Internship Project)

# ✅ Objective:
# Detect fraudulent credit card transactions using Logistic Regression, Decision Tree, and Random Forest.


In [1]:
# 🧰 Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
# 📥 Step 1: Upload Dataset
from google.colab import files
uploaded = files.upload()


Saving fraud_detection_credit_card_small.csv to fraud_detection_credit_card_small.csv


In [3]:
# Load Dataset
df = pd.read_csv('fraud_detection_credit_card_small.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,merch_long,is_fraud,merch_zipcode,Merchant_Category,Transaction_Type,Customer_Satisfaction_Score,Transaction_Time,Customer_Age,Payment_Method,Loyalty_Points_Earned
0,1045211,2020-03-09 15:09:26,577588686219,fraud_Towne LLC,misc_pos,194.51,James,Strickland,M,25454 Leonard Lake,...,-78.865012,0,15909.0,Electronics,In-store,6,11:10,69,Debit Card,332
1,547406,2019-08-22 15:49:01,30376238035123,fraud_Friesen Ltd,health_fitness,52.32,Cynthia,Davis,F,7177 Steven Forges,...,-123.636337,0,,Entertainment,Online,8,20:42,47,Mobile Payment,476
2,110142,2019-03-04 01:34:16,4658490815480264,fraud_Mohr Inc,shopping_pos,6.53,Tara,Richards,F,4879 Cristina Station,...,-78.89819,0,15961.0,Restaurants,Online,3,05:19,69,Debit Card,100
3,1285953,2020-06-16 20:04:38,3514897282719543,fraud_Gaylord-Powlowski,home,7.33,Steven,Faulkner,M,841 Cheryl Centers Suite 115,...,-76.542384,0,,Restaurants,Online,4,06:12,31,Mobile Payment,460
4,271705,2019-05-14 05:54:48,6011381817520024,"fraud_Christiansen, Goyette and Schamberger",gas_transport,64.29,Kristen,Allen,F,8619 Lisa Manors Apt. 871,...,-104.092324,0,82082.0,Clothing,In-store,5,01:01,44,Mobile Payment,372


In [9]:
# 📊 Step 2: Preprocessing
print("Class distribution:\n", df['is_fraud'].value_counts())

# Convert 'trans_date_trans_time' to numerical (Unix timestamp)
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']).astype(np.int64) // 10**9

# Normalize 'Amount'
scaler = StandardScaler()
df['amt'] = scaler.fit_transform(df[['amt']])

# Identify non-numeric columns
non_numeric_cols = df.select_dtypes(exclude=np.number).columns
print(f"\nNon-numeric columns before dropping: {list(non_numeric_cols)}")

# Drop non-numeric columns (except 'is_fraud' which is the target)
columns_to_drop = non_numeric_cols.drop('is_fraud', errors='ignore')
df_processed = df.drop(columns=columns_to_drop)

# Handle missing values in 'merch_zipcode' by imputing with the median
if 'merch_zipcode' in df_processed.columns:
    median_merch_zipcode = df_processed['merch_zipcode'].median()
    df_processed['merch_zipcode'].fillna(median_merch_zipcode, inplace=True)


# Split features and labels
X = df_processed.drop('is_fraud', axis=1)
y = df_processed['is_fraud']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\nShape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Class distribution:
 is_fraud
0    99402
1      598
Name: count, dtype: int64

Non-numeric columns before dropping: ['merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num', 'Merchant_Category', 'Transaction_Type', 'Transaction_Time', 'Payment_Method']

Shape of X_train: (80000, 15)
Shape of X_test: (20000, 15)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['merch_zipcode'].fillna(median_merch_zipcode, inplace=True)


In [10]:
# 🤖 Step 3: Train Models

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)


In [12]:
# 📈 Step 4: Evaluation

def evaluate(name, y_true, y_pred):
    print(f"\n{name} Performance:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))

# Evaluate all models
evaluate("Logistic Regression", y_test, lr_pred)
evaluate("Decision Tree", y_test, dt_pred)
evaluate("Random Forest", y_test, rf_pred)



Logistic Regression Performance:
Accuracy: 0.994
Confusion Matrix:
 [[19880     0]
 [  120     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     19880
           1       0.00      0.00      0.00       120

    accuracy                           0.99     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.99      0.99      0.99     20000


Decision Tree Performance:
Accuracy: 0.98995
Confusion Matrix:
 [[19766   114]
 [   87    33]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     19880
           1       0.22      0.28      0.25       120

    accuracy                           0.99     20000
   macro avg       0.61      0.63      0.62     20000
weighted avg       0.99      0.99      0.99     20000


Random Forest Performance:
Accuracy: 0.99455
Confusion Matrix:
 [[19876     4]
 [  105    15]]
Classi

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# ✅ Conclusion:
# - Logistic Regression provides a good baseline.
# - Decision Tree can overfit on imbalanced data.
# - Random Forest generally performs better and handles imbalance more robustly.
# - Consider using SMOTE or undersampling for better fraud detection performance in real-world applications.
