<a href="https://colab.research.google.com/github/neha-369-tes/CODSOFT-/blob/main/CreditCardFraudDetectionByNeha2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer # Import the imputer

# Load the datasets
df = pd.read_csv("/content/fraudTrain.csv")
df.drop(columns=["Unnamed: 0", "trans_num", "street"], inplace=True)

# Sample data for training
data = df.head(n=20000)

# Preprocess the data
df_processed = pd.get_dummies(data=data)
x_train = df_processed.drop(columns='is_fraud', axis=1)
y_train_filled = df_processed['is_fraud'].fillna(value=0)

# Impute missing values in x_train (replace NaNs with the most frequent value)
imputer = SimpleImputer(strategy='most_frequent') # Create an imputer object
x_train_imputed = pd.DataFrame(imputer.fit_transform(x_train), columns=x_train.columns) # Fit and transform x_train

# Load the test data
df_test = pd.read_csv("/content/fraudTest.csv")
df_test.drop(columns=["Unnamed: 0", "trans_num", "street"], inplace=True)

# Sample data for testing
data_test = df_test.sample(frac=1, random_state=1).reset_index(drop=True).head(n=5000)

# Preprocess the test data
df_processed_test = pd.get_dummies(data=data_test)

# Align the columns in train and test datasets
x_test = df_processed_test.drop(columns='is_fraud', axis=1).reindex(columns=x_train.columns, fill_value=0)
y_test_filled = df_processed_test['is_fraud'].fillna(value=0)

# Impute missing values in x_test (using the same imputer fitted on x_train)
x_test_imputed = pd.DataFrame(imputer.transform(x_test), columns=x_test.columns) # Transform x_test

# Logistic Regression (use imputed data)
LR = LogisticRegression(solver='liblinear')
LR.fit(x_train_imputed, y_train_filled) # Fit with imputed data
predictions_lr = LR.predict(x_test_imputed) # Predict using imputed data
LR_Accuracy_Score = accuracy_score(y_test_filled, predictions_lr)
print("Logistic Regression Accuracy Score:", LR_Accuracy_Score)

# Decision Tree Classifier (use imputed data)
Tree = DecisionTreeClassifier()
Tree.fit(x_train_imputed, y_train_filled) # Fit with imputed data
predictions_tree = Tree.predict(x_test_imputed) # Predict using imputed data
Tree_Accuracy_Score = accuracy_score(y_test_filled, predictions_tree)
print("Decision Tree Accuracy Score:", Tree_Accuracy_Score)

# RandomForest Classifier (use imputed data)
ranr = RandomForestClassifier(n_estimators=1000, random_state=42)
ranr.fit(x_train_imputed, y_train_filled) # Fit with imputed data
predictions_rf = ranr.predict(x_test_imputed) # Predict using imputed data

# Evaluate the RandomForest predictions
mse = mean_squared_error(y_test_filled, predictions_rf)
mae = mean_absolute_error(y_test_filled, predictions_rf)
r2 = r2_score(y_test_filled, predictions_rf)

print("Random Forest Mean Squared Error:", mse)
print("Random Forest Mean Absolute Error:", mae)
print("Random Forest R-squared Score:", r2)

Logistic Regression Accuracy Score: 0.9964
Decision Tree Accuracy Score: 0.9952
Random Forest Mean Squared Error: 0.0056
Random Forest Mean Absolute Error: 0.0056
Random Forest R-squared Score: -0.5611757883937736
