<a href="https://colab.research.google.com/github/manthripranitha/IDP_1/blob/main/IDP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Location of the raw transactions CSV; edit this single constant if the file moves.
DATA_PATH = '/content/online payment fraud transaction detection.csv'

# Load the transactions into the working frame used by the rest of the notebook.
td = pd.read_csv(DATA_PATH)

# Show a short preview rather than printing the whole frame.
td.head()

In [None]:
# Dataset dimensions as a (rows, columns) tuple.

td.shape

In [None]:
# Data type of every column in td.

td.dtypes

In [None]:
# Boolean mask with the same shape as td: True wherever a cell is missing.
td.isna()

In [None]:
# Count the missing cells once, then either report a clean dataset or
# drop every row that contains at least one missing value.
missing_total = td.isna().sum().sum()

if missing_total <= 0:
    print('There are no missing values in the dataset.')
else:
    print(f'There are {missing_total} missing values in the dataset\n')
    td = td.dropna()
    print('Shape after dropping missing values:', td.shape)


In [None]:
# Flag rows with an extreme value in any numeric column and visualise them.

from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Absolute z-score of every numeric column.
numeric_part = td.select_dtypes(include=np.number)
z_scores = np.abs(stats.zscore(numeric_part))

# A row counts as an outlier when any of its z-scores exceeds the threshold.
threshold = 3
outliers = (z_scores > threshold).any(axis=1)

# Row index on x, per-row mean of the numeric columns on y, coloured by the outlier flag.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=td,
    x=td.index,
    y=numeric_part.mean(axis=1),
    hue=outliers,
    ax=ax,
)
ax.set_title('Outliers Detected Using Z-Scores')
ax.set_xlabel('Index')
ax.set_ylabel('Mean Value of Numeric Features')
ax.grid(True)
plt.show()

In [None]:
# Column labels currently present in td.
td.columns

In [None]:
# Distinct values found in the 'type' column.
td['type'].unique()

In [None]:
# Number of rows for each distinct 'type' value.
type_counts = td.groupby('type').size()
print(type_counts)


In [None]:
# Bar chart: how many transactions fall under each 'type'.

# Frequency of every type, most frequent first.
type_counts = td['type'].value_counts()

fig, ax = plt.subplots(figsize=(7, 3))
ax.bar(type_counts.index, type_counts.values, color='skyblue')

ax.set_title('Type vs Counts')
ax.set_xlabel('Type')
ax.set_ylabel('Counts')
plt.xticks(rotation=45)  # slanted labels are easier to read
ax.grid(axis='y', alpha=0.5)

plt.show()


In [None]:
pip install colorama


In [None]:
from colorama import Fore, Style

# Most and least frequent transaction type, read from type_counts (computed in an earlier cell).
most_common_type, least_common_type = type_counts.idxmax(), type_counts.idxmin()
most_common_count, least_common_count = type_counts.max(), type_counts.min()

# Green for the most common type, red for the least common; reset the colour afterwards.
print(f'{Fore.GREEN}Most Common Type: {most_common_type} (Count: {most_common_count}){Style.RESET_ALL}')
print(f'{Fore.RED}Least Common Type: {least_common_type} (Count: {least_common_count}){Style.RESET_ALL}')

In [None]:


# Stand-alone illustration of pd.factorize on a tiny hand-written frame.
# It does not touch td.
data = pd.DataFrame({
    'type': ['CASH_OUT', 'PAYMENT', 'CASH_IN', 'TRANSFER', 'DEBIT', 'CASH_OUT', 'PAYMENT']
})

# factorize returns one integer code per row plus the distinct labels,
# both in order of first appearance.
data['type_numeric'], unique_types = pd.factorize(data['type'])

# Stored under its own name so the real-data `type_counts` computed from td
# is not overwritten by these demo counts.
demo_type_counts = data['type'].value_counts()

# Display the demo frame with numeric codes and the per-type counts.
print(data)
print("\nCounts of each type:")
print(demo_type_counts)


In [None]:
# Distinct labels present in the isFraud target column.
td['isFraud'].unique()

In [None]:
import matplotlib.pyplot as plt

# Count each target value and order by the class value itself, so every bar,
# annotation and tick label is positioned by class rather than by frequency rank.
target_counts = td['isFraud'].value_counts().sort_index()

# Human-readable tick label per class value; unknown values fall back to str(value).
class_labels = {0: 'Not Fraud (0)', 1: 'Fraud (1)'}

# Create a bar plot (bars sit at x == class value)
plt.figure(figsize=(8, 5))
plt.bar(target_counts.index, target_counts.values, color=['lightcoral', 'lightgreen'])

# Add titles and labels
plt.xlabel('Target')
plt.ylabel('Count')
plt.title('Target Counts \n (isn\'t Fraud = 0 || is Fraud = 1)')

# Annotate each bar at its own x position (the class value), slightly above the bar top.
for class_value, count in target_counts.items():
    plt.text(class_value, count + 5, str(count), ha='center', fontsize=12)

# Label each tick from the class value it sits on, not from a fixed-order list.
plt.xticks(target_counts.index, [class_labels.get(value, str(value)) for value in target_counts.index])

# Enable the grid on the y-axis for better readability
plt.grid(axis='y', alpha=0.7)

# Show the plot
plt.show()


In [None]:
#removing unnecessary columns

# Re-bind td to the reduced frame instead of dropping in place, and ignore columns
# that are already gone, so re-running this cell does not raise KeyError.
td = td.drop(columns=['customer_starting_transaction', 'Recipient_transaction'], errors='ignore')
td.shape

In [None]:
#feature extraction

# Give every distinct 'type' label an integer code, numbered in the order
# the labels are returned by unique().
type_labels = td['type'].unique()
type_mapping = dict(zip(type_labels, range(len(type_labels))))

# Store the codes next to the original column.
td['type_numeric'] = td['type'].map(type_mapping)

# Correlation is only defined for numeric data, so restrict to numeric columns
# (which now include type_numeric).
numeric_td = td.select_dtypes(include=['number'])
correlation_matrix = numeric_td.corr()

print(correlation_matrix)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Pairwise correlation of the numeric columns only.
correlation_matrix = td.select_dtypes(include=np.number).corr()
feature_labels = correlation_matrix.columns
tick_positions = np.arange(len(feature_labels))

# Draw the matrix as an image with a colour scale beside it.
fig, ax = plt.subplots(figsize=(8, 8))
cax = ax.imshow(correlation_matrix, interpolation='nearest', cmap='Spectral')
fig.colorbar(cax, ax=ax)

# One tick per feature on both axes.
ax.set_xticks(tick_positions)
ax.set_xticklabels(feature_labels, rotation=45)
ax.set_yticks(tick_positions)
ax.set_yticklabels(feature_labels)

# Write the rounded coefficient into every cell (x = column, y = row).
for row, row_values in enumerate(correlation_matrix.values):
    for col, val in enumerate(row_values):
        ax.text(col, row, f"{val:.1f}", ha='center', va='center', color='black')

ax.set_title('Correlation Matrix Heatmap')

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd

# Correlation between the 'oldbalance' and 'newbalance' columns of td
# (Series.corr with its default method).
correlation = td['oldbalance'].corr(td['newbalance'])

# Display the correlation
print(f"Correlation between 'oldbalance' and 'newbalance': {correlation}")


In [None]:
# Line plot of newbalance against oldbalance.

# Order the rows by oldbalance so the line runs left to right.
data_sorted = td.sort_values(by='oldbalance')

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    data_sorted['oldbalance'],
    data_sorted['newbalance'],
    marker='o',
    linestyle='-',
    color='b',
    label='Data',
)
ax.set_xlabel('oldbalance')
ax.set_ylabel('newbalance')
ax.set_title('Line Plot of oldbalance vs. newbalance')
ax.grid(True)
ax.legend()
plt.show()


In [None]:
# dropping unnecessary features
# `del td[col]` raises KeyError when the cell is run a second time; dropping with
# errors='ignore' and re-binding td keeps the cell safe to re-run.
td = td.drop(columns=['isFlaggedFraud', 'step'], errors='ignore')
td.columns


In [None]:
# Remove the helper encoding column added for the correlation analysis.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
td = td.drop(columns=['type_numeric'], errors='ignore')

In [None]:
# (rows, columns) of td at this point in the notebook.
td.shape


In [None]:
# Preview the first rows of the cleaned frame instead of printing the whole frame.
td.head()

In [None]:
# Summary statistics from describe(), transposed so each described column is a row.
td.describe().T

In [None]:
# Separate the feature matrix X from the target vector y.

target_column = 'isFraud'

# X Data: every column except the target
X = td.drop(columns=[target_column])
print('X shape is : ', X.shape)
print()

# y Data: the target column alone
y = td[target_column]
print('y shape is : ', y.shape)

In [None]:
# Hold out 30% of the rows for testing with scikit-learn's train_test_split.
from sklearn.model_selection import train_test_split

# stratify=y is passed so the split is stratified on the target;
# random_state fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each of the four pieces.
shapes = tuple(part.shape for part in split_parts)
print('Shapes of the splitted data (X_train, X_test, y_train, y_test):', shapes)


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# NOTE: the former `!pip install pandas` / `!pip install scikit-learn` lines were removed.
# Both packages are already imported by earlier cells, so reinstalling them here
# could only change versions underneath a running kernel.

# Split the data. stratify=y is kept so that this split does not silently replace
# the stratified split from the previous cell with an unstratified one.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y
)

# Identify categorical and numerical features
categorical_features = ['type']  # string-valued column that needs one-hot encoding
numerical_features = X_train.select_dtypes(include=['number']).columns

# MinMax-scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ])

# Fit on the training rows only, then apply the fitted transform to the test rows.
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Display the shape of scaled data
print('Shapes of the scaled data (X_train_scaled, X_test_scaled):', (X_train_scaled.shape, X_test_scaled.shape))

In [None]:
                                    #DECISION TREE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Initialize the Decision Tree Classifier.
# random_state is fixed (same seed as the splits) so repeated runs build the same
# tree and report the same scores.
Model_DT = DecisionTreeClassifier(random_state=42)

# Fit the model on the scaled training data
Model_DT.fit(X_train_scaled, y_train)

# Predict on the scaled test data
y_pred_DT = Model_DT.predict(X_test_scaled)

# Accuracy on the training rows versus the held-out test rows
Train_Accuracy = accuracy_score(y_train, Model_DT.predict(X_train_scaled))
Test_Accuracy = accuracy_score(y_test, y_pred_DT)

# Print results
print(f"Training Accuracy: {Train_Accuracy * 100:.2f} %")
print(f"Testing Accuracy: {Test_Accuracy * 100:.2f} %")


In [None]:
# Confusion matrix of the Decision Tree predictions on the test split.

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Rows are true labels, columns are predicted labels.
CM = confusion_matrix(y_true=y_test, y_pred=y_pred_DT)
class_names = ['Not Fraud', 'Fraud']

# Annotated heatmap of the raw counts.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    CM,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()


In [None]:
from sklearn.metrics import classification_report

# Text report from classification_report for the Decision Tree test predictions.
dt_report = classification_report(y_test, y_pred_DT)
print(dt_report)

In [None]:
# Decision Tree evaluation on the test split.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score # Importing necessary functions

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
Accuracy_DT = accuracy_score(y_test, y_pred_DT)
print(f'➤➤➤ Accuracy Score : {Accuracy_DT * 100 : .2f} %\n')


# Precision = TP / (TP + FP)
Precision_DT = precision_score(y_test, y_pred_DT)
print(f'➤➤➤ Precision Score : {Precision_DT * 100 : .2f} %\n')


# Recall = TP / (TP + FN)
Recall_DT = recall_score(y_test, y_pred_DT)
print(f'➤➤➤ Recall Score : {Recall_DT * 100 : .2f} %\n')


# F1 Score = 2 × ((Precision * Recall) / (Precision + Recall))
F1_Score_DT = f1_score(y_test, y_pred_DT)
print(f'➤➤➤ F1 Score : {F1_Score_DT * 100 : .2f} %\n')


# ROC AUC is computed from a continuous score, so pass the predicted probability of
# the positive class (column 1 == Model_DT.classes_[1]) rather than thresholded labels.
# X_test_scaled must be the matrix this model was evaluated on (run cells in order).
y_score_DT = Model_DT.predict_proba(X_test_scaled)[:, 1]
ROC_AUC_DT = roc_auc_score(y_test, y_score_DT)
print(f'➤➤➤ AUC_ROC : {ROC_AUC_DT * 100 : .2f} %\n')

In [None]:
import matplotlib.pyplot as plt

# Plot the Decision Tree metrics computed in the previous cell.
# (This cell previously plotted hard-coded example numbers.)
model_scores = [Accuracy_DT, Precision_DT, Recall_DT, F1_Score_DT, ROC_AUC_DT]
score_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

# Plot
plt.figure(figsize=(8, 5))
plt.bar(score_names, model_scores, color=['blue', 'orange', 'green', 'red', 'purple'])

# Add titles and labels
plt.title('Model Evaluation Metrics')
plt.xlabel('Metrics')
plt.ylabel('Score')
plt.ylim(0, 1)  # all five metrics lie in [0, 1]
plt.axhline(y=0.5, color='gray', linestyle='--')  # horizontal reference line at 0.5
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
                                                                         #Naive Bayes


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# 1. Split into train, validation, and holdout.
# Later cells evaluate on the existing X_test / y_test, so those rows are removed first.
# Re-splitting all of X here would put test rows into the new training set (leakage).
# Assumes X's index labels are unique (default RangeIndex from read_csv).
X_dev = X.drop(index=X_test.index)
y_dev = y.drop(index=y_test.index)

# Stratify both splits on the target so each part keeps the class proportions.
X_train_val, X_holdout, y_train_val, y_holdout = train_test_split(
    X_dev, y_dev, test_size=0.2, random_state=42, stratify=y_dev
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

# Guard: no training row may also be a test row.
assert X_train.index.intersection(X_test.index).empty, 'train/test overlap'

# Create a scaler object
scaler = StandardScaler()

# Only numeric columns are scaled and used below; non-numeric columns such as
# 'type' are therefore not part of this model's inputs.
numerical_features = X_train.select_dtypes(include=['number']).columns

# Fit the scaler on the numerical features of the training data and transform
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train[numerical_features]),
                               columns=numerical_features,
                               index=X_train.index)

# Transform the validation and holdout data using the trained scaler
X_val_scaled = pd.DataFrame(scaler.transform(X_val[numerical_features]),
                             columns=numerical_features,
                             index=X_val.index)

X_holdout_scaled = pd.DataFrame(scaler.transform(X_holdout[numerical_features]),
                                 columns=numerical_features,
                                 index=X_holdout.index)  # Scale the holdout data before prediction


# 2. Train on the training part (X_val_scaled / y_val are available for tuning)
model = GaussianNB()
model.fit(X_train_scaled, y_train)

# 3. Evaluate on the holdout set
y_pred_holdout = model.predict(X_holdout_scaled)
holdout_accuracy = accuracy_score(y_holdout, y_pred_holdout)
print(f"Holdout Accuracy: {holdout_accuracy}")

In [None]:
# Confusion matrix of the Naive Bayes predictions on the holdout rows.
conf_matrix = confusion_matrix(y_holdout, y_pred_holdout)

# Heatmap of the counts; tick labels come from the class labels the model learned.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=model.classes_,
    yticklabels=model.classes_,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix for Holdout Set')
plt.show()

In [None]:
# Scale the numeric columns of the test rows with the already-fitted scaler,
# keeping the original column names and row index.
test_numeric_scaled = scaler.transform(X_test[numerical_features])
X_test_scaled = pd.DataFrame(
    test_numeric_scaled,
    columns=numerical_features,
    index=X_test.index,
)

# Predictions from `model` (the Naive Bayes classifier) for the test rows.
y_pred_NB = model.predict(X_test_scaled)

# Print the classification report
print(classification_report(y_test, y_pred_NB))

In [None]:
# Naive Bayes evaluation on the test split.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score # Importing necessary functions

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
Accuracy_NB = accuracy_score(y_test, y_pred_NB)
print(f'➤➤➤ Accuracy Score : {Accuracy_NB * 100 : .2f} %\n')


# Precision = TP / (TP + FP)
Precision_NB = precision_score(y_test, y_pred_NB)
print(f'➤➤➤ Precision Score : {Precision_NB * 100 : .2f} %\n')


# Recall = TP / (TP + FN)
Recall_NB = recall_score(y_test, y_pred_NB)
print(f'➤➤➤ Recall Score : {Recall_NB * 100 : .2f} %\n')


# F1 Score = 2 × ((Precision * Recall) / (Precision + Recall))
F1_Score_NB = f1_score(y_test, y_pred_NB)
print(f'➤➤➤ F1 Score : {F1_Score_NB * 100 : .2f} %\n')


# ROC AUC is computed from a continuous score, so pass the predicted probability of
# the positive class (column 1 == model.classes_[1]) rather than thresholded labels.
# `model` must still be the fitted Naive Bayes classifier here (run cells in order).
y_score_NB = model.predict_proba(X_test_scaled)[:, 1]
ROC_AUC_NB = roc_auc_score(y_test, y_score_NB)
print(f'➤➤➤ AUC_ROC : {ROC_AUC_NB * 100 : .2f} %\n')

In [None]:
# Naive Bayes metrics side by side.
# A pie chart rescales the values to shares of their sum, so its percentage labels
# are not the metric values. Bars on a 0-1 axis show each metric directly.
Scores = [Accuracy_NB, Precision_NB, Recall_NB, F1_Score_NB, ROC_AUC_NB]
Score_Names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC']

# Plot
fig, ax = plt.subplots(figsize=(7, 5))
bars = ax.bar(Score_Names, Scores, color=['blue', 'orange', 'green', 'red', 'purple'])

# Write the actual metric value (as a percentage) above each bar.
for bar, score in zip(bars, Scores):
    ax.text(bar.get_x() + bar.get_width() / 2, score + 0.01, f'{score * 100:.2f}%',
            ha='center', va='bottom')

ax.set_ylim(0, 1.1)  # metrics lie in [0, 1]; headroom for the value labels
ax.set_title('Naive Bayes Evaluation Metrics')
ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:

                                    #RANDOM FOREST




from sklearn.ensemble import RandomForestClassifier

# Random Forest with a fixed seed.
Model_RF = RandomForestClassifier(random_state=42)

# Train on the scaled training features.
Model_RF.fit(X_train_scaled, y_train)

# Class predictions for the scaled test features.
y_pred_RF = Model_RF.predict(X_test_scaled)

# Mean accuracy on each split via the estimator's own score().
Train_Accuracy_RF = Model_RF.score(X_train_scaled, y_train)
Test_Accuracy_RF = Model_RF.score(X_test_scaled, y_test)

# Report both accuracies as percentages.
for split_name, split_accuracy in (('Training', Train_Accuracy_RF), ('Testing', Test_Accuracy_RF)):
    print(f'{split_name} accuracy: {split_accuracy * 100:.2f} %')


In [None]:
# Confusion matrix of the Random Forest predictions on the test split.

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Rows are true labels, columns are predicted labels.
CM = confusion_matrix(y_true=y_test, y_pred=y_pred_RF)
class_names = ['Not Fraud', 'Fraud']

# Annotated heatmap of the raw counts.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    CM,
    annot=True,
    fmt='d',
    cmap='plasma',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Text report from classification_report for the Random Forest test predictions.
rf_report = classification_report(y_test, y_pred_RF)
print(rf_report)

In [None]:
# Random Forest evaluation on the test split.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score # Importing necessary functions

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
Accuracy_RF = accuracy_score(y_test, y_pred_RF)
print(f'➤➤➤ Accuracy Score : {Accuracy_RF * 100 : .2f} %\n')


# Precision = TP / (TP + FP)
Precision_RF = precision_score(y_test, y_pred_RF)
print(f'➤➤➤ Precision Score : {Precision_RF * 100 : .2f} %\n')


# Recall = TP / (TP + FN)
Recall_RF = recall_score(y_test, y_pred_RF)
print(f'➤➤➤ Recall Score : {Recall_RF * 100 : .2f} %\n')


# F1 Score = 2 × ((Precision * Recall) / (Precision + Recall))
F1_Score_RF = f1_score(y_test, y_pred_RF)
print(f'➤➤➤ F1 Score : {F1_Score_RF * 100 : .2f} %\n')


# ROC AUC is computed from a continuous score, so pass the predicted probability of
# the positive class (column 1 == Model_RF.classes_[1]) rather than thresholded labels.
y_score_RF = Model_RF.predict_proba(X_test_scaled)[:, 1]
ROC_AUC_RF = roc_auc_score(y_test, y_score_RF)
print(f'➤➤➤ AUC_ROC : {ROC_AUC_RF * 100 : .2f} %\n')

In [None]:
# Random Forest metrics side by side.
# A pie chart rescales the values to shares of their sum, so its percentage labels
# are not the metric values. Bars on a 0-1 axis show each metric directly.
Scores = [Accuracy_RF, Precision_RF, Recall_RF, F1_Score_RF, ROC_AUC_RF]
Score_Names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC']

# Plot
fig, ax = plt.subplots(figsize=(7, 5))
bars = ax.bar(Score_Names, Scores, color=['blue', 'orange', 'green', 'red', 'purple'])

# Write the actual metric value (as a percentage) above each bar.
for bar, score in zip(bars, Scores):
    ax.text(bar.get_x() + bar.get_width() / 2, score + 0.01, f'{score * 100:.2f}%',
            ha='center', va='bottom')

ax.set_ylim(0, 1.1)  # metrics lie in [0, 1]; headroom for the value labels
ax.set_title('Random Forest Evaluation Metrics')
ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, trained on the scaled training features.
Model_LR = LogisticRegression()
Model_LR.fit(X_train_scaled, y_train)

# Class predictions for the scaled test features.
y_pred_LR = Model_LR.predict(X_test_scaled)

# Mean accuracy on each split via the estimator's own score().
Train_Accuracy = Model_LR.score(X_train_scaled, y_train)
Test_Accuracy = Model_LR.score(X_test_scaled, y_test)

for split_name, split_accuracy in (('Training', Train_Accuracy), ('Testing', Test_Accuracy)):
    print(f'{split_name} accuracy: {split_accuracy*100:.2f} %')

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Sorted unique labels of the true targets. list(set(...)) has no guaranteed order,
# so it could label the matrix rows and columns in the wrong order.
display_labels = np.unique(y_test)

# Pass the same label order to confusion_matrix and to the display so they always agree.
CM = confusion_matrix(y_true=y_test, y_pred=y_pred_LR, labels=display_labels)
ConfusionMatrixDisplay(CM, display_labels=display_labels).plot()
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Logistic Regression evaluation on the test split
# (the metric functions were imported by earlier cells).

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
Accuracy_LR = accuracy_score(y_test, y_pred_LR)
print(f'➤➤➤ Accuracy Score : {Accuracy_LR * 100 : .2f} %\n')


# Precision = TP / (TP + FP)
Precision_LR = precision_score(y_test, y_pred_LR)
print(f'➤➤➤ Precision Score : {Precision_LR * 100 : .2f} %\n')


# Recall = TP / (TP + FN)
Recall_LR = recall_score(y_test, y_pred_LR)
print(f'➤➤➤ Recall Score : {Recall_LR * 100 : .2f} %\n')


# F1 Score = 2 × ((Precision * Recall) / (Precision + Recall))
F1_Score_LR = f1_score(y_test, y_pred_LR)
print(f'➤➤➤ F1 Score : {F1_Score_LR * 100 : .2f} %\n')


# ROC AUC is computed from a continuous score, so pass the predicted probability of
# the positive class (column 1 == Model_LR.classes_[1]) rather than thresholded labels.
y_score_LR = Model_LR.predict_proba(X_test_scaled)[:, 1]
ROC_AUC_LR = roc_auc_score(y_test, y_score_LR)
print(f'➤➤➤ AUC_ROC : {ROC_AUC_LR * 100 : .2f} %\n')

In [None]:
# Logistic Regression metrics side by side.
# A pie chart rescales the values to shares of their sum, so its percentage labels
# are not the metric values. Bars on a 0-1 axis show each metric directly.
Scores = [Accuracy_LR, Precision_LR, Recall_LR, F1_Score_LR, ROC_AUC_LR]
Score_Names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC']

# Plot
fig, ax = plt.subplots(figsize=(7, 5))
bars = ax.bar(Score_Names, Scores, color=['blue', 'orange', 'green', 'red', 'purple'])

# Write the actual metric value (as a percentage) above each bar.
for bar, score in zip(bars, Scores):
    ax.text(bar.get_x() + bar.get_width() / 2, score + 0.01, f'{score * 100:.2f}%',
            ha='center', va='bottom')

ax.set_ylim(0, 1.1)  # metrics lie in [0, 1]; headroom for the value labels
ax.set_title('Logistic Regression Evaluation Metrics')
ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Test accuracy of each model as a fraction, keyed by display name, in table order.
accuracy_by_model = {
    'Decision Tree': Accuracy_DT,
    'Naive Bayes': Accuracy_NB,
    'Logistic Regression': Accuracy_LR,
    'Random forest': Accuracy_RF,
}

# One row per model with its accuracy expressed as a percentage.
evaluation = pd.DataFrame({
    'Model': list(accuracy_by_model),
    'Accuracy': [accuracy * 100 for accuracy in accuracy_by_model.values()],
})

evaluation