# Fraudulent Cryptocurrency Transaction Detection

This notebook analyzes a cryptocurrency dataset to detect fraudulent transactions.

In [1]:
## Import Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive

## Load Dataset

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

# Path to the pickle file
csv_file_path = '/content/drive/MyDrive/TME/Data/Assignment-dataset.csv'

# Load the Pickle file into a Pandas DataFrame
with open(csv_file_path, 'rb') as file:
    data = pd.read_csv(csv_file_path)

data.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Class,a1,a2,a3,a4,a5,a6,a7,a8,a9,...,a38,a39,a40,a41,a42,a43,a44,a45,a46,a47
0,False,844.26,1093.71,704785.63,721,89,0,40,118,0.0,...,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,False,12709.07,2958.44,1218216.73,94,8,0,5,14,0.0,...,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,False,246194.54,2434.02,516729.3,2,10,0,10,2,0.113119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON
3,False,10219.6,15785.09,397555.9,25,9,0,7,13,0.0,...,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON
4,False,36.61,10707.77,382472.42,4598,20,1,7,19,0.0,...,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 48 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Class   9841 non-null   bool   
 1   a1      9841 non-null   float64
 2   a2      9841 non-null   float64
 3   a3      9841 non-null   float64
 4   a4      9841 non-null   int64  
 5   a5      9841 non-null   int64  
 6   a6      9841 non-null   int64  
 7   a7      9841 non-null   int64  
 8   a8      9841 non-null   int64  
 9   a9      9841 non-null   float64
 10  a10     9841 non-null   float64
 11  a11     9841 non-null   float64
 12  a12     9841 non-null   float64
 13  a13     9841 non-null   float64
 14  a14     9841 non-null   float64
 15  a15     9841 non-null   float64
 16  a16     9841 non-null   float64
 17  a17     9841 non-null   float64
 18  a18     9841 non-null   int64  
 19  a19     9841 non-null   float64
 20  a20     9841 non-null   float64
 21  a21     9841 non-null   float64
 22  

## Handle Missing Values

In [4]:
# Fill missing numerical values with median
for col in data.columns:
    if data[col].dtype in ['float64', 'int64']:
        data[col].fillna(data[col].median(), inplace=True)
    else:
        data[col].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 48 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Class   9841 non-null   bool   
 1   a1      9841 non-null   float64
 2   a2      9841 non-null   float64
 3   a3      9841 non-null   float64
 4   a4      9841 non-null   int64  
 5   a5      9841 non-null   int64  
 6   a6      9841 non-null   int64  
 7   a7      9841 non-null   int64  
 8   a8      9841 non-null   int64  
 9   a9      9841 non-null   float64
 10  a10     9841 non-null   float64
 11  a11     9841 non-null   float64
 12  a12     9841 non-null   float64
 13  a13     9841 non-null   float64
 14  a14     9841 non-null   float64
 15  a15     9841 non-null   float64
 16  a16     9841 non-null   float64
 17  a17     9841 non-null   float64
 18  a18     9841 non-null   int64  
 19  a19     9841 non-null   float64
 20  a20     9841 non-null   float64
 21  a21     9841 non-null   float64
 22  

## Encode Categorical Variables

In [6]:
# Encode categorical features
le = LabelEncoder()
for col in ['a46', 'a47']:
    data[col] = le.fit_transform(data[col])


## Encode Target Variable

In [7]:
# Encode target variable 'Class' from boolean to integer
data['Class'] = data['Class'].astype(int)

## Feature Selection

### 1. Gini Importance (Random Forest)

In [8]:
# Gini importance feature selection
X = data.drop('Class', axis=1)
y = data['Class']

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
importances = pd.Series(rf.feature_importances_, index=X.columns)
gini_features = importances.sort_values(ascending=False).head(10).index.tolist()
gini_data = data[gini_features + ['Class']]
gini_data.to_csv('/content/drive/MyDrive/TME/Data/gini_features.csv', index=False)


### 2. Correlation Method

In [9]:
# Correlation feature selection excluding Class
corr_matrix = data.corr()
top_corr_features = corr_matrix['Class'].abs().sort_values(ascending=False).drop('Class').head(10).index.tolist()
correlation_data = data[top_corr_features + ['Class']]
correlation_data.to_csv('/content/drive/MyDrive/TME/Data/correlation_features.csv', index=False)


### 3. Random manual selection

In [10]:
# Manual selection of features
manual_features = ['a1', 'a4', 'a6', 'a9', 'a31']
manual_data = data[manual_features + ['Class']]
manual_data.to_csv('/content/drive/MyDrive/TME/Data/manual_features.csv', index=False)


## Machine Learning Models

### Define Function to Train and Evaluate Models

In [11]:
def train_evaluate(df, feature_set_name):
    X = df.drop('Class', axis=1)
    y = df['Class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    models = {
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'SVM': SVC(random_state=42)
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = cm.ravel()
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        results[name] = {'TP': tp, 'FP': fp, 'TN': tn, 'FN': fn,
                        'Accuracy': accuracy, 'Precision': precision, 'Recall': recall}

    results_df = pd.DataFrame(results).T
    results_df.to_csv(f'results_{feature_set_name}.csv')
    return results_df


### Train and Evaluate on Gini Features

In [12]:
gini_results = train_evaluate(gini_data, 'gini')
gini_results

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,TP,FP,TN,FN,Accuracy,Precision,Recall
Decision Tree,622.0,28.0,2275.0,28.0,0.981036,0.956923,0.956923
Random Forest,618.0,3.0,2300.0,32.0,0.988148,0.995169,0.950769
SVM,0.0,0.0,2303.0,650.0,0.779885,0.0,0.0


### *Train* and Evaluate on Correlation Features

In [13]:
corr_results = train_evaluate(correlation_data, 'correlation')
corr_results


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,TP,FP,TN,FN,Accuracy,Precision,Recall
Decision Tree,630.0,25.0,2278.0,20.0,0.984761,0.961832,0.969231
Random Forest,620.0,4.0,2299.0,30.0,0.988486,0.99359,0.953846
SVM,0.0,0.0,2303.0,650.0,0.779885,0.0,0.0


### Train and Evaluate on Manual Features

In [14]:
manual_results = train_evaluate(manual_data, 'manual')
manual_results


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,TP,FP,TN,FN,Accuracy,Precision,Recall
Decision Tree,390.0,240.0,2063.0,260.0,0.830681,0.619048,0.6
Random Forest,391.0,166.0,2137.0,259.0,0.856079,0.701975,0.601538
SVM,0.0,0.0,2303.0,650.0,0.779885,0.0,0.0


## Combine All Results

In [15]:
# Add feature set information
manual_results['Feature Set'] = 'Manual'
gini_results['Feature Set'] = 'Gini'
corr_results['Feature Set'] = 'Correlation'

# Combine all results
all_results = pd.concat([manual_results, gini_results, corr_results])
all_results = all_results.reset_index().rename(columns={'index': 'Model'})
all_results


Unnamed: 0,Model,TP,FP,TN,FN,Accuracy,Precision,Recall,Feature Set
0,Decision Tree,390.0,240.0,2063.0,260.0,0.830681,0.619048,0.6,Manual
1,Random Forest,391.0,166.0,2137.0,259.0,0.856079,0.701975,0.601538,Manual
2,SVM,0.0,0.0,2303.0,650.0,0.779885,0.0,0.0,Manual
3,Decision Tree,622.0,28.0,2275.0,28.0,0.981036,0.956923,0.956923,Gini
4,Random Forest,618.0,3.0,2300.0,32.0,0.988148,0.995169,0.950769,Gini
5,SVM,0.0,0.0,2303.0,650.0,0.779885,0.0,0.0,Gini
6,Decision Tree,630.0,25.0,2278.0,20.0,0.984761,0.961832,0.969231,Correlation
7,Random Forest,620.0,4.0,2299.0,30.0,0.988486,0.99359,0.953846,Correlation
8,SVM,0.0,0.0,2303.0,650.0,0.779885,0.0,0.0,Correlation


## Select Best Model

In [16]:
# Find the best model based on Accuracy
best_row = all_results.loc[all_results['Accuracy'].idxmax()]
print(f"Best Model: {best_row['Model']} with feature set {best_row['Feature Set']} having Accuracy: {best_row['Accuracy']:.2f}, Precision: {best_row['Precision']:.2f}, Recall: {best_row['Recall']:.2f}")


Best Model: Random Forest with feature set Correlation having Accuracy: 0.99, Precision: 0.99, Recall: 0.95
