# Trained using Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score,log_loss
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the card-transaction records from the CSV file (path is relative to
# the notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../Dataset/card-transaction-dataset.csv')

In [None]:
# Take the 'Class' column as the target and map its distinct values to
# integer codes with a LabelEncoder.
y = df.loc[:, 'Class']

label_encoder = LabelEncoder().fit(y)
Y = label_encoder.transform(y)

# Quick sanity check: show the encoded labels for rows 500-599.
print(Y[500:600])

In [None]:
# Build the feature matrix: every column except 'Time' and the target 'Class'.
X = df.drop(columns=['Time', 'Class'])

# head must be *called* to preview the first rows; printing the bare
# attribute `X.head` only shows the bound method's repr.
print(X.head())

In [None]:
# Standardize every feature column with StandardScaler; X is rebound to the
# transformed array returned by the scaler.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed later in this file, so held-out rows contribute to the scaling
# statistics -- confirm this is intended.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

In [None]:
# Candidate hyper-parameters for the random-forest grid search:
# features considered per split (1-5) and number of trees.
param_grid = dict(
    max_features=list(range(1, 6)),
    n_estimators=[10, 25, 50, 75, 100],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over every combination in param_grid, scored by negative
# log loss with 5-fold cross-validation on the full (X, Y) data.
model = RF()
gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=5,
)
gs.fit(X, Y)

# Report the parameter combination with the best mean CV score.
print('Best params: ', gs.best_params_)

# Elbow graph
With a parameter like the number of trees in a random forest, increasing the number of trees generally does not hurt predictive performance, although it does increase training time. Adding more trees improves performance up to a point, after which it levels off.

In [None]:
# Candidate forest sizes: every integer from 25 to 100 inclusive.
n_estimators = [*range(25, 101)]
param_grid = dict(n_estimators=n_estimators)

# Re-run the 5-fold grid search over forest size only, with all other
# RandomForestClassifier settings left at their defaults.
new_model = RF()
gs = GridSearchCV(
    estimator=new_model,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=5,
)
gs.fit(X, Y)

In [None]:
# Mean cross-validated score for each candidate evaluated by `gs`.
# Assuming the cells are run in order, `gs` is the n_estimators-only search,
# so there is one score per forest size; the scorer is 'neg_log_loss', so
# values closer to zero are better.
scores = gs.cv_results_['mean_test_score']

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Elbow plot: mean cross-validated score against the number of trees.
plt.plot(n_estimators, scores)
plt.xlabel('n_estimators')
plt.ylabel('scores')

# Pin the x-axis to 25-100 and the y-axis to the observed score extremes.
plt.xlim(left=25, right=100)
plt.ylim(bottom=np.min(scores), top=np.max(scores))

plt.show()

In [None]:
# Split the dataset into train and test chunks: 20% of the rows are held out
# for testing, and the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified; if 'Class' is imbalanced,
# consider passing stratify=Y -- confirm against the data.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=6,
)

In [None]:
# Final model: a 100-tree forest that considers 2 features at each split.
rf_model = RF(max_features=2, n_estimators=100)

# Time only the fit call, using wall-clock timestamps taken around it.
start = datetime.now()
rf_model.fit(X_train, Y_train)
stop = datetime.now()

train_time = stop - start
print("Training duration :", train_time)

In [None]:
# Hard class predictions (encoded labels) from the fitted forest for the
# held-out test rows.
Y_pred = rf_model.predict(X_test)

In [None]:
# Confusion matrix of true test labels against predicted labels.
cf = confusion_matrix(Y_test, Y_pred)
print(cf)

# With the default average=None these are per-class arrays, one entry per label.
precision, recall, fscore, _ = precision_recall_fscore_support(Y_test, Y_pred)

# Log loss is defined on predicted probabilities. Feeding it hard 0/1 labels
# only yields a clipped, uninformative value, so use predict_proba instead.
Y_proba = rf_model.predict_proba(X_test)

print("Performance metrics")
print("Precission :", precision)
print("Recall :", recall)
print("F1-score :", fscore)
print("Accuracy:", accuracy_score(Y_test, Y_pred))
# labels= ties the probability columns to the classes the model was trained on,
# which also keeps the call valid if the test split contains a single class.
print("Log loss :", log_loss(Y_test, Y_proba, labels=rf_model.classes_))

# fmt='d' prints the integer counts in full; the default '.2g' format would
# render large cells in scientific notation.
sns.heatmap(cf, annot=True, fmt='d')