## PERFORMANCE IN CLASSIFIERS

In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Optional download step, left disabled: when uncommented, each line runs the
# `gdown` command-line tool on one Google Drive file ID.
# NOTE(review): presumably these five files are the CSVs read by the cells
# below — confirm before relying on them.
#!gdown https://drive.google.com/uc?id=1emftJouxZQrMESgqmO2Wg2vKknWtQfG1
#!gdown https://drive.google.com/uc?id=1hfObv_kxZDCZkL-uC2NSwra3gCpHMuaH
#!gdown https://drive.google.com/uc?id=15KbdNEUvv-f3u_TdWMINl2vTyxY8NLHF
#!gdown https://drive.google.com/uc?id=1iXVQ1e9OGdDT8xem6DoNNwm_yBE3XD2u
#!gdown https://drive.google.com/uc?id=17AD72lSaGJeQQBmKkvPr-vhO_eO3roLT

### CLASSIFICATION MEAN_APPROACH (STATISTICAL)

In [4]:
# Load mean_approach.csv and hold out 20% of its rows for evaluation.
mean_approach = pd.read_csv('mean_approach.csv')

# The target is the "isFraud" column; every other column is used as a feature.
y = mean_approach["isFraud"]
# Select the column by keyword: passing the axis positionally
# (`drop("isFraud", 1)`) emits a FutureWarning on pandas 1.x and is a
# TypeError on pandas >= 2.0.
X = mean_approach.drop(columns="isFraud")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=6969)

# Per-class row counts of each split, to make the class balance visible.
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

  after removing the cwd from sys.path.


Training target statistics: Counter({0.0: 106375, 1.0: 9011})
Testing target statistics: Counter({0.0: 26540, 1.0: 2307})


In [5]:
# Train a depth-2 random forest on the training split (fit() returns the
# estimator itself, so construction and fitting can be chained), then print
# the per-class precision/recall/F1 on the held-out split.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96     26540
         1.0       0.89      0.16      0.27      2307

    accuracy                           0.93     28847
   macro avg       0.91      0.58      0.62     28847
weighted avg       0.93      0.93      0.91     28847



### CLASSIFICATION MODE_APPROACH (STATISTICAL)

In [7]:
# Load mode_approach.csv and hold out 20% of its rows for evaluation.
mode_approach = pd.read_csv('mode_approach.csv')

# The target is the "isFraud" column; every other column is used as a feature.
y = mode_approach["isFraud"]
# Select the column by keyword: passing the axis positionally
# (`drop("isFraud", 1)`) emits a FutureWarning on pandas 1.x and is a
# TypeError on pandas >= 2.0.
X = mode_approach.drop(columns="isFraud")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Per-class row counts of each split, to make the class balance visible.
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

  after removing the cwd from sys.path.


Training target statistics: Counter({0.0: 106303, 1.0: 9083})
Testing target statistics: Counter({0.0: 26612, 1.0: 2235})


In [8]:
# Same shallow random forest as before, refitted on the current split;
# fit() returns the fitted estimator, so it is chained onto the constructor.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97     26612
         1.0       0.90      0.17      0.29      2235

    accuracy                           0.93     28847
   macro avg       0.92      0.59      0.63     28847
weighted avg       0.93      0.93      0.91     28847



### CLASSIFICATION UNDERCOMPLETE

In [10]:
# Load pd_undercomplete.csv and hold out 20% of its rows for evaluation.
undercomplete = pd.read_csv('pd_undercomplete.csv')

# Snap "isFraud" to the nearest integer and take the absolute value to obtain
# the class labels. Series.round() is the vectorised equivalent of the
# element-wise .apply(np.round) (both round half to even).
# NOTE(review): presumably the column holds non-exact reconstructed values
# rather than clean 0/1 labels — confirm with whoever produced this file.
y = undercomplete["isFraud"].round().abs()
# Select the column by keyword: passing the axis positionally
# (`drop("isFraud", 1)`) emits a FutureWarning on pandas 1.x and is a
# TypeError on pandas >= 2.0.
X = undercomplete.drop(columns="isFraud")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Per-class row counts of each split, to make the class balance visible.
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

  after removing the cwd from sys.path.


Training target statistics: Counter({0.0: 106362, 1.0: 9024})
Testing target statistics: Counter({0.0: 26625, 1.0: 2222})


In [11]:
# Fit the depth-2 random forest on the current training split (constructor
# and fit() chained, since fit() returns the estimator) and score it on the
# held-out split.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99     26625
         1.0       1.00      0.64      0.78      2222

    accuracy                           0.97     28847
   macro avg       0.99      0.82      0.88     28847
weighted avg       0.97      0.97      0.97     28847



### CLASSIFICATION REGULARISED

In [13]:
# Load pd_regularized.csv and hold out 20% of its rows for evaluation.
regularized = pd.read_csv('pd_regularized.csv')

# Snap "isFraud" to the nearest integer and take the absolute value to obtain
# the class labels. Series.round() is the vectorised equivalent of the
# element-wise .apply(np.round) (both round half to even).
# NOTE(review): presumably the column holds non-exact reconstructed values
# rather than clean 0/1 labels — confirm with whoever produced this file.
y = regularized["isFraud"].round().abs()
# Select the column by keyword: passing the axis positionally
# (`drop("isFraud", 1)`) emits a FutureWarning on pandas 1.x and is a
# TypeError on pandas >= 2.0.
X = regularized.drop(columns="isFraud")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Per-class row counts of each split, to make the class balance visible.
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

  after removing the cwd from sys.path.


Training target statistics: Counter({0.0: 106310, 1.0: 9076})
Testing target statistics: Counter({0.0: 26615, 1.0: 2232})


In [14]:
# Shallow (max_depth=2) random forest with a fixed seed, fitted on the current
# training split; fit() hands back the estimator, so the calls are chained.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Report precision / recall / F1 per class on the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98     26615
         1.0       0.96      0.46      0.62      2232

    accuracy                           0.96     28847
   macro avg       0.96      0.73      0.80     28847
weighted avg       0.96      0.96      0.95     28847



### CLASSIFICATION VARIATIONAL

In [16]:
# Load pd_variational.csv, discard every row that contains a missing value,
# and hold out 20% of the remaining rows for evaluation.
variational = pd.read_csv('pd_variational.csv').dropna()

# Snap "isFraud" to the nearest integer and take the absolute value to obtain
# the class labels. Series.round() is the vectorised equivalent of the
# element-wise .apply(np.round) (both round half to even).
# NOTE(review): presumably the column holds non-exact reconstructed values
# rather than clean 0/1 labels — confirm with whoever produced this file.
y = variational["isFraud"].round().abs()
# Select the column by keyword: passing the axis positionally
# (`drop("isFraud", 1)`) emits a FutureWarning on pandas 1.x and is a
# TypeError on pandas >= 2.0.
X = variational.drop(columns="isFraud")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Per-class row counts of each split, to make the class balance visible.
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

  


Training target statistics: Counter({0.0: 106809, 1.0: 9086})
Testing target statistics: Counter({0.0: 26725, 1.0: 2249})


In [17]:
# Refit the depth-2 random forest on the current training split and evaluate
# it on the held-out rows (fit() returns the estimator, hence the chaining).
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98     26725
         1.0       0.94      0.46      0.62      2249

    accuracy                           0.96     28974
   macro avg       0.95      0.73      0.80     28974
weighted avg       0.96      0.96      0.95     28974



# LDA vs PCA

In [24]:
# Load pd_undercomplete.csv for the LDA vs PCA comparison and hold out 20% of
# its rows for evaluation.
data = pd.read_csv('pd_undercomplete.csv')

# Snap "isFraud" to the nearest integer and take the absolute value to obtain
# the class labels. Series.round() is the vectorised equivalent of the
# element-wise .apply(np.round) (both round half to even).
# NOTE(review): presumably the column holds non-exact reconstructed values
# rather than clean 0/1 labels — confirm with whoever produced this file.
y = data["isFraud"].round().abs()
# Select the column by keyword: passing the axis positionally
# (`drop("isFraud", 1)`) emits a FutureWarning on pandas 1.x and is a
# TypeError on pandas >= 2.0.
X = data.drop(columns="isFraud")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Per-class row counts of each split, to make the class balance visible.
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

  after removing the cwd from sys.path.


Training target statistics: Counter({0.0: 106362, 1.0: 9024})
Testing target statistics: Counter({0.0: 26625, 1.0: 2222})


## LDA

In [36]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Use LDA with default settings directly as the classifier: fit it on the
# training split (fit() returns the estimator, so the calls are chained) and
# predict the held-out rows.
clf = LinearDiscriminantAnalysis().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     26625
         1.0       1.00      0.96      0.98      2222

    accuracy                           1.00     28847
   macro avg       1.00      0.98      0.99     28847
weighted avg       1.00      1.00      1.00     28847



## PCA

In [37]:
from sklearn.decomposition import PCA

# Learn a 50-component projection from the training split only, so the
# held-out rows do not influence the fitted components.
pca = PCA(n_components=50).fit(X_train)
# Fraction of the training variance retained by the 50 components.
print(sum(pca.explained_variance_ratio_))
# Project both splits with the same fitted transform.
X_tra, X_te = pca.transform(X_train), pca.transform(X_test)

0.9998174178415913


In [38]:
# Depth-2 random forest trained on the PCA-projected training features and
# evaluated on the PCA-projected held-out features (fit() returns the
# estimator, so it is chained onto the constructor).
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_tra, y_train)
y_pred = clf.predict(X_te)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96     26625
         1.0       1.00      0.12      0.22      2222

    accuracy                           0.93     28847
   macro avg       0.97      0.56      0.59     28847
weighted avg       0.94      0.93      0.91     28847



In [None]:
# With classes this imbalanced, recall on the fraud class (1.0) is the most
# informative metric. On the same split, LDA reaches 0.96 recall against 0.12
# for PCA followed by the random forest, so LDA is the more suitable of the
# two methods here.