### Libraries, functions etc.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's measures / model / preprocessing / experiments) are
# re-imported before each cell runs and local edits are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.special import expit

import sys
sys.path.insert(1, '../')

import measures
from model import LogReg
from preprocessing import Preprocessor
import experiments

### Loading data

In [3]:
# Raw transaction table, read relative to the notebook's directory.
etherneum_df = pd.read_csv('../data/transaction_dataset.csv')

# Columns kept out of the feature matrix: the target itself plus three
# columns that look like row identifiers (judging by their names).
to_drop = ['Unnamed: 0', 'Index', 'Address', 'FLAG']

# Feature matrix and target vector.
X_eth = etherneum_df.drop(to_drop, axis=1)
y_eth = etherneum_df['FLAG']

# Project preprocessing helper, reused by the cells below.
prep_eth = Preprocessor()

Basic data info:

In [4]:
# Print column names, non-null counts and dtypes of the raw frame.
etherneum_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 51 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Unnamed: 0                                            9841 non-null   int64  
 1   Index                                                 9841 non-null   int64  
 2   Address                                               9841 non-null   object 
 3   FLAG                                                  9841 non-null   int64  
 4   Avg min between sent tnx                              9841 non-null   float64
 5   Avg min between received tnx                          9841 non-null   float64
 6   Time Diff between first and last (Mins)               9841 non-null   float64
 7   Sent tnx                                              9841 non-null   int64  
 8   Received Tnx                                          9841

The dataset contains 51 columns, of which about 45 are usable as variables during modelling. About half of them contain missing values; we deal with these by filling each column with its most frequent value.


In [16]:
# Split features and target into train and test parts with the project
# Preprocessor. Split ratio, shuffling and stratification are decided inside
# Preprocessor.train_test_split and are not visible from this notebook.
X_eth_train, X_eth_test, y_eth_train, y_eth_test = prep_eth.train_test_split(X_eth, y_eth)

The target classes are imbalanced - data balancing may be needed:

In [17]:
# Mean of the training target. Assuming FLAG is a 0/1 label (TODO confirm),
# this is the share of the positive class in the training split.
y_eth_train.mean()

0.22140921409214093

In [18]:
# Most frequent non-missing value of every feature, taken from the training
# split only (value_counts() ignores NaN and sorts by descending count).
fill_values = {
    col: X_eth_train[col].value_counts().index[0]
    for col in X_eth_train.columns
}

# Apply the same training-derived fill value to both splits.
for col, fill_value in fill_values.items():
    X_eth_train[col] = X_eth_train[col].fillna(fill_value)
    X_eth_test[col] = X_eth_test[col].fillna(fill_value)


In [20]:
# Work on copies so the imputed X_eth_* / y_eth_* objects are left untouched.
X_train, X_test = X_eth_train.copy(), X_eth_test.copy()
y_train, y_test = y_eth_train.copy(), y_eth_test.copy()

# Collinearity removal runs before one-hot encoding (fear of the curse of
# dimensionality). Judging by the method names, each step is fitted on the
# training frame and then applied unchanged to the test frame.
X_train = prep_eth.remove_multicollinearity_fit_transform(X_train)
X_test = prep_eth.remove_multicollinearity_transform(X_test)

X_train = prep_eth.one_hot_encoding_fit_transform(X_train)
X_test = prep_eth.one_hot_encoding_transform(X_test)

# NumPy versions of the prepared data, used for the LogReg models below.
X_train_rc, X_test_rc = X_train.to_numpy(), X_test.to_numpy()
y_train_rc, y_test_rc = y_train.to_numpy(), y_test.to_numpy()

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


38 numerical features left in dataset  2  categorical


#### 1. Convergence analysis: check how the value of log-likelihood function depends on the number of iterations for 4 above algorithms.

In [22]:
lr = 0.01
n_epochs = 1000

# One LogReg instance per optimisation method, all with the same epoch limit.
lr_models = {}
lr_models['GD'] = LogReg(optimization='Gradient Descent',
                         learning_rate=lr,
                         epochs=n_epochs,
                         batch_size=32)
lr_models['SGD'] = LogReg(optimization='Stochastic Gradient Descent',
                          learning_rate=lr,
                          epochs=n_epochs)
lr_models['IRLS'] = LogReg(optimization='Iterative Reweighted Least Squares',
                           epochs=n_epochs)
lr_models['ADAM'] = LogReg(optimization='Adaptive Moment Estimation',
                           epochs=n_epochs,
                           learning_rate=0.01,
                           beta_1=0.9,
                           beta_2=0.99,
                           epsilon=1e-8)

In [None]:
# Train every model in lr_models on X_train_rc / y_train_rc and overlay the
# training losses recorded by each optimiser.
plt.figure(figsize=(16, 10))
losses = {}
for model_name, model in lr_models.items():
    model.train(X_train_rc, y_train_rc)
    loss_curve = model.get_optimizer_training_losses()
    losses[model_name] = loss_curve
    iterations = range(len(loss_curve))
    plt.plot(iterations, loss_curve, label=model_name)

label_size = 'xx-large'
plt.title('ETH: All 4 implementations', fontsize=label_size)
plt.xlabel("Iteration", fontsize=label_size)
plt.ylabel("Loss", fontsize=label_size)
plt.legend()
plt.savefig('ETH_conv_01.svg')
plt.show()

#### (1b) Impact of target balancing & data scaling

Only target balancing:

In [None]:
# Rebalance the training classes with the project Preprocessor. The resampling
# strategy lives in Preprocessor.class_balancing and is not visible here.
# Only the training data is balanced; the test data is left as-is.
X_train_rc_balanced, y_train_rc_balanced = prep_eth.class_balancing(X_train_rc,y_train_rc)


In [None]:
lr = 0.01
n_epochs = 1000

# Constructor arguments per optimisation method; fresh (untrained) LogReg
# instances are built from them for the balanced-target experiment.
optimizer_settings = {
    'GD': dict(optimization='Gradient Descent', learning_rate=lr,
               epochs=n_epochs, batch_size=32),
    'SGD': dict(optimization='Stochastic Gradient Descent', learning_rate=lr,
                epochs=n_epochs),
    'IRLS': dict(optimization='Iterative Reweighted Least Squares',
                 epochs=n_epochs),
    'ADAM': dict(optimization='Adaptive Moment Estimation', epochs=n_epochs,
                 learning_rate=0.01, beta_1=0.9, beta_2=0.99, epsilon=1e-8),
}

lr_models_2 = {
    name: LogReg(**settings) for name, settings in optimizer_settings.items()
}

In [None]:
# Train every model in lr_models_2 on the balanced training data and overlay
# the training losses recorded by each optimiser.
plt.figure(figsize=(16, 10))
losses = {}
for model_name, model in lr_models_2.items():
    model.train(X_train_rc_balanced, y_train_rc_balanced)
    loss_curve = model.get_optimizer_training_losses()
    losses[model_name] = loss_curve
    iterations = range(len(loss_curve))
    plt.plot(iterations, loss_curve, label=model_name)

label_size = 'xx-large'
plt.title('ETH: All 4 implementations, target balanced', fontsize=label_size)
plt.xlabel("Iteration", fontsize=label_size)
plt.ylabel("Loss", fontsize=label_size)
plt.legend()
plt.savefig('ETH_conv_02.svg')
plt.show()

Target balancing & data scaling

In [None]:
# Standardise the features. The scaler's statistics are learned on the balanced
# training matrix only and then reused for the test data.
s = StandardScaler()
X_train_scaled = s.fit_transform(X_train_rc_balanced)
# Transform the NumPy test matrix rather than the X_test DataFrame, so the
# scaler sees the same kind of input it was fitted on (X_train_rc_balanced is
# derived from the NumPy X_train_rc). Passing the DataFrame to a scaler fitted
# on a plain array makes scikit-learn warn about unexpected feature names.
X_test_scaled = s.transform(X_test_rc)

In [None]:
# Fresh (untrained) LogReg instances for the balanced + scaled experiment.
# `lr` and `n_epochs` are taken from the earlier configuration cell.
lr_models_3 = {}
lr_models_3['GD'] = LogReg(optimization='Gradient Descent',
                           learning_rate=lr,
                           epochs=n_epochs,
                           batch_size=32)
lr_models_3['SGD'] = LogReg(optimization='Stochastic Gradient Descent',
                            learning_rate=lr,
                            epochs=n_epochs)
lr_models_3['IRLS'] = LogReg(optimization='Iterative Reweighted Least Squares',
                             epochs=n_epochs)
lr_models_3['ADAM'] = LogReg(optimization='Adaptive Moment Estimation',
                             epochs=n_epochs,
                             learning_rate=0.01,
                             beta_1=0.9,
                             beta_2=0.99,
                             epsilon=1e-8)

In [None]:
# Train every model in lr_models_3 on the balanced AND scaled training data and
# overlay the training losses recorded by each optimiser.
plt.figure(figsize=(16, 10))
losses={}
for model_name, model in lr_models_3.items():
    # Scaling only changes the features, so the matching target is the balanced
    # one. (`y_train_scaled` was never defined and raised a NameError.)
    model.train(X_train_scaled, y_train_rc_balanced)
    losses[model_name]=model.get_optimizer_training_losses()
    plt.plot(range(len(losses[model_name])), losses[model_name], label=model_name)
plt.title('ETH: All 4 implementations, target balanced, data scaled',fontsize='xx-large')
plt.xlabel("Iteration",fontsize='xx-large')
plt.ylabel("Loss",fontsize='xx-large')
# The legend must be drawn before saving, otherwise the SVG is written without it.
plt.legend()
plt.savefig('ETH_conv_03.svg')
plt.show()

Comparison of algorithm efficiency:

In [None]:
# Test-set accuracy of every optimiser under the three data treatments:
# no transformation, target balancing, and balancing + scaling.
# Rows are collected as plain records and turned into a DataFrame once,
# instead of growing the frame with pd.concat inside the loop.
comp_records = []
for model_name in lr_models:
    # The models were trained on NumPy arrays, so they are evaluated on the
    # NumPy test arrays as well. Balancing and scaling never touch the test
    # labels, so the same y_test_rc is used for all three variants
    # (`y_test_scaled` was never defined and raised a NameError).
    acc_no_scal = measures.accuracy(lr_models[model_name].predict(X_test_rc), y_test_rc)
    acc_balanced = measures.accuracy(lr_models_2[model_name].predict(X_test_rc), y_test_rc)
    acc_scaled = measures.accuracy(lr_models_3[model_name].predict(X_test_scaled), y_test_rc)
    # The untransformed case is labelled with the string 'None': a real None is
    # treated as a missing hue value by seaborn, which silently drops those bars.
    for transform, acc in (('None', acc_no_scal),
                           ('Balancing', acc_balanced),
                           ('Balancing & scaling', acc_scaled)):
        comp_records.append({'algorithm': model_name,
                             'transform': transform,
                             'accuracy': acc})

comp_df = pd.DataFrame(comp_records, columns=['algorithm', 'transform', 'accuracy'])

In [None]:
# Grouped bar chart: test accuracy per algorithm, one bar per data transformation.
plt.figure(figsize=(16, 10))
sns.barplot(x='algorithm', y='accuracy', data=comp_df, hue='transform')
plt.legend(loc=1, title='Data transformation')
# Accuracy is a fraction, so pin the y-axis to the full [0, 1] range.
plt.ylim(0, 1)
plt.title('ETH: data transformation impact', fontsize='xx-large')
# Axis label typo fixed ('Alogrithm' -> 'Algorithm').
plt.xlabel('Algorithm', fontsize='xx-large')
plt.ylabel('Accuracy on test data', fontsize='xx-large')
plt.savefig('ETH_transformation_impact.svg')
plt.show()

#### 2. Check how the value of learning rate and other parameters affect the results.

In [None]:
# Eleven evenly spaced learning rates, from 0.2 down to 1e-5.
tested_l_rates = np.linspace(0.2, 1e-5, 11)

# Short label -> optimisation name for the methods that take a learning rate.
tested_algorithms = {
    'GD': 'Gradient Descent',
    'SGD': 'Stochastic Gradient Descent',
    'ADAM': 'Adaptive Moment Estimation',
}

# The sweep itself is implemented in the project's experiments module.
res_test_learning_rates = experiments.test_learning_rates(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    l_rates=tested_l_rates,
    algorithms=tested_algorithms,
)

res_test_learning_rates

In [None]:
# Three best ADAM runs from the learning-rate sweep, ranked by accuracy and
# then F-measure, recall and precision as tie-breakers.
# (Fixes the misspelled name `res_test_learning_rgfates`, which raised a NameError.)
res_test_learning_rates[res_test_learning_rates['method']=='ADAM'].sort_values(by=['accuracy','F_measure','recall','precision'],ascending=False).head(3)

In [None]:
# Grid for the two ADAM decay parameters: 12 values of beta_1 in [0.75, 0.97]
# and 10 values of beta_2 in [0.90, 0.99].
tested_betas_1 = np.linspace(0.75, .97, 12)
tested_betas_2 = np.linspace(0.90, 0.99, 10)

# The grid evaluation itself is implemented in the project's experiments module.
res_test_betas = experiments.test_betas(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    tested_betas1=tested_betas_1,
    tested_betas2=tested_betas_2,
)

# Ten best combinations, ranked by accuracy with the other measures as tie-breakers.
ranking_columns = ['accuracy', 'F_measure', 'recall', 'precision']
res_test_betas.sort_values(by=ranking_columns, ascending=False).head(10)

#### 4. Compare the classification performance of logistic regression (try all 4 methods: IWLS, GD, SGD and ADAM) and 3 popular classification methods: LDA, QDA and KNN. Use the performance measures implemented in Part 2 and datasets prepared in Part 1. The performance measures should be calculated on test set. If the given algorithm does not converge, within 1000 iterations, stop the algorithm and use the solutions from the last iteration.

In [None]:
lr = 0.001
n_epochs = 1000

# The four project LogReg variants first ...
models = {}
models['GD'] = LogReg(optimization='Gradient Descent',
                      learning_rate=lr,
                      epochs=n_epochs,
                      batch_size=32)
models['SGD'] = LogReg(optimization='Stochastic Gradient Descent',
                       learning_rate=lr,
                       epochs=n_epochs)
models['IRLS'] = LogReg(optimization='Iterative Reweighted Least Squares',
                        epochs=n_epochs)
models['ADAM'] = LogReg(optimization='Adaptive Moment Estimation',
                        epochs=n_epochs,
                        learning_rate=1e-3,
                        beta_1=0.75,
                        beta_2=0.99,
                        epsilon=1e-8)

# ... followed by the scikit-learn reference classifiers.
models['LDA'] = LinearDiscriminantAnalysis()
models['QDA'] = QuadraticDiscriminantAnalysis()
models['LR'] = LogisticRegression(max_iter=n_epochs)
models['kNN'] = KNeighborsClassifier()

In [None]:
# Evaluate every entry of `models` through the project's experiments module.
# How the models are fitted and which measures are returned is defined in
# experiments.final_comparisson and is not visible from this notebook.
res_final=experiments.final_comparisson(X_train=X_train,
                                        y_train=y_train,
                                        X_test=X_test,
                                        y_test=y_test,
                                        models=models)

# Rank the models by accuracy, using the other measures as tie-breakers.
# NOTE(review): this cell sorts by 'f_measure' (lower case), while the earlier
# sweeps sort by 'F_measure' — confirm the column name that final_comparisson
# actually returns, otherwise sort_values raises a KeyError.
res_final.sort_values(by=['accuracy','f_measure','recall','precision'],ascending=False)