### Libraries, functions etc.

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# local project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.special import expit

import sys
sys.path.insert(1, '../')

import measures
from model import LogReg
from preprocessing import Preprocessor
import experiments

### Loading data

In [None]:
# Read the raw transaction table from disk.
etherneum_df = pd.read_csv('data/transaction_dataset.csv')

# Columns removed from the feature matrix: the target itself plus three columns
# that are presumably row identifiers (verify against the dataset description).
to_drop = ['Unnamed: 0', 'Index', 'Address', 'FLAG']

# 'FLAG' is the classification target; everything not dropped is a feature.
y_eth = etherneum_df['FLAG']
X_eth = etherneum_df.drop(to_drop, axis='columns')

# Project preprocessing helper reused by the cells below.
prep_eth = Preprocessor()

In [None]:
# Split features and target into train/test parts with the project's Preprocessor helper.
# NOTE(review): no seed is passed here — confirm that Preprocessor.train_test_split
# fixes its own random state, otherwise every run produces a different split.
X_eth_train, X_eth_test, y_eth_train, y_eth_test = prep_eth.train_test_split(X_eth, y_eth)

In [None]:
# Columns whose missing values are replaced with the constant 1 instead of a
# value learned from the data.
columns_to_binary = [
    ' ERC20 avg time between sent tnx',
    ' ERC20 avg time between rec tnx',
    ' ERC20 avg time between rec 2 tnx',
    ' ERC20 avg time between contract tnx',
    ' ERC20 min val sent contract',
    ' ERC20 max val sent contract',
    ' ERC20 avg val sent contract',
]

# Despite the SimpleImputer available in the preprocessor, using it required too
# much fiddling with dtypes and with the fact that the imputer returns a NumPy
# array rather than a DataFrame, so the imputation is done with pandas here.

# Build one fill value per column. Every value comes from the TRAINING split
# only, and the same values are applied to the test split, so no information
# leaks from test to train.
fill_values = {col: 1 for col in columns_to_binary}
for col in X_eth_train.columns:
    if col in fill_values:
        continue
    # value_counts() ignores NaN and sorts by frequency, so the first index
    # entry is the most frequent observed value of the column.
    observed_counts = X_eth_train[col].value_counts()
    if observed_counts.empty:
        # Fail with a clear message instead of an IndexError on `.index[0]`.
        raise ValueError(
            f"Column {col!r} has no observed values in the training split; "
            "cannot derive a most-frequent fill value."
        )
    fill_values[col] = observed_counts.index[0]

# A dict-valued fillna fills each column with its own value and returns new
# frames, which avoids chained-assignment warnings on the split frames.
X_eth_train = X_eth_train.fillna(value=fill_values)
X_eth_test = X_eth_test.fillna(value=fill_values)


In [None]:
# Switches for the optional preprocessing steps.
remove_coll, balance_classes, scaling = True, True, False

# Work on copies so the imputed splits stay untouched and this cell can be re-run.
X_train = X_eth_train.copy()
X_test = X_eth_test.copy()
y_train = y_eth_train.copy()
y_test = y_eth_test.copy()

if remove_coll:
    # Fitted on the training split, then the same transformation is applied to test.
    X_train = prep_eth.remove_multicollinearity_fit_transform(X_train)
    X_test = prep_eth.remove_multicollinearity_transform(X_test)
if balance_classes:
    # Only the training split is passed to the balancing step.
    X_train, y_train = prep_eth.class_balancing(X_train, y_train)
if scaling:
    # StandardScaler returns a NumPy array and cannot process non-numeric
    # columns, while the one-hot encoding and `.to_numpy()` calls below are
    # applied to these objects afterwards. Therefore only the numeric columns
    # are scaled and the results are written back into the DataFrames.
    # Assumes X_train / X_test are still DataFrames with identical columns
    # at this point — TODO confirm against Preprocessor.
    numeric_cols = X_train.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        s = StandardScaler()
        X_train = X_train.copy()
        X_test = X_test.copy()
        X_train[numeric_cols] = s.fit_transform(X_train[numeric_cols])
        X_test[numeric_cols] = s.transform(X_test[numeric_cols])
# I tried moving the steps above into a preprocess_data function, but it raised
# an error, so they stay inline for now.

# One Hot Encoding (fitted on train, applied to test)
X_train = prep_eth.one_hot_encoding_fit_transform(X_train)
X_test = prep_eth.one_hot_encoding_transform(X_test)

# The models below are trained on plain NumPy arrays.
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

#### 1. Convergence analysis: check how the value of the log-likelihood function depends on the number of iterations for the 4 algorithms (GD, SGD, IRLS and ADAM).

In [None]:
# Hyper-parameters shared by the models in the convergence comparison.
lr = 0.01
n_epochs = 1000

# One LogReg instance per optimisation routine, keyed by a short label.
lr_models = dict(
    GD=LogReg(optimization='Gradient Descent', learning_rate=lr, epochs=n_epochs, batch_size=32),
    SGD=LogReg(optimization='Stochastic Gradient Descent', learning_rate=lr, epochs=n_epochs),
    # IRLS is configured without a learning rate.
    IRLS=LogReg(optimization='Iterative Reweighted Least Squares', epochs=n_epochs),
    # ADAM's learning rate is written out literally; it currently equals `lr`.
    ADAM=LogReg(
        optimization='Adaptive Moment Estimation',
        epochs=n_epochs,
        learning_rate=0.01,
        beta_1=0.9,
        beta_2=0.99,
        epsilon=1e-8,
    ),
)

In [None]:
plt.figure(figsize=(16, 10))

# Train every model on the training split, keep the loss history reported by
# its optimizer, and draw one curve per model on the shared figure.
losses = {}
for model_name, model in lr_models.items():
    model.train(X_train, y_train)
    losses[model_name] = model.get_optimizer_training_losses()
    n_points = len(losses[model_name])
    plt.plot(range(n_points), losses[model_name], label=model_name)

plt.title('All 4 implementations', fontsize='xx-large')
plt.xlabel("Iteration", fontsize='xx-large')
plt.ylabel("Loss", fontsize='xx-large')
plt.legend()
plt.show()

#### 2. Check how the learning rate and other parameters affect the results.

In [None]:
# 11 evenly spaced learning rates, going down from 0.2 to 1e-5.
tested_l_rates = np.linspace(0.2, 1e-5, 11)

# Short label -> optimizer name. IRLS is left out of this experiment.
tested_algorithms = dict(
    GD='Gradient Descent',
    SGD='Stochastic Gradient Descent',
    ADAM='Adaptive Moment Estimation',
)

res_test_learning_rates = experiments.test_learning_rates(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    l_rates=tested_l_rates,
    algorithms=tested_algorithms,
)

# Display the full result object as the cell output.
res_test_learning_rates

In [None]:
# Three best ADAM runs, ranked by accuracy, then F-measure, recall and precision.
# (The original line referenced the misspelled name `res_test_learning_rgfates`,
# which raises NameError on a fresh kernel.)
(
    res_test_learning_rates[res_test_learning_rates['method'] == 'ADAM']
    .sort_values(by=['accuracy', 'F_measure', 'recall', 'precision'], ascending=False)
    .head(3)
)

In [None]:
# Grids of beta_1 (12 values in [0.75, 0.97]) and beta_2 (10 values in [0.90, 0.99]).
tested_betas_1 = np.linspace(0.75, .97, 12)
tested_betas_2 = np.linspace(0.90, 0.99, 10)

res_test_betas = experiments.test_betas(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    tested_betas1=tested_betas_1,
    tested_betas2=tested_betas_2,
)

# Show the ten best rows, ranked by accuracy, then F-measure, recall and precision.
(
    res_test_betas
    .sort_values(by=['accuracy', 'F_measure', 'recall', 'precision'], ascending=False)
    .head(10)
)

#### 4. Compare the classification performance of logistic regression (try all 4 methods: IWLS/IRLS, GD, SGD and ADAM) with 3 popular classification methods: LDA, QDA and KNN. Use the performance measures implemented in Part 2 and the datasets prepared in Part 1. The performance measures should be calculated on the test set. If a given algorithm does not converge within 1000 iterations, stop the algorithm and use the solution from the last iteration.

In [None]:
# Hyper-parameters shared by the custom logistic-regression variants.
lr = 0.001
n_epochs = 1000

# Models entering the final comparison: the four LogReg optimizers first,
# followed by the scikit-learn classifiers.
models = dict(
    GD=LogReg(optimization='Gradient Descent', learning_rate=lr, epochs=n_epochs, batch_size=32),
    SGD=LogReg(optimization='Stochastic Gradient Descent', learning_rate=lr, epochs=n_epochs),
    IRLS=LogReg(optimization='Iterative Reweighted Least Squares', epochs=n_epochs),
    # ADAM's learning rate is written out literally; it currently equals `lr`.
    ADAM=LogReg(
        optimization='Adaptive Moment Estimation',
        epochs=n_epochs,
        learning_rate=1e-3,
        beta_1=0.75,
        beta_2=0.99,
        epsilon=1e-8,
    ),
    LDA=LinearDiscriminantAnalysis(),
    QDA=QuadraticDiscriminantAnalysis(),
    # scikit-learn's logistic regression, capped at the same number of iterations.
    LR=LogisticRegression(max_iter=n_epochs),
    kNN=KNeighborsClassifier(),
)

In [None]:
# Run every model in `models` through the project's final comparison helper.
res_final = experiments.final_comparisson(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
)

# Rank the models by accuracy, then F-measure, recall and precision.
# NOTE(review): this cell sorts by 'f_measure' while the earlier experiment cells
# use 'F_measure' — confirm the column name returned by final_comparisson.
res_final.sort_values(
    by=['accuracy', 'f_measure', 'recall', 'precision'],
    ascending=False,
)