### Libraries, functions etc.

In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.special import expit

import measures
from model import LogReg
from preprocessing import Preprocessor

In [11]:
# Hyper-parameters shared by the hand-written logistic-regression optimisers.
lr = 0.01
n_epochs = 1000

# Project LogReg estimators, keyed by a short label for the optimiser in use.
lr_models = dict(
    GD=LogReg(
        optimization='Gradient Descent',
        learning_rate=lr,
        epochs=n_epochs,
        batch_size=32,
    ),
    SGD=LogReg(
        optimization='Stochastic Gradient Descent',
        learning_rate=lr,
        epochs=n_epochs,
    ),
    # No learning rate is passed for IRLS.
    IRLS=LogReg(
        optimization='Iterative Reweighted Least Squares',
        epochs=n_epochs,
    ),
    # The ADAM learning rate is spelled out literally rather than taken from `lr`.
    ADAM=LogReg(
        optimization='Adaptive Moment Estimation',
        epochs=n_epochs,
        learning_rate=0.01,
        beta_1=0.9,
        beta_2=0.99,
        epsilon=1e-8,
    ),
)

# Reference classifiers from scikit-learn used for comparison.
sklearn_models = dict(
    LDA=LinearDiscriminantAnalysis(),
    QDA=QuadraticDiscriminantAnalysis(),
    LR=LogisticRegression(max_iter=1000),
    kNN=KNeighborsClassifier(),
)

### Loading data

In [12]:
# Read the WDBC table from the project's data directory.
wdbc_df = pd.read_csv('data/wdbc.csv')

# Boolean target: True where the diagnosis label equals "M".
y_wdbc = wdbc_df['diagnosis'].eq("M")

# Feature matrix: every column except the identifier, the label and the
# "Unnamed: 32" column.
X_wdbc = wdbc_df.drop(columns=["id", "diagnosis", "Unnamed: 32"])

# Project preprocessing helper reused by the cells below.
prep_wdbc = Preprocessor()

In [13]:
# Split features and target into train/test parts with the project Preprocessor.
# NOTE(review): split ratio and seeding are decided inside Preprocessor — confirm
# the split is reproducible across kernel restarts.
(
    wdbc_train,
    wdbc_test,
    y_wdbc_train,
    y_wdbc_test,
) = prep_wdbc.train_test_split(X_wdbc, y_wdbc)

In the task requirements we're asked to remove collinear variables, so we're left with only the first 4 options. The GD and SGD results for these 4 are very similar (close to 63%). That's why I would suggest taking the one where the remaining 2 algorithms achieve the best results (which in this case is no scaling and no target balancing). The remaining tests will be performed in this particular situation.

In [22]:
# Preprocessing switches for this experiment.
# NOTE(review): the accompanying text describes the chosen setup as "no target
# balancing", yet balance_classes is True here — confirm which one is intended.
remove_coll = True
balance_classes = True
scaling = False

# Start from copies of the split so the steps below rebind only the *_exp names.
X_wdbc_train_exp = wdbc_train.copy()
X_wdbc_test_exp = wdbc_test.copy()
y_wdbc_train_exp = y_wdbc_train.copy()

if remove_coll:
    # Fitted on the training part, then applied to the test part.
    X_wdbc_train_exp = prep_wdbc.remove_multicollinearity_fit_transform(X_wdbc_train_exp)
    X_wdbc_test_exp = prep_wdbc.remove_multicollinearity_transform(X_wdbc_test_exp)

if balance_classes:
    # Only the training data is passed through class balancing.
    X_wdbc_train_exp, y_wdbc_train_exp = prep_wdbc.class_balancing(
        X_wdbc_train_exp, y_wdbc_train_exp
    )

if scaling:
    # Scaler statistics come from the training part only.
    s = StandardScaler()
    X_wdbc_train_exp = s.fit_transform(X_wdbc_train_exp)
    X_wdbc_test_exp = s.transform(X_wdbc_test_exp)


7 numerical features left in dataset  0  categorical
Training dataset has now  534 obervations


In [23]:
# Fit every scikit-learn reference model on the prepared training data and
# report accuracy and F1 on the held-out test set.

# Convert the test features once. np.asarray accepts both a DataFrame and the
# ndarray that StandardScaler.transform yields when scaling is switched on,
# whereas calling .to_numpy() would fail on an ndarray.
X_wdbc_test_arr = np.asarray(X_wdbc_test_exp)

for m_name, m in sklearn_models.items():
    m.fit(X_wdbc_train_exp, y_wdbc_train_exp)
    print(m_name)
    # Predict once and reuse the result for both metrics.
    y_wdbc_test_pred = m.predict(X_wdbc_test_arr)
    print('Test acc', measures.accuracy(y_wdbc_test_pred, y_wdbc_test))
    print('Test F1-score', measures.f_measure(y_wdbc_test_pred, y_wdbc_test))

LDA
Test acc 0.916083916083916
Test F1-score 0.88
QDA
Test acc 0.9300699300699301
Test F1-score 0.9038461538461539
LR
Test acc 0.916083916083916
Test F1-score 0.8867924528301887
kNN
Test acc 0.8671328671328671
Test F1-score 0.8155339805825242


Convergence analysis: check how the value of the log-likelihood function depends on the number of iterations for the 4 algorithms above.