In [1]:
# Notebook setup: third-party imports, project-local helpers, and module reloading.
# NOTE(review): three wildcard imports follow; a later `import *` silently shadows any
# same-named symbol from an earlier one, so the order of these lines is significant.
from helpers import *
import numpy as np
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

from implementations import *
from utils import *
import utils
import preprocess
import importlib
# Explicitly re-import the local modules so edits made after the kernel started take effect.
# Names already bound by `from utils import *` above are not rebound by reload() itself.
importlib.reload(utils)
importlib.reload(preprocess)
# autoreload mode 2 reloads all imported modules before each cell executes.
# NOTE(review): this probably makes the two reload() calls above redundant — confirm.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the dataset found under "data/dataset" and unpack the five returned objects.
# Judging by the names: train/test feature matrices, training labels, and the sample ids
# of each split. load_csv_data comes from one of the wildcard imports (presumably
# `helpers`) — TODO confirm the exact shapes and label encoding there.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("data/dataset")

In [14]:
# One 5-fold cross-validation run of the 'least_squares' method on the training data.
# The two returned values are bound to avg_accuracy and avg_f1.
# Keyword arguments are grouped by apparent purpose (judged from their names); their
# exact semantics are defined by cross_validate, which is not part of this notebook.
avg_accuracy, avg_f1 = cross_validate(
    x_train, y_train,
    k=5, method='least_squares',
    # Model / optimiser settings.
    lambda_=0.1, initial_w=None, max_iters=1000, gamma=0.0001,
    decision_threshold=0.2,
    # Missing-value handling and feature filtering.
    replace_nan_by=-1, column_nan_threshold=0.01, row_nan_threshold=1,
    continuous_threshold=0,
    # Scaling and outlier handling.
    normalization_method='standardize',
    outliers=None, z_score_threshold=3, max_false_percentage=0.3,
    # Class re-balancing.
    balance_method='random_upsampling', target_minority_ratio=0.2, noise_ratio=0.02,
    # Feature construction / dimensionality reduction.
    add_bias=False, pca_ratio=0.95,
    # Logging: print the preprocessing steps and the per-fold results.
    preprocess_verbose=True, cross_validation_verbose=True,
)

Preprocessing for fold 1:

Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Data standardized using z-score scaling.
Original sizes:
  - Majority class (0): 239373 (91.19%)
  - Minority class (1): 23135 (8.81%)
Minority class upsampled:
  - Class (0): 239373 (80.00%)
  - Class (1): 59843 (20.00%)
PCA performed to reduce features from 315 to 299.

Results for fold 1:
Method: least_squares
Accuracy: 0.8723
F1 Score: 0.4181

Preprocessing for fold 2:

Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Data standardized using z-score scaling.
Original sizes:
  - Majority class (0): 239386 (91.19%)
  - Minority class (1): 23122 (8.81%)
Minority class upsampled:
  - Class (0): 239386 (80.00%)
  - Class (1): 59846 (20.00%)
PCA performed to reduce features from 315 to 299.

Results for fold 2:
Method: least_squares
Accuracy: 0.8717
F1 Score: 0.4158

Preprocessing for 

In [15]:
# Fit the 'least_squares' method on the full training set and predict on the test set.
# The call is the cell's last expression, so its return value is displayed as the cell output.
# Keyword arguments are grouped by apparent purpose (judged from their names); their
# exact semantics are defined by predict_with_method, which is not part of this notebook.
# NOTE(review): gamma is 0.01 here but 0.0001 in the cross-validation cell — confirm
# whether 'least_squares' reads gamma at all, and align the two values if it does.
predict_with_method(
    # Data.
    x_train=x_train, y_train=y_train,
    x_test=x_test, test_ids=test_ids,
    # Missing-value handling and feature filtering.
    replace_nan_by=-1, column_nan_threshold=0.01, row_nan_threshold=1,
    continuous_threshold=0,
    # Scaling and outlier handling.
    normalization_method='standardize',
    outliers=None, z_score_threshold=3, max_false_percentage=0.3,
    # Class re-balancing.
    balance_method='random_upsampling', target_minority_ratio=0.2, noise_ratio=0.02,
    # Feature construction / dimensionality reduction.
    add_bias=False, pca_ratio=0.95,
    # Model / optimiser settings.
    method='least_squares',
    lambda_=0.1, initial_w=None, max_iters=1000, gamma=0.01,
    decision_threshold=0.2,
    # Logging.
    preprocess_verbose=True,
)

Preprocessing data...
Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Data standardized using z-score scaling.
Original sizes:
  - Majority class (0): 299160 (91.17%)
  - Minority class (1): 28975 (8.83%)
Minority class upsampled:
  - Class (0): 299160 (80.00%)
  - Class (1): 74790 (20.00%)
PCA performed to reduce features from 315 to 299.
Number of predicted labels -1: 95028
Number of predicted labels 1: 14351


array([[328135,     -1],
       [328136,     -1],
       [328137,     -1],
       ...,
       [437511,     -1],
       [437512,      1],
       [437513,     -1]])

In [13]:
# Least squares hyperparameter tuning
#
# Exhaustive grid search: every combination in `param_grid` is scored with 2-fold
# cross-validation, and the best combination is tracked separately for average
# accuracy and for average F1.

import itertools  # imported here so the cell runs on its own

# Candidate values for each cross_validate keyword argument.
# Key order defines iteration order: itertools.product varies the LAST key fastest,
# which reproduces the order of the equivalent nested for-loops.
param_grid = {
    'replace_nan_by': [-1],
    'column_nan_threshold': [None],
    'row_nan_threshold': [None],
    'continuous_threshold': [None],
    'normalization_method': ['standardize'],
    'outliers': [None],
    'z_score_threshold': [None],
    'max_false_percentage': [None],
    'balance_method': ['random_upsampling'],
    'target_minority_ratio': [0.2, 0.4, 0.6, 0.8],
    'noise_ratio': [None],
    'add_bias': [False],
    'pca_ratio': [0.95],
    # 31 thresholds from 0.00 to 0.30 inclusive. linspace fixes the number of points
    # explicitly; np.arange with a float step has an endpoint-sensitive length.
    'decision_threshold': np.linspace(0.0, 0.30, 31),
}

# Start below any attainable score so the first evaluated combination is always
# recorded, even if every score turns out to be exactly 0.
best_avg_accuracy = float('-inf')
best_avg_f1 = float('-inf')

# Best parameters storage (all values None until a combination has been evaluated)
best_params_accuracy = dict.fromkeys(param_grid)
best_params_f1 = dict.fromkeys(param_grid)

# Iterate through parameter combinations
for combination in itertools.product(*param_grid.values()):
    params = dict(zip(param_grid, combination))

    # Call cross-validation
    avg_accuracy, avg_f1 = cross_validate(
        x_train,
        y_train,
        k=2,
        method='least_squares',
        preprocess_verbose=False,
        cross_validation_verbose=True,
        **params,
    )

    # Update best average accuracy (strict '>' keeps the first combination on ties)
    if avg_accuracy > best_avg_accuracy:
        best_avg_accuracy = avg_accuracy
        best_params_accuracy = dict(params)

    # Update best average F1 score (strict '>' keeps the first combination on ties)
    if avg_f1 > best_avg_f1:
        best_avg_f1 = avg_f1
        best_params_f1 = dict(params)

# Print results
print("Best Average Accuracy:", best_avg_accuracy)
print("Best Parameters for Accuracy:", best_params_accuracy)
print("Best Average F1 Score:", best_avg_f1)
print("Best Parameters for F1 Score:", best_params_f1)


Results for fold 1:
Method: least_squares
Accuracy: 0.6723
F1 Score: 0.3232

Results for fold 2:
Method: least_squares
Accuracy: 0.6738
F1 Score: 0.3202

Method: least_squares
Average accuracy: 0.6730, Average F1-Score: 0.3217

Results for fold 1:
Method: least_squares
Accuracy: 0.6864
F1 Score: 0.3303

Results for fold 2:
Method: least_squares
Accuracy: 0.6880
F1 Score: 0.3270

Method: least_squares
Average accuracy: 0.6872, Average F1-Score: 0.3286

Results for fold 1:
Method: least_squares
Accuracy: 0.7005
F1 Score: 0.3374

Results for fold 2:
Method: least_squares
Accuracy: 0.7016
F1 Score: 0.3341

Method: least_squares
Average accuracy: 0.7011, Average F1-Score: 0.3357

Results for fold 1:
Method: least_squares
Accuracy: 0.7146
F1 Score: 0.3448

Results for fold 2:
Method: least_squares
Accuracy: 0.7153
F1 Score: 0.3412

Method: least_squares
Average accuracy: 0.7150, Average F1-Score: 0.3430

Results for fold 1:
Method: least_squares
Accuracy: 0.7277
F1 Score: 0.3518

Results for