In [106]:
from helpers import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from implementations import *
from utils import *
import utils
import preprocess
import importlib
# Re-execute the two project modules so that edits made to utils.py and
# preprocess.py since the last import are picked up by this kernel.
# NOTE(review): names already bound by the `from utils import *` line above are
# not rebound by importlib.reload; presumably the autoreload extension below is
# what keeps them current - confirm, then consider dropping the manual reloads.
importlib.reload(utils)
importlib.reload(preprocess)
# Enable IPython's autoreload extension. Mode 2 reloads all modules before each
# cell is executed. Re-running this cell in a live kernel prints the
# "already loaded" notice shown in the output below.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the project data from "data/dataset" (path relative to the notebook's
# working directory). The helper returns five objects, unpacked here as the
# train/test inputs, the training labels and the train/test ids.
# NOTE(review): array shapes and dtypes are not visible from this notebook -
# confirm against load_csv_data in helpers.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("data/dataset")

In [110]:
# Baseline run: 5-fold cross-validation of the least-squares method with one
# fixed preprocessing configuration. Both verbose flags are on, so the
# preprocessing log and the per-fold metrics are printed for every fold.

# Options describing how the data is prepared before fitting.
cv_preprocessing = {
    'replace_nan_by': -1,
    'column_nan_threshold': 0.01,
    'row_nan_threshold': 0.01,
    'continuous_threshold': 0,
    'normalization_method': 'standardize',
    'outliers': 'remove',
    'z_score_threshold': 3,
    'max_false_percentage': 0.3,
    'balance_method': None,
    'target_minority_ratio': 0.6,
    'add_bias': False,
    'pca_components': 50,
}

# Optimiser-related settings.
# NOTE(review): presumably only consulted by iterative / regularised methods,
# not by plain least squares - confirm in utils.cross_validate.
cv_solver = {
    'lambda_': 0.1,
    'initial_w': None,
    'max_iters': 1000,
    'gamma': 0.01,
}

avg_accuracy, avg_f1 = cross_validate(
    x_train,
    y_train,
    k=5,
    method='least_squares',
    preprocess_verbose=True,
    cross_validation_verbose=True,
    **cv_preprocessing,
    **cv_solver,
)

Preprocessing for fold 1:

Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Data standardized using z-score scaling.
Removed 0 outliers (z-score > 3).
PCA performs to reach 50 components.

Results for fold 1:
Method: least_squares
Accuracy: 0.6019
F1 Score: 0.2877

Preprocessing for fold 2:

Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Data standardized using z-score scaling.
Removed 0 outliers (z-score > 3).
PCA performs to reach 50 components.

Results for fold 2:
Method: least_squares
Accuracy: 0.6032
F1 Score: 0.2884

Preprocessing for fold 3:

Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Data standardized using z-score scaling.
Removed 0 outliers (z-score > 3).
PCA performs to reach 50 components.

Results for fold 3:
Method: least_squares
Accuracy: 0.6042
F1 Score: 0.2862

Preproc

In [108]:
# Fit least squares on the full training set and predict labels for the test
# set. The call is deliberately the last expression of the cell so that the
# returned array of (id, predicted label) rows is displayed.

# Optimiser-related settings.
# NOTE(review): presumably unused by plain least squares - confirm in
# predict_with_method.
submission_solver = {
    'lambda_': 0.1,
    'initial_w': None,
    'max_iters': 1000,
    'gamma': 0.01,
}

# Options describing how the data is prepared before fitting.
submission_preprocessing = {
    'replace_nan_by': -1,
    'column_nan_threshold': 0.01,
    'row_nan_threshold': 0.01,
    'continuous_threshold': 0,
    'normalization_method': 'standardize',
    'outliers': 'remove',
    'z_score_threshold': 3,
    'max_false_percentage': 0.3,
    'balance_method': None,
    'target_minority_ratio': 0,
    'add_bias': False,
    'pca_components': 50,
}

predict_with_method(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    test_ids=test_ids,
    method='least_squares',
    preprocess_verbose=True,
    **submission_solver,
    **submission_preprocessing,
)

Preprocessing data...
Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Data standardized using z-score scaling.
Removed 0 outliers (z-score > 3).
PCA performs to reach 50 components.
Number of predicted labels -1: 58343
Number of predicted labels 1: 51036


array([[328135,      1],
       [328136,     -1],
       [328137,      1],
       ...,
       [437511,      1],
       [437512,      1],
       [437513,      1]])

In [98]:
# Least squares hyperparameter tuning
#
# Exhaustive grid search over the preprocessing options: every combination is
# scored with 5-fold cross-validation and the best configuration is tracked
# separately for average accuracy and for average F1 score.
import itertools

# Candidate values for each preprocessing option. Key order defines the
# iteration order (the last key varies fastest) and the key order of the
# reported best-parameter dicts. The grid below has 10,752 combinations, each
# evaluated on 5 folds, so this cell is expensive to run.
param_grid = {
    'replace_nan_by': [None, -1],
    'column_nan_threshold': [0.01, 0.05],
    'row_nan_threshold': [0.01, 0.05],
    'continuous_threshold': [0],
    'normalization_method': ['standardize', 'normalize'],
    'outliers': ['remove', 'clip'],
    'z_score_threshold': [3],
    'max_false_percentage': [0.1, 0.3, 0.5],
    'balance_method': ['downsampling', 'upsampling'],
    'target_minority_ratio': [0.25, 0.5, 0.75, 1],
    'add_bias': [False, True],
    'pca_components': [10, 20, 30, 40, 50, 60, 70],
}

best_avg_accuracy = 0
best_avg_f1 = 0

# Best parameters storage: every option starts as None until a combination
# scores above 0.
best_params_accuracy = dict.fromkeys(param_grid)
best_params_f1 = dict.fromkeys(param_grid)

# Iterate through parameter combinations
for combination in itertools.product(*param_grid.values()):
    params = dict(zip(param_grid, combination))

    # Call cross-validation
    avg_accuracy, avg_f1 = cross_validate(
        x_train,
        y_train,
        k=5,
        method='least_squares',
        preprocess_verbose=False,
        cross_validation_verbose=True,
        **params,
    )

    # Update best average accuracy (strict comparison: the first combination
    # to reach a given score is kept on ties)
    if avg_accuracy > best_avg_accuracy:
        best_avg_accuracy = avg_accuracy
        best_params_accuracy.update(params)

    # Update best average F1 score
    if avg_f1 > best_avg_f1:
        best_avg_f1 = avg_f1
        best_params_f1.update(params)

# Print results
print("Best Average Accuracy:", best_avg_accuracy)
print("Best Parameters for Accuracy:", best_params_accuracy)
print("Best Average F1 Score:", best_avg_f1)
print("Best Parameters for F1 Score:", best_params_f1)


Results for fold 1:
Method: least_squares
Accuracy: 0.6009
F1 Score: 0.2823

Results for fold 2:
Method: least_squares
Accuracy: 0.6017
F1 Score: 0.2816

Results for fold 3:
Method: least_squares
Accuracy: 0.5988
F1 Score: 0.2765

Results for fold 4:
Method: least_squares
Accuracy: 0.6009
F1 Score: 0.2788

Results for fold 5:
Method: least_squares
Accuracy: 0.6029
F1 Score: 0.2742

Method: least_squares
Average accuracy: 0.6011, Average F1-Score: 0.2787

Results for fold 1:
Method: least_squares
Accuracy: 0.6155
F1 Score: 0.2953

Results for fold 2:
Method: least_squares
Accuracy: 0.6165
F1 Score: 0.2954

Results for fold 3:
Method: least_squares
Accuracy: 0.6155
F1 Score: 0.2917

Results for fold 4:
Method: least_squares
Accuracy: 0.6167
F1 Score: 0.2934

Results for fold 5:
Method: least_squares
Accuracy: 0.6173
F1 Score: 0.2873

Method: least_squares
Average accuracy: 0.6163, Average F1-Score: 0.2926

Results for fold 1:
Method: least_squares
Accuracy: 0.6297
F1 Score: 0.2994

Resul

Traceback (most recent call last):
  File "/Users/matthiaswyss/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/dg/jjw_9gyd1p38f4_jp_6s89r00000gn/T/ipykernel_95536/2327733267.py", line 52, in <module>
    avg_accuracy, avg_f1 = cross_validate(
                           ^^^^^^^^^^^^^^^
  File "/Users/matthiaswyss/Documents/EPFL/2024-25/MA1/CS-433 Machine learning/Project 1/utils.py", line 437, in cross_validate
    preprocessed_x_train, preprocessed_x_test, preprocessed_y_train = preprocess(
                                                                      ^^^^^^^^^^^
  File "/Users/matthiaswyss/Documents/EPFL/2024-25/MA1/CS-433 Machine learning/Project 1/preprocess.py", line 53, in preprocess
    x_train, x_test = remove_single_value_columns(x_train, x_test, verbose=verbose)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  