In [1]:
import os
import numpy as np

def load_csv_data(data_path, sub_sample=False):
    """
    This function loads the data and returns the respectinve numpy arrays.
    Remember to put the 3 files in the same folder and to not change the names of the files.

    Args:
        data_path (str): datafolder path
        sub_sample (bool, optional): If True the data will be subsempled. Default to False.

    Returns:
        x_train (np.array): training data
        x_test (np.array): test data
        y_train (np.array): labels for training data in format (-1,1)
        train_ids (np.array): ids of training data
        test_ids (np.array): ids of test data
    """
    y_train = np.genfromtxt(
        os.path.join(data_path, "y_train.csv"),
        delimiter=",",
        skip_header=1,
        dtype=int,
        usecols=1,
    )
    x_train = np.genfromtxt(
        os.path.join(data_path, "x_train.csv"), delimiter=",", skip_header=1
    )
    x_test = np.genfromtxt(
        os.path.join(data_path, "x_test.csv"), delimiter=",", skip_header=1
    )

    train_ids = x_train[:, 0].astype(dtype=int)
    test_ids = x_test[:, 0].astype(dtype=int)
    x_train = x_train[:, 1:]
    x_test = x_test[:, 1:]

    # sub-sample
    if sub_sample:
        y_train = y_train[::50]
        x_train = x_train[::50]
        train_ids = train_ids[::50]

    return x_train, x_test, y_train, train_ids, test_ids


In [2]:
x_train,x_test,y_train,train_ids, test_ids = load_csv_data(r"C:\Users\Home\Documents\EPFL\ML\Project1\dataset\dataset\dataset_to_release",sub_sample=True)


In [3]:
print(x_train[0,:])

In [4]:
column_numbers = np.arange(1, x_train.shape[1] + 1)
x_train = np.vstack((column_numbers, x_train))
print(x_train[0,:])

(109379, 66)
(328135, 66)


In [5]:
def convert_categorical_col(matrix, categorical_indices):
    unique_values = {}
    
    for col_index in categorical_indices:
        unique_values[col_index] = list(set(row[col_index] for row in matrix))
    num_unique = {col_index: len(unique_values[col_index])for col_index in categorical_indices}
    total_unique = sum(num_unique.values())
    
    num_rows = len(matrix)

    new_matrix = np.empty((num_rows,total_unique),dtype=int)
    current_col = 0
    for col_index in categorical_indices:
        num_unique_col = num_unique[col_index]
        for i, row in enumerate(matrix):
            category = row[col_index]
            new_matrix[i,current_col:current_col+num_unique_col] = [1 if category == unique_value else 0 for unique_value in unique_values]
            
        current_col += num_unique_col
                
    
    return np.array(new_matrix)

[  5034 271803 120350 ... 312201 267336 128037]


In [6]:
def convert_categorical_columnsv2(matrix, categorical_indices):
    unique_values = {}

    # Initialize unique values for each categorical column
    for col_index in categorical_indices:
        unique_values[col_index] = []


    # Iterate through the data to find all unique values
    for row in matrix:
        for col_index in categorical_indices:
            value = row[col_index]
            if value not in unique_values[col_index]:
                unique_values[col_index].append(value)

    new_matrix = []

    for row in matrix:
        new_row = []
        for col_index in range(len(row)):
            if col_index not in categorical_indices:
                new_row.append(row[col_index])
            else:
                category = row[col_index]
                for unique_value in unique_values[col_index]:
                    new_row.append(1 if category == unique_value else 0)
        new_matrix.append(new_row)

    return np.array(new_matrix)

In [7]:
def categorize_col(data):
    discrete_values_col = []
    
    for i in range(data.shape[1]):
        unique_values = np.unique(data[:,i])
        nbr_unique_values = len(unique_values)
        max_value = np.nanmax(data[:,i])

        if nbr_unique_values <= 5 and max_value <= 9:
            discrete_values_col.append(i)
            
        elif nbr_unique_values >5 and max_value <= 9:
            discrete_values_col.append(i)
            
    all_col = set(range(data.shape[1]))
    continuous_values_col = list(all_col-set((discrete_values_col)))
    return discrete_values_col, continuous_values_col


In [8]:
discrete_col, continuous_col = categorize_col(x_train[1:,:])

{'degree': 0, 'gamma': 0.016, 'initial_w': 'random'}


In [8]:
print(x_train[0,:])

In [9]:
print(discrete_col)

Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detec

In [10]:
print(continuous_col)

{'degree': 0, 'gamma': 0.005, 'initial_w': 'random'}


In [11]:
x_train_categorized  = convert_categorical_col(x_train,discrete_col)

In [12]:
def cleaning_answers(data):
    datas_cleaned = data    
    header_row = datas_cleaned[0, :].copy()
    
    for i in range(datas_cleaned.shape[1]):
        unique_values = np.unique(datas_cleaned[:,i])
        nbr_unique_values = len(unique_values)
        max_value = np.nanmax(datas_cleaned[:,i])
        median = np.nanmedian(datas_cleaned[:,i])
        conditions = []
        replacement = []
        if nbr_unique_values <= 5 and max_value <= 9:
            if 7 in unique_values:
                conditions.append(datas_cleaned[:,i]==7)
                replacement.append(2)
            if 8 in unique_values:
                conditions.append(datas_cleaned[:,i]==8)
                replacement.append(2)
            if 9 in unique_values:
                conditions.append(datas_cleaned[:,i]==9)
                replacement.append(2)
                
        elif nbr_unique_values >5 and max_value <= 9:
            if 7 in unique_values:
                conditions.append(datas_cleaned[:,i]==7)
                replacement.append(2)
            if 8 in unique_values:
                conditions.append(datas_cleaned[:,i]==8)
                replacement.append(0)
            if 9 in unique_values:
                conditions.append(datas_cleaned[:,i]==9)
                replacement.append(2)
            
        elif (max_value <= 99 and max_value > 9):
            if 77 in unique_values:
                conditions.append(datas_cleaned[:,i]==77)
                replacement.append(median)
            if 88 in unique_values:
                conditions.append(datas_cleaned[:,i]==88)
                replacement.append(0)
            if 99 in unique_values:
                conditions.append(datas_cleaned[:,i]==99)
                replacement.append(median)
            
        elif (max_value <= 999 and max_value > 99 ):
            if 777 in unique_values:
                conditions.append(datas_cleaned[:,i]==777)
                replacement.append(median)
            if 888 in unique_values:
                conditions.append(datas_cleaned[:,i]==888)
                replacement.append(0)
            if 999 in unique_values:
                conditions.append(datas_cleaned[:,i]==999)
                replacement.append(median)
            
        elif max_value > 999 and max_value <= 9999:
            if 7777 in unique_values:
                conditions.append(datas_cleaned[:,i]==7777)
                replacement.append(median)
            if 8888 in unique_values:
                conditions.append(datas_cleaned[:,i]==8888)
                replacement.append(median)
            if 9999 in unique_values:
                conditions.append(datas_cleaned[:,i]==9999)
                replacement.append(median)
                
        elif max_value > 9999 and max_value <= 999999:
            if 777777 in unique_values:
                conditions.append(datas_cleaned[:,i]==777777)
                replacement.append(median)
            if 888888 in unique_values:
                conditions.append(datas_cleaned[:,i]==888888)
                replacement.append(median)
            if 999999 in unique_values:
                conditions.append(datas_cleaned[:,i]==999999)
                replacement.append(median)

        for condition, replacement in zip(conditions, replacement):
            datas_cleaned[condition, i] = replacement

        datas_cleaned[0, :] = header_row
    return datas_cleaned

{'degree': 0, 'lambda': 0.0001}


In [13]:
def remove_useless_col(data):
    col_to_remove = [1,2,3,4,5, 9, 10,11,12,13,18,19,21,22,24,52, 53, 54,60, 98,104,105,119,120,121,122,123,124,125,126,130,131,132,133,166,179,181,211, 212, 216, 217, 219, 220, 221, 222, 226, 227, 228, 229, 235, 236, 237, 239, 240, 241, 244, 245, 246, 256, 286, 310, 311, 316, 317, 320]
    return np.delete(data, col_to_remove, axis=1)

Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.
Divergence detected. Stopping iteration.


In [16]:
def remove_nan_col(x, nan_percentage = 0.8) :

    #Remove columns in which there are too much NaN values (<80%)
    to_delete = []
    for i in range(x.shape[1] - 1, 1, -1):
        num_NaN = np.count_nonzero(np.isnan(x[:,i]))
        p_NaN = num_NaN / x.shape[0]
        if p_NaN > nan_percentage:
            to_delete.append(i)

    x = x[:, [i for i in range(x.shape[1]) if i not in to_delete]]
    return x

KeyboardInterrupt: 

In [17]:
def remove_remaining_nan(data, continuous_indices, discrete_indices):
    changed_nan_values = data.copy()
    print(changed_nan_values.shape)
    for i in discrete_indices:
        col = changed_nan_values[:, i]
        is_nan = np.isnan(col)
        if is_nan.any():
            col[is_nan] = 0
            
    for i in continuous_indices:
        col = changed_nan_values[:,i]
        is_nan = np.isnan(col)
        if is_nan.any():
            valid_values = col[~is_nan]
            if valid_values.size > 0:
                median = np.median(valid_values)
                col[is_nan] = median
    return changed_nan_values

In [18]:
def remove_outliers(data, continuous_indices):
    filtered_data = data.copy()    
    for i in continuous_indices:
        col_data = data[:, i]
        q1 = np.percentile(col_data, 25)
        q3 = np.percentile(col_data, 75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        col_data[(col_data < lower_bound) | (col_data > upper_bound)] = np.median(col_data)
        filtered_data[:,i] = col_data
    return filtered_data

Divergence detected. Stopping iteration.


In [19]:
def standardize(data, continuous_indices):

    mean = np.mean(data[:,:], axis=0)
    std = np.std(data[:,:], axis= 0)
    for i in range(len(std)):
        if std[i] < 1e-10:
            std[i] = 1
            
    standardized_data = data.copy()
    for  idx in continuous_indices:
        standardized_data[:,idx] = (data[:,idx] - mean[idx])/std[idx]
        
    return standardized_data


{'GD': {'degree': 0, 'gamma': 0.016, 'initial_w': 'random'}, 'SGD': {'degree': 0, 'gamma': 0.005, 'initial_w': 'random'}, 'LS': None, 'Ridge': {'degree': 0, 'lambda': 0.0001}, 'Log': {'degree': 0, 'gamma': 0.015, 'initial_w': 'random'}, 'RegLog': None}


In [20]:
def remove_correlated_columns(data, correlation_threshold=0.7):
    indices=[]

    for i in range(data.shape[1]):
        for j in range(i+1,data.shape[1]):
            col1 = data[:,i]
            col2 = data[:,j]
            corr = np.corrcoef(col1, col2)

            if (np.abs(corr[0][1]) >= correlation_threshold):
                indices.append(j)

    uncorrelated_data = np.delete(data,indices,1)
    return uncorrelated_data

[[ 2.60461975e-03]
 [ 4.98068851e-03]
 [ 3.27291282e-02]
 [-2.76556969e-02]
 [-1.22951638e-02]
 [-5.07472568e-03]
 [ 7.37337577e-02]
 [ 4.45055586e-01]
 [-2.85600512e-02]
 [ 3.75919639e-02]
 [ 4.94138442e-03]
 [ 3.86408771e-02]
 [-2.99388128e-02]
 [-8.22832177e-02]
 [-1.75554267e-03]
 [-3.19043204e-02]
 [ 1.54660578e-02]
 [ 3.51275145e-03]
 [ 9.45498526e-03]
 [-2.49731041e-03]
 [-9.85466136e-03]
 [ 1.15124607e-03]
 [-7.38573739e-04]
 [ 6.38755881e-05]
 [-6.52756268e-03]
 [ 3.12979554e-02]
 [-6.71403166e-03]
 [-8.00397530e-03]
 [-2.43955146e-02]
 [ 9.30642206e-03]
 [-4.75743470e-03]
 [ 1.15157244e-01]
 [ 4.86172153e-01]
 [ 6.66345778e-01]
 [ 6.35970103e-02]
 [ 5.98213715e-02]
 [ 3.63995498e-03]
 [-2.24483755e-02]
 [ 2.31136640e-03]
 [-3.97704890e-03]
 [-1.34932261e-03]
 [ 1.07337925e-02]
 [ 1.98907929e-02]
 [-1.61038727e-03]
 [-1.36825329e-02]
 [-5.42440853e-03]
 [ 4.61627125e-03]
 [-3.84644245e-03]
 [ 5.64292231e-03]
 [ 1.29998596e-02]
 [ 3.83486473e-03]
 [-2.47460016e-02]
 [-1.5215639

In [21]:
data = np.array([[1, 2, 3, 5],
                 [4, 5, 6, 12],
                 [-7, 8, 9, 3.5 ],
                 [10, 11, 12, 54]])

datas_cleaned = remove_correlated_columns(data)
print(data)
print(datas_cleaned)

In [22]:
def calculate_r_squared(X):
    r_squared_values = []
    for i in range(X.shape[1]):
        x0 = X[:, i]
        x_remaining = np.delete(X, i, axis=1)

        # Coefficients de la régression linéaire
        coeffs = np.linalg.lstsq(x_remaining, x0, rcond=None)[0]

        # Prédiction
        prediction = x_remaining.dot(coeffs)

        # Calcul du R carré
        r_squared = 1 - np.var(x0 - prediction) / np.var(x0)
        r_squared_values.append(r_squared)

    return r_squared_values

In [23]:
def remove_highly_collinear_vars(data, r_squared_values, threshold=0.95):
    # Obtenir les indices des variables avec un R² supérieur au seuil
    high_collinearity_indices = [i for i, r2 in enumerate(r_squared_values) if r2 >= threshold]

    # Éliminer ces variables du dataset
    cleaned_data = np.delete(data, high_collinearity_indices, axis=1)

    return cleaned_data, high_collinearity_indices


{'GD': 0.14516319199492816, 'SGD': 0.14386662029075184, 'LS': 0.27995994213864467, 'Ridge': 0.27995994213864467, 'Log': 0.14521802597670277, 'RegLog': None}


In [24]:
def count_high_r_squared(r_squared_values, threshold=0.95):
    # Compter le nombre de valeurs de R² qui sont supérieures au seuil
    high_r_squared_count = sum(1 for r2 in r_squared_values if r2 >= threshold)

    return high_r_squared_count



In [25]:
def clean_all(data):
    
    data_to_compute = remove_useless_col(data)
    print(data_to_compute.shape)
    
    
    data_to_compute = remove_nan_col(data_to_compute)
    print(np.sum(np.nanvar(data_to_compute, axis=0)==0))

    print(np.linalg.cond(data_to_compute))
    
    discrete_columns, continuous_columns = categorize_col(data_to_compute)
    
    data_to_compute = remove_remaining_nan(data_to_compute,continuous_columns,discrete_columns)
    print(data_to_compute.shape)
    
    r_squared_values = calculate_r_squared(data_to_compute)
    high_r_squared_count = count_high_r_squared(r_squared_values)

    # Affichage du nombre de valeurs de R² élevées
    print(f"Nombre de valeurs de R² supérieures à 0.95 : {high_r_squared_count}")
    
    data_to_compute = standardize(data_to_compute, continuous_columns)
    print(np.linalg.det(data_to_compute.T.dot(data_to_compute)))
    print(np.sum(np.nanvar(data_to_compute, axis=0)==0))
    print(data_to_compute.shape)
    print(np.linalg.cond(data_to_compute))

    r_squared_values = calculate_r_squared(data_to_compute)
    high_r_squared_count = count_high_r_squared(r_squared_values)
    data_to_compute, removed_indices = remove_highly_collinear_vars(data_to_compute,r_squared_values)
    print(np.linalg.det(data_to_compute.T.dot(data_to_compute)))
    # Affichage du nombre de valeurs de R² élevées
    print(f"Nombre de valeurs de R² supérieures à 0.95 : {high_r_squared_count}")
    data_to_compute = remove_correlated_columns(data_to_compute)
    print(np.sum(np.nanvar(data_to_compute, axis=0)==0))
    print(data_to_compute.shape)
    print(np.linalg.cond(data_to_compute))
    print(np.linalg.det(data_to_compute.T.dot(data_to_compute)))
    r_squared_values = calculate_r_squared(data_to_compute)
    high_r_squared_count = count_high_r_squared(r_squared_values)

    # Affichage du nombre de valeurs de R² élevées
    print(f"Nombre de valeurs de R² supérieures à 0.95 : {high_r_squared_count}")
    discrete_columns, continuous_columns = categorize_col(data_to_compute)

    print(np.linalg.det(data_to_compute.T.dot(data_to_compute)))


    data_to_compute = convert_categorical_columnsv2(data_to_compute,discrete_columns)
    r_squared_values = calculate_r_squared(data_to_compute)
    high_r_squared_count = count_high_r_squared(r_squared_values)

    # Affichage du nombre de valeurs de R² élevées
    print(f"Nombre de valeurs de R² supérieures à 0.95 : {high_r_squared_count}")
    print(np.linalg.det(data_to_compute.T.dot(data_to_compute)))
    data_to_compute = remove_outliers(data_to_compute,continuous_columns)
    r_squared_values = calculate_r_squared(data_to_compute)
    high_r_squared_count = count_high_r_squared(r_squared_values)

    # Affichage du nombre de valeurs de R² élevées
    print(f"Nombre de valeurs de R² supérieures à 0.95 : {high_r_squared_count}")
    print(np.linalg.det(data_to_compute.T.dot(data_to_compute)))
    print(data_to_compute.shape)
    return data_to_compute

LS


In [26]:
new_datas = clean_all(x_train)

In [27]:
new_datas_array = np.array(new_datas)
print(new_datas_array.shape)

(66,)
(109379, 66)


In [28]:
print(np.sum(np.nanvar(new_datas_array, axis=0)==0))


In [29]:
def remove_columns_with_zero_variance(matrix):
    """
    Remove columns with zero variance from a matrix.

    Args:
        matrix: numpy array of shape (N, M), where N is the number of rows and M is the number of columns.

    Returns:
        matrix_without_zero_variance: numpy array of shape (N, M'), where M' is less than or equal to M,
                                       containing the matrix with columns having zero variance removed.
    """
    # Calculate the variance of each column (excluding the first row)
    column_variances = np.var(matrix, axis=0)

    # Find the indices of columns with non-zero variance
    non_zero_variance_indices = np.where(column_variances > 0)[0]

    # Remove columns with zero variance
    matrix_without_zero_variance = matrix[:, non_zero_variance_indices]

    return matrix_without_zero_variance


In [63]:
new_datas_array = remove_columns_with_zero_variance(new_datas_array)

0.0


In [64]:
new_datas_array = remove_correlated_columns(new_datas_array)

Column 0:
[19. 30. 44.]
Column 1:
[1100. 1100. 1100.]
Column 2:
[2.01500210e+09 2.01500428e+09 2.01500631e+09]
Column 3:
[2.01500210e+09 2.01500428e+09 2.01500631e+09]
Column 4:
[0. 1. 2.]
Column 5:
[0. 0. 1.]
Column 6:
[0. 0. 1.]
Column 7:
[0. 0. 1.]
Column 8:
[0. 0. 1.]
Column 9:
[0. 0. 2.]
Column 10:
[2. 2. 3.]
Column 11:
[0. 0. 0.]
Column 12:
[0. 0. 0.]
Column 13:
[0. 0. 0.]
Column 14:
[1. 1. 1.]
Column 15:
[1. 1. 1.]
Column 16:
[2. 2. 2.]
Column 17:
[1. 1. 1.]
Column 18:
[1. 3. 3.]
Column 19:
[1. 1. 1.]
Column 20:
[1. 1. 1.]
Column 21:
[1. 1. 2.]
Column 22:
[2. 2. 2.]
Column 23:
[2. 2. 2.]
Column 24:
[2. 2. 2.]
Column 25:
[2. 2. 2.]
Column 26:
[2. 2. 2.]
Column 27:
[1. 2. 2.]
Column 28:
[2. 2. 2.]
Column 29:
[2. 2. 2.]
Column 30:
[3. 3. 3.]
Column 31:
[1. 2. 2.]
Column 32:
[1. 1. 3.]
Column 33:
[0. 1. 1.]
Column 34:
[2. 2. 2.]
Column 35:
[0. 1. 1.]
Column 36:
[0. 0. 0.]
Column 37:
[1. 1. 1.]
Column 38:
[150. 174. 194.]
Column 39:
[504. 506. 508.]
Column 40:
[1. 2. 2.]
Column 41:
[

In [65]:
print(new_datas_array.shape)

0.0


In [66]:
print(np.max(new_datas_array,axis=0))

Column 0:
[19. 30. 44.]
Column 1:
[2.01500210e+09 2.01500428e+09 2.01500631e+09]
Column 2:
[2.01500210e+09 2.01500428e+09 2.01500631e+09]
Column 3:
[0. 1. 2.]
Column 4:
[0. 0. 1.]
Column 5:
[0. 0. 1.]
Column 6:
[0. 0. 1.]
Column 7:
[0. 0. 1.]
Column 8:
[0. 0. 2.]
Column 9:
[2. 2. 3.]
Column 10:
[1. 3. 3.]
Column 11:
[1. 1. 2.]
Column 12:
[1. 2. 2.]
Column 13:
[1. 2. 2.]
Column 14:
[1. 1. 3.]
Column 15:
[0. 1. 1.]
Column 16:
[0. 1. 1.]
Column 17:
[150. 174. 194.]
Column 18:
[504. 506. 508.]
Column 19:
[1. 2. 2.]
Column 20:
[1. 2. 2.]
Column 21:
[0. 0. 3.]
Column 22:
[0. 0. 1.]
Column 23:
[0. 0. 2.]
Column 24:
[202. 305. 555.]
Column 25:
[102. 202. 303.]
Column 26:
[203. 302. 303.]
Column 27:
[202. 204. 304.]
Column 28:
[202. 302. 303.]
Column 29:
[102. 203. 308.]
Column 30:
[ 0. 37. 64.]
Column 31:
[30. 45. 45.]
Column 32:
[ 0.  0. 64.]
Column 33:
[1. 1. 2.]
Column 34:
[1. 1. 2.]
Column 35:
[0. 1. 2.]
Column 36:
[0. 1. 1.]
Column 37:
[1. 1. 2.]
Column 38:
[1. 1. 2.]
Column 39:
[1. 2. 2.

In [67]:
print(np.linalg.det(new_datas_array.T.dot(new_datas_array)))

Columns removed: [1, 2, 12, 17, 39, 41, 45, 65, 66, 74, 75, 76, 77, 78]
68
1.5860001321443876e+276
Column 0:
[19. 30. 44.]
Column 1:
[0. 1. 2.]
Column 2:
[0. 0. 1.]
Column 3:
[0. 0. 1.]
Column 4:
[0. 0. 1.]
Column 5:
[0. 0. 1.]
Column 6:
[0. 0. 2.]
Column 7:
[2. 2. 3.]
Column 8:
[1. 3. 3.]
Column 9:
[1. 1. 2.]
Column 10:
[1. 2. 2.]
Column 11:
[1. 1. 3.]
Column 12:
[0. 1. 1.]
Column 13:
[0. 1. 1.]
Column 14:
[504. 506. 508.]
Column 15:
[1. 2. 2.]
Column 16:
[1. 2. 2.]
Column 17:
[0. 0. 3.]
Column 18:
[0. 0. 1.]
Column 19:
[0. 0. 2.]
Column 20:
[202. 305. 555.]
Column 21:
[102. 202. 303.]
Column 22:
[203. 302. 303.]
Column 23:
[202. 204. 304.]
Column 24:
[202. 302. 303.]
Column 25:
[102. 203. 308.]
Column 26:
[ 0. 37. 64.]
Column 27:
[30. 45. 45.]
Column 28:
[ 0.  0. 64.]
Column 29:
[1. 1. 2.]
Column 30:
[1. 1. 2.]
Column 31:
[0. 1. 2.]
Column 32:
[0. 1. 1.]
Column 33:
[1. 1. 2.]
Column 34:
[1. 1. 2.]
Column 35:
[1. 1. 2.]
Column 36:
[3. 5. 6.]
Column 37:
[64. 66. 70.]
Column 38:
[1.63 1

In [68]:
print(np.var(new_datas.T.dot(new_datas), axis=0))

In [69]:

max_iters = 500
initial_w = np.ones((new_datas_array.shape[1],1))
gamma = 0.1
print(initial_w.shape)
print(new_datas_array.shape)


66
6.075041705971307e+271
Column 0:
[19. 30. 44.]
Column 1:
[0. 1. 2.]
Column 2:
[0. 0. 1.]
Column 3:
[0. 0. 1.]
Column 4:
[0. 0. 1.]
Column 5:
[0. 0. 1.]
Column 6:
[0. 0. 2.]
Column 7:
[2. 2. 3.]
Column 8:
[1. 3. 3.]
Column 9:
[1. 1. 2.]
Column 10:
[1. 2. 2.]
Column 11:
[1. 1. 3.]
Column 12:
[0. 1. 1.]
Column 13:
[0. 1. 1.]
Column 14:
[504. 506. 508.]
Column 15:
[1. 2. 2.]
Column 16:
[1. 2. 2.]
Column 17:
[0. 0. 3.]
Column 18:
[0. 0. 1.]
Column 19:
[0. 0. 2.]
Column 20:
[202. 305. 555.]
Column 21:
[102. 202. 303.]
Column 22:
[203. 302. 303.]
Column 23:
[202. 204. 304.]
Column 24:
[202. 302. 303.]
Column 25:
[102. 203. 308.]
Column 26:
[ 0. 37. 64.]
Column 27:
[30. 45. 45.]
Column 28:
[ 0.  0. 64.]
Column 29:
[1. 1. 2.]
Column 30:
[1. 1. 2.]
Column 31:
[0. 1. 2.]
Column 32:
[0. 1. 1.]
Column 33:
[1. 1. 2.]
Column 34:
[1. 1. 2.]
Column 35:
[1. 1. 2.]
Column 36:
[3. 5. 6.]
Column 37:
[64. 66. 70.]
Column 38:
[1.63 1.7  1.78]
Column 39:
[24.03 27.   29.26]
Column 40:
[2. 3. 4.]
Column 41:

In [121]:
def compute_gradient(y, tx, w):
    """Computes the gradient at w.

    Args:
        y: shape=(N, )
        tx: shape=(N,2)
        w: shape=(2, ). The vector of model parameters.

    Returns:
        An array of shape (2, ) (same shape as w), containing the gradient of the loss at w.
    """
    y = y.reshape(-1,1)

    e = y - tx.dot(w)
    gradient = (-1/y.shape[0])*np.dot(tx.T,e)
    return gradient


In [116]:
def compute_mse(y, tx, w):
    """compute the loss by mse.
    Args:
        y: numpy array of shape (N,), N is the number of samples.
        tx: numpy array of shape (N,D), D is the number of features.
        w: weights, numpy array of shape(D,), D is the number of features.

    Returns:
        mse: scalar corresponding to the mse with factor (1 / 2 n) in front of the sum

    >>> compute_mse(np.array([0.1,0.2]), np.array([[2.3, 3.2], [1., 0.1]]), np.array([0.03947092, 0.00319628]))
    0.006417022764962313
    """
    y = y.reshape(-1,1)
    e = y - tx.dot(w)
    mse = e.T.dot(e) / (2 * len(e))
    return mse

(6563, 114)


In [156]:
def gradient_descent(y, tx, initial_w, max_iters, gamma):
    """The Gradient Descent (GD) algorithm.

    Args:
        y: numpy array of shape=(N, )
        tx: numpy array of shape=(N,2)
        initial_w: numpy array of shape=(2, ). The initial guess (or the initialization) for the model parameters
        max_iters: a scalar denoting the total number of iterations of GD
        gamma: a scalar denoting the stepsize

    Returns:
        losses: a list of length max_iters containing the loss value (scalar) for each iteration of GD
        ws: a list of length max_iters containing the model parameters as numpy arrays of shape (2, ), for each iteration of GD
    """
    # Define parameters to store w and loss
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        # compute gradient and loss
        grad = compute_gradient(y, tx, w)
        loss = compute_mse(y, tx, w)

        # update w by gradient
        w = w - gamma * grad
        # store w and loss
        ws.append(w)
        losses.append(loss)
        if n_iter % 20 == 0:
            print(
                "GD iter. {bi}/{ti}: loss={l}, w0={w0}, w1={w1}, w3={w3}".format(
                    bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1], w3= w[3]
                )
            )

    return ws, losses

In [89]:
def mean_squared_error_gd(y, tx, initial_w,max_iters, gamma):
    #gradient descent
    ws, losses = gradient_descent(y, tx, initial_w, max_iters, gamma)
    print(losses[-1])
    print(ws[-1])
    #find the best w
    if gamma <= 2 :
        loss = losses[-1]
        w = ws[-1]
    else :
        loss = np.min(losses)
        w = ws[np.argmin(losses)]

    return w, loss

In [157]:
def split_data(x, y, ratio, seed=1):
    """
    split the dataset based on the split ratio. If ratio is 0.8
    you will have 80% of your data set dedicated to training
    and the rest dedicated to testing. If ratio times the number of samples is not round
    you can use np.floor. Also check the documentation for np.random.permutation,
    it could be useful.

    Args:
        x: numpy array of shape (N,), N is the number of samples.
        y: numpy array of shape (N,).
        ratio: scalar in [0,1]
        seed: integer.

    Returns:
        x_tr: numpy array containing the train data.
        x_te: numpy array containing the test data.
        y_tr: numpy array containing the train labels.
        y_te: numpy array containing the test labels.
    """
    # set seed
    np.random.seed(seed)

    idx = np.random.permutation(np.arange(len(x)))
    idx_max = np.floor(ratio * len(x)).astype(int)
        
    print(len(x))
    x_tr = x[idx][:idx_max]
    x_te = x[idx][idx_max:]
    y_tr = y[idx][:idx_max]
    y_te = y[idx][idx_max:]

    return x_tr, x_te, y_tr, y_te


GD iter. 0/499: loss=[[1.20015781e+09]], w0=[873447.71536927], w1=[37282725.95720807], w3=[16422.88967746]
GD iter. 20/499: loss=[[inf]], w0=[1.13905058e+203], w1=[4.8524995e+204], w3=[2.23370613e+201]
GD iter. 40/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 60/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 80/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 100/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 120/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 140/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 160/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 180/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 200/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 220/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 240/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 260/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter. 280/499: loss=[[nan]], w0=[nan], w1=[nan], w3=[nan]
GD iter.

In [50]:
print(np.max(new_datas_array, axis=0))

[2.62144441 2.97790053 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.03300162 0.95642885
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1. 

In [123]:
x_tr, x_te, y_tr, y_te  = split_data(new_datas_array, y_train, 0.75)

In [126]:
print(x_tr.shape)

ValueError: shapes (114,4922) and (6563,) not aligned: 4922 (dim 1) != 6563 (dim 0)

In [149]:
initial_w = np.ones((x_tr.shape[1],1))

In [150]:
w_mse_gd,loss_gd = mean_squared_error_gd(y_tr,x_tr,initial_w,max_iters,gamma)

In [152]:
y_prediction_mse_gd = x_te.dot(w_mse_gd)

[[0.16255744 0.15411523 0.17188395 ... 0.13765922 0.16807256 0.15621745]
 [0.15411523 0.15714534 0.15076774 ... 0.16305178 0.15213573 0.1563908 ]
 [0.17188395 0.15076774 0.19521204 ... 0.10960685 0.18567874 0.15602593]
 ...
 [0.13765922 0.16305178 0.10960685 ... 0.21254835 0.12107078 0.15672872]
 [0.16807256 0.15213573 0.18567874 ... 0.12107078 0.17848377 0.1561042 ]
 [0.15621745 0.1563908  0.15602593 ... 0.15672872 0.1561042  0.15634763]]


In [167]:
print(y_prediction_mse_gd.shape)

In [175]:
print(y_te.shape)

In [169]:

def compute_f1_score(y_true, y_pred):
    tp = np.sum((y_pred == 1) & (y_pred == y_true))
    print(tp)
    fp = np.sum((y_pred == 1) & (y_pred != y_true))
    print(fp)
    fn = np.sum((y_pred == -1) & (y_pred != y_true))
    print(fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score



In [173]:
print(w_mse_gd)

In [179]:
from functions import *
y_pred_mse_gd  = np.where(sigmoid(y_prediction_mse_gd) >= 0.5 , 1, -1)

  return 1 / (1 + np.exp(-t))


(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(112, 1)
(

In [180]:
y_te = y_te.reshape(-1,1)

[[-3.34584778e+03]
 [-1.24458854e+05]
 [-8.14938545e+01]
 [-5.89069806e+01]
 [-1.11478076e+02]
 [-2.47042749e+02]
 [-2.70684520e+02]
 [-2.61891546e+02]
 [-1.54764327e+02]
 [-1.20594203e+02]
 [-1.55268852e+02]
 [-2.10925333e+02]
 [-1.66884623e+02]
 [-2.73068590e+02]
 [-3.81415407e+01]
 [-1.22438873e+02]
 [-1.32978867e+02]
 [-1.55055210e+02]
 [-2.23249066e+02]
 [-2.09033601e+02]
 [-2.17024135e+02]
 [-2.15898687e+02]
 [-2.18118124e+02]
 [-2.06155011e+02]
 [-2.22240267e+02]
 [-3.23480784e+02]
 [-1.80686831e+02]
 [-2.47761137e+02]
 [-6.82775595e+01]
 [-2.15094271e+02]
 [-1.61010349e+02]
 [-7.30758287e+01]
 [-1.26570811e+02]
 [-2.27849320e+04]
 [-6.51147715e+04]
 [-1.98799738e+02]
 [-2.12772395e+02]
 [-2.13782494e+02]
 [-2.09209882e+02]
 [-2.06880719e+02]
 [-2.14583733e+02]
 [-2.09874846e+02]
 [-1.49374895e+02]
 [-3.16019467e+02]
 [-2.11878359e+04]
 [-1.21890875e+02]
 [-6.60426855e+01]
 [-1.80535775e+02]
 [-3.90971059e+04]
 [-2.18926224e+04]
 [-3.36905762e+04]
 [-2.63110444e+04]
 [-3.1927807

In [181]:
compute_f1_score(y_te,y_pred_mse_gd)

-0.633897512985037


In [36]:
def least_squares(y, tx):
    """Calculate the least squares solution.
       returns mse, and optimal weights.

    Args:
        y: numpy array of shape (N,), N is the number of samples.
        tx: numpy array of shape (N,D), D is the number of features.

    Returns:
        w: optimal weights, numpy array of shape(D,), D is the number of features.
        loss: scalar.

    >>> least_squares(np.array([0.1,0.2]), np.array([[2.3, 3.2], [1., 0.1]]))
    (array([ 0.21212121, -0.12121212]), 8.666684749742561e-33)
    """
    # ***************************************************
    # INSERT YOUR CODE HERE
    # least squares: TODO
    w =np.linalg.inv(tx.T.dot(tx)).dot(tx.T.dot(y))
    loss = compute_mse(y,tx,w)
    # returns mse, and optimal weights
    return w, loss


In [38]:
w_ls, loss_ls = least_squares(y_tr,x_tr)

LinAlgError: Singular matrix

In [50]:
x_train [0,:]

array([5.30000000e+01, 1.10000000e+01, 1.11620150e+07, 1.10000000e+01,
       1.60000000e+01, 2.01500000e+03, 1.10000000e+03, 2.01501563e+09,
       2.01501563e+09,            nan,            nan,            nan,
                  nan,            nan,            nan,            nan,
                  nan,            nan, 1.00000000e+00, 1.00000000e+00,
       2.00000000e+00, 1.00000000e+00,            nan, 1.00000000e+00,
       1.00000000e+00, 2.00000000e+00, 2.00000000e+00, 1.00000000e+00,
       5.00000000e+00, 8.80000000e+01, 1.00000000e+00, 1.00000000e+00,
       2.00000000e+00, 1.00000000e+00, 3.00000000e+00,            nan,
       1.00000000e+00, 1.00000000e+00, 2.00000000e+00, 2.00000000e+00,
       2.00000000e+00,            nan, 2.00000000e+00, 2.00000000e+00,
       2.00000000e+00, 2.00000000e+00, 1.00000000e+00, 2.00000000e+00,
       3.00000000e+00,            nan, 2.00000000e+00, 1.00000000e+00,
       5.00000000e+00, 1.00000000e+00,            nan,            nan,
      

In [51]:
new_datas[0,:]

array([ 1.4362964 , -0.14364958, -0.43476107,  0.25938522,  3.93728018,
       -0.43123454,  0.81144924,  0.7531099 ,  0.70796839,  0.85611713,
       -0.76306165, -0.71285571, -0.46366208, -1.76453973, -1.78855258,
       -1.18243681, -0.97423391,  1.1803198 ,  0.03202097,  0.67464233,
        1.1061937 ,  0.77930911, -0.08702525, -1.77907099, -0.65705562,
        2.4488373 ,  0.78489186, -1.14277651,  0.7440304 ,  0.42427984,
       -0.86106585,  0.06966508,  1.06815889,  0.68648567, -0.57520585,
       -0.54244303, -0.02121267, -0.35061803, -1.12280884,  0.40954926,
        0.2195254 , -0.59099421, -0.60817162,  1.13393598, -0.65035991,
       -0.69873338, -0.13209185,  0.1241444 , -0.87671029, -1.09107931,
       -1.23021922,  3.29264174, -0.51980681,  2.04970769, -0.43981471,
       -0.36569101, -0.32011707])

In [54]:
print(np.nanvar(x_train, axis=0))


[2.57017100e+02 1.21609628e+01 1.21748122e+13 1.21991885e+01
 6.94635961e+01 2.41451390e-02 1.27603975e+03 1.69617878e+07
 1.69617878e+07 0.00000000e+00 1.69534623e-04 0.00000000e+00
 0.00000000e+00 2.48007961e-01 2.25585938e-01 6.40064375e-01
 3.85500182e-01 2.76670029e-01 0.00000000e+00 0.00000000e+00
 2.49979515e-01 5.71951661e-03 0.00000000e+00 6.24434611e-02
 3.06947845e-01 2.71298432e+01 1.24249554e+00 1.37287610e+03
 1.28392985e+03 1.45131943e+03 2.54732199e-01 6.89049997e-01
 1.66170929e-01 1.55044157e+00 1.06631052e+00 2.00038110e-01
 8.25643511e-01 1.06998248e+00 5.29728614e-01 9.74834622e-02
 1.97295819e-01 1.19064806e+00 1.52814524e-01 1.45580975e-01
 1.97825891e-01 3.91263998e-01 2.96164491e-01 1.14101404e-01
 5.08995468e-01 3.31839648e+02 2.44062500e-01 2.84387923e+00
 1.15085242e+00 6.73492991e-01 2.15799145e-01 1.40449090e+00
 4.42115532e-01 2.00268060e-01 8.23997381e+00 1.42394937e+03
 1.00746754e+03 3.26773219e-01 4.84386838e+06 1.88022650e+06
 3.31551231e-01 4.413437

In [49]:
x_test = clean_all(x_test)
x_test = remove_zero_variance_columns(x_test)

(109379, 256)
(109379, 256)
(109379, 160)
(109379, 160)
(109379, 160)
(109379, 160)


  c /= stddev[:, None]
  c /= stddev[None, :]


(109379, 136)


In [48]:
y_prediction = x_test.dot(w_ls)

ValueError: shapes (109379,321) and (57,) not aligned: 321 (dim 1) != 57 (dim 0)

In [32]:
print(x_tr[:,-4])

[ 1.41142851 -1.18588987 -1.18588987 ... -1.18588987  0.54565572
  1.41142851]


In [33]:
print(x_tr[:,4])

[0. 0. 0. ... 0. 0. 0.]


In [34]:
y_tr.shape[0]

278914

In [35]:
def calculate_gradient(y, tx, w):
    """compute the gradient of loss.

    Args:
        y:  shape=(N, 1)
        tx: shape=(N, D)
        w:  shape=(D, 1)

    Returns:
        a vector of shape (D, 1)

    >>> np.set_printoptions(8)
    >>> y = np.c_[[0., 1.]]
    >>> tx = np.arange(6).reshape(2, 3)
    >>> w = np.array([[0.1], [0.2], [0.3]])
    >>> calculate_gradient(y, tx, w)
    array([[-0.10370763],
           [ 0.2067104 ],
           [ 0.51712843]])
    """

    return 1/len(y)*(tx.T.dot(sigmoid(np.dot(tx, w))-y))


In [36]:
def calculate_loss(y, tx, w):
    """compute the cost by negative log likelihood.

    Args:
        y:  shape=(N, 1)
        tx: shape=(N, D)
        w:  shape=(D, 1)

    Returns:
        a non-negative loss

    >>> y = np.c_[[0., 1.]]
    >>> tx = np.arange(4).reshape(2, 2)
    >>> w = np.c_[[2., 3.]]
    >>> round(calculate_loss(y, tx, w), 8)
    1.52429481
    """
    assert y.shape[0] == tx.shape[0]
    assert tx.shape[1] == w.shape[0]

    # ***************************************************
    # INSERT YOUR CODE HERE
    # TODO
    loss = -1/y.shape[0]*(y.T.dot(np.log1p(sigmoid(tx.dot(w))))+(1-y).T.dot(np.log1p(1-sigmoid(tx.dot(w)))))
    return loss.ravel()[0]
    # ***************************************************


In [37]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    w = initial_w
    assert y.shape[0] == tx.shape[0]
    assert tx.shape[1] == w.shape[0]

    for iter in range(max_iters):

        # LOSS
        loss = calculate_loss(y, tx, w)

        # GRADIENT
        grad = calculate_gradient(y, tx, w)

        # UPDATE W
        w -= gamma * grad

    return w, loss


In [38]:
w_logr, loss_logr = logistic_regression(y_tr,x_tr,initial_w,max_iters,gamma)

MemoryError: Unable to allocate 580. GiB for an array with shape (278914, 278914) and data type float64

In [None]:
from implementations import *

In [None]:
initial_w = np.ones((x_tr.shape[1],1))
w_reg_logr, loss_reg_logr = reg_logistic_regression(y_tr,x_tr,0.1,initial_w,max_iters,gamma)

In [None]:
test_1 = np.array([[2.3, 3.2]])
print(np.linalg.det(test_1.T.dot(test_1)))