In [1]:
import os
import numpy as np

def load_csv_data(data_path, sub_sample=False):
    """
    This function loads the data and returns the respectinve numpy arrays.
    Remember to put the 3 files in the same folder and to not change the names of the files.

    Args:
        data_path (str): datafolder path
        sub_sample (bool, optional): If True the data will be subsempled. Default to False.

    Returns:
        x_train (np.array): training data
        x_test (np.array): test data
        y_train (np.array): labels for training data in format (-1,1)
        train_ids (np.array): ids of training data
        test_ids (np.array): ids of test data
    """
    y_train = np.genfromtxt(
        os.path.join(data_path, "y_train.csv"),
        delimiter=",",
        skip_header=1,
        dtype=int,
        usecols=1,
    )
    x_train = np.genfromtxt(
        os.path.join(data_path, "x_train.csv"), delimiter=",", skip_header=1
    )
    x_test = np.genfromtxt(
        os.path.join(data_path, "x_test.csv"), delimiter=",", skip_header=1
    )

    train_ids = x_train[:, 0].astype(dtype=int)
    test_ids = x_test[:, 0].astype(dtype=int)
    x_train = x_train[:, 1:]
    x_test = x_test[:, 1:]

    # sub-sample
    if sub_sample:
        y_train = y_train[::50]
        x_train = x_train[::50]
        train_ids = train_ids[::50]

    return x_train, x_test, y_train, train_ids, test_ids


In [2]:
x_train,x_test,y_train,train_ids, test_ids = load_csv_data(r"C:\Users\adrie\OneDrive\Documents\EPFL\ML\Project 1\dataset\dataset\dataset_to_release")


In [83]:
print(x_train[0])

[5.30000000e+01 1.10000000e+01 1.11620150e+07 1.10000000e+01
 1.60000000e+01 2.01500000e+03 1.10000000e+03 2.01501563e+09
 2.01501563e+09            nan            nan            nan
            nan            nan            nan            nan
            nan            nan 1.00000000e+00 1.00000000e+00
 2.00000000e+00 1.00000000e+00            nan 1.00000000e+00
 1.00000000e+00 2.00000000e+00 2.00000000e+00 1.00000000e+00
 5.00000000e+00 8.80000000e+01 1.00000000e+00 1.00000000e+00
 2.00000000e+00 1.00000000e+00 3.00000000e+00            nan
 1.00000000e+00 1.00000000e+00 2.00000000e+00 2.00000000e+00
 2.00000000e+00            nan 2.00000000e+00 2.00000000e+00
 2.00000000e+00 2.00000000e+00 1.00000000e+00 2.00000000e+00
 3.00000000e+00            nan 2.00000000e+00 1.00000000e+00
 5.00000000e+00 1.00000000e+00            nan            nan
            nan 2.00000000e+00 1.00000000e+00 8.80000000e+01
 8.00000000e+00 1.00000000e+00 1.10000000e+02 5.01000000e+02
            nan 1.000000

In [84]:
print(train_ids)

[     0      1      2 ... 328132 328133 328134]


In [23]:
print(np.any(x_train == 888))

True


In [3]:
def cleaning_answers(data):
    datas_cleaned = data.copy()
    
    for i in range(datas_cleaned.shape[1]):
        unique_values = np.unique(datas_cleaned[:,i])
        nbr_unique_values = len(unique_values)
        max_value = np.nanmax(datas_cleaned[:,i])
        median = np.nanmedian(datas_cleaned[:,i])
        conditions = []
        replacement = []
        if nbr_unique_values <= 5 and max_value <= 9:
            if 7 in unique_values:
                conditions.append(datas_cleaned[:,i]==7)
                replacement.append(2)
            if 8 in unique_values:
                conditions.append(datas_cleaned[:,i]==8)
                replacement.append(2)
            if 9 in unique_values:
                conditions.append(datas_cleaned[:,i]==9)
                replacement.append(2)
                
        elif nbr_unique_values >5 and max_value <= 9:
            if 7 in unique_values:
                conditions.append(datas_cleaned[:,i]==7)
                replacement.append(2)
            if 8 in unique_values:
                conditions.append(datas_cleaned[:,i]==8)
                replacement.append(0)
            if 9 in unique_values:
                conditions.append(datas_cleaned[:,i]==9)
                replacement.append(2)
            
        elif (max_value <= 99 and max_value > 9):
            if 77 in unique_values:
                conditions.append(datas_cleaned[:,i]==77)
                replacement.append(median)
            if 88 in unique_values:
                conditions.append(datas_cleaned[:,i]==88)
                replacement.append(0)
            if 99 in unique_values:
                conditions.append(datas_cleaned[:,i]==99)
                replacement.append(median)
            
        elif (max_value <= 999 and max_value > 99 ):
            if 777 in unique_values:
                conditions.append(datas_cleaned[:,i]==777)
                replacement.append(median)
            if 888 in unique_values:
                conditions.append(datas_cleaned[:,i]==888)
                replacement.append(0)
            if 999 in unique_values:
                conditions.append(datas_cleaned[:,i]==999)
                replacement.append(median)
            
        elif max_value > 999 and max_value <= 9999:
            if 7777 in unique_values:
                conditions.append(datas_cleaned[:,i]==7777)
                replacement.append(median)
            if 8888 in unique_values:
                conditions.append(datas_cleaned[:,i]==8888)
                replacement.append(median)
            if 9999 in unique_values:
                conditions.append(datas_cleaned[:,i]==9999)
                replacement.append(median)
                
        elif max_value > 9999 and max_value <= 999999:
            if 777777 in unique_values:
                conditions.append(datas_cleaned[:,i]==777777)
                replacement.append(median)
            if 888888 in unique_values:
                conditions.append(datas_cleaned[:,i]==888888)
                replacement.append(median)
            if 999999 in unique_values:
                conditions.append(datas_cleaned[:,i]==999999)
                replacement.append(median)

        for condition, replacement in zip(conditions, replacement):
            datas_cleaned[condition, i] = replacement
    return datas_cleaned

In [27]:

# Définir un exemple d'ensemble de données
data = np.array([[1, 2, 3, 4],
                 [2, 8, 9, 10],
                 [7, 88, 99, 100],
                 [9, 888, 999, 1000],
                 [1, 8888, 9999, 10000],
                 [2, 888888, 999999, 1000000]])

# Appeler la fonction de nettoyage
cleaned_data = cleaning_answers(data)

# Afficher le résultat
print("Données d'origine :")
print(data)
print("\nDonnées nettoyées :")
print(cleaned_data)


Données d'origine :
[[      1       2       3       4]
 [      2       8       9      10]
 [      7      88      99     100]
 [      9     888     999    1000]
 [      1    8888    9999   10000]
 [      2  888888  999999 1000000]]

Données nettoyées :
[[      1       2       3       4]
 [      2       8       9      10]
 [      2       0     549     100]
 [      2     888     999    1000]
 [      1    8888    9999   10000]
 [      2  888888  999999 1000000]]


In [126]:
x_train_clean01 = cleaning_answers(x_train)

condition 2
condition 2
condition 2
condition 2
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 2
condition 2
condition 2
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 2
condition 1
condition 2
condition 2
condition 2
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 2
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 2
condition 2
condition 1
condition 4
condition 4
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 1
condition 2
condition 1
condition 3
condition 2
condition 2
condition 2
condition 3
condition 3
condition 3
condition 3
condition 3
cond

In [4]:
def remove_useless_col(data):
    col_to_remove = [1,2,3,4,5, 9, 10,11,12,13,18,19,21,22,24,52, 53, 54,60, 98,104,105,119,120,121,122,123,124,125,126,130,131,132,133,166,179,181,211, 212, 216, 217, 219, 220, 221, 222, 226, 227, 228, 229, 235, 236, 237, 239, 240, 241, 244, 245, 246, 256, 286, 310, 311, 316, 317, 320]
    return np.delete(data, col_to_remove, axis=1)

In [129]:
x_train_clean_02 = remove_useless_col(x_train_clean01)

In [5]:
def remove_nan_col(x, nan_percentage = 0.8) :

    #Remove columns in which there are too much NaN values (<80%)
    to_delete = []
    for i in range(x.shape[1] - 1, 1, -1):
        num_NaN = np.count_nonzero(np.isnan(x[:,i]))
        p_NaN = num_NaN / x.shape[0]
        if p_NaN > nan_percentage:
            to_delete.append(i)

    x = x[:, [i for i in range(x.shape[1]) if i not in to_delete]]
    return x

In [9]:
x_train_clean1 = remove_nan_col(x_train_clean_02)

In [10]:
print(x_train_clean1.shape)

(328135, 204)


In [6]:
def clean_data(data):
    clean_datas = data.copy()

    for i in range(data.shape[1]):
        col = clean_datas[:, i]
        is_nan = np.isnan(col)

        if is_nan.any():
            valid_values = col[~is_nan]
            if valid_values.size > 0:
                median = np.median(valid_values)
                col[is_nan] = median

    return clean_datas

In [19]:
x_train_clean2 = clean_data(x_train_clean1)

In [25]:
print(np.isnan(x_train_clean2).any())

False


In [7]:
def remove_outliers(data):

    filtered_data = data.copy()
    for i in range(filtered_data.shape[1]):
        col_data = filtered_data[:,i]
        q1 = np.percentile(col_data,25)
        q3 = np.percentile(col_data,75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5*iqr
        upper_bound = q3 + 1.5*iqr

        col_data[(col_data<lower_bound)|(col_data>upper_bound)] = np.median(col_data)
        filtered_data[:,i]= col_data
    return filtered_data

In [46]:
x_train_clean3 = remove_outliers(x_train_clean2)

<class 'numpy.ndarray'>


In [45]:
data = np.array([[1, 2, 3],
                 [4, 5, 6],
                 [100, 110, 120],
                 [7, 8, 9]])

cleaned_datas = remove_outliers(data)

mean_before = np.mean(data, axis=0)
std_before = np.std(data, axis=0)
mean_after = np.mean(cleaned_datas, axis=0)
std_after = np.std(cleaned_datas, axis=0)
print(data)
print( mean_before)
print(std_before)
print(cleaned_datas)
print( mean_after)
print( std_after)


<class 'numpy.ndarray'>
[[  1   2   3]
 [  4   5   6]
 [100 110 120]
 [  7   8   9]]
[28.   31.25 34.5 ]
[41.62331078 45.51579396 49.40900728]
[[1 2 3]
 [4 5 6]
 [5 6 7]
 [7 8 9]]
[4.25 5.25 6.25]
[2.16506351 2.16506351 2.16506351]


In [8]:
def standardize(data):

    mean = np.mean(data, axis=0)
    std = np.std(data, axis= 0)
    for i in range(len(std)):
        if std[i] < 1e-10:
            std[i] = 1
    standardized_data = (data - mean)/std
    return standardized_data


In [59]:
x_train_clean4 = standardize(x_train_clean3)

In [58]:
data = np.array([[1, 2, 3],
                 [4, 5, 6],
                 [7, 8, 9]])

# Appelez la fonction de standardisation sur les données d'exemple
standardized_data = standardize(data)

# Comparez les statistiques des données avant et après
mean_before = np.mean(data, axis=0)
std_before = np.std(data, axis=0)
mean_after = np.mean(standardized_data, axis=0)
std_after = np.std(standardized_data, axis=0)
print("Statistiques des données avant la standardisation:")
print("Moyenne avant :", mean_before)
print("Écart type avant :", std_before)

print("\nStatistiques des données après la standardisation:")
print("Moyenne après :", mean_after)
print("Écart type après :", std_after)

# Vérifiez visuellement les données standardisées
print(data)
print("Données standardisées :")
print(standardized_data)

Statistiques des données avant la standardisation:
Moyenne avant : [4. 5. 6.]
Écart type avant : [2.44948974 2.44948974 2.44948974]

Statistiques des données après la standardisation:
Moyenne après : [0. 0. 0.]
Écart type après : [1. 1. 1.]
[[1 2 3]
 [4 5 6]
 [7 8 9]]
Données standardisées :
[[-1.22474487 -1.22474487 -1.22474487]
 [ 0.          0.          0.        ]
 [ 1.22474487  1.22474487  1.22474487]]


In [9]:
def remove_correlated_columns(data, correlation_threshold=0.7):
    indices=[]

    for i in range(data.shape[1]):
        for j in range(i+1,data.shape[1]):
            col1 = data[:,i]
            col2 = data[:,j]
            corr = np.corrcoef(col1, col2)

            if (np.abs(corr[0][1]) >= correlation_threshold):
                indices.append(j)

    uncorrelated_data = np.delete(data,indices,1)
    return uncorrelated_data

In [None]:
x_train_clean5 = remove_correlated_columns(x_train_clean4)

In [123]:
data = np.array([[1, 2, 3, 5],
                 [4, 5, 6, 12],
                 [-7, 8, 9, 3.5 ],
                 [10, 11, 12, 54]])

datas_cleaned = remove_correlated_columns(data)
print(data)
print(datas_cleaned)

[[  1.    2.    3.    5. ]
 [  4.    5.    6.  -12. ]
 [ -7.    8.    9.    3.5]
 [ 10.   11.   12.   54. ]]
[[ 1.  2.]
 [ 4.  5.]
 [-7.  8.]
 [10. 11.]]


In [10]:
def clean_all(data):
    data_to_compute = remove_useless_col(data)
    print(data_to_compute.shape)
    data_to_compute = cleaning_answers(data_to_compute)
    print(data_to_compute.shape)
    data_to_compute = remove_nan_col(data_to_compute)
    print(data_to_compute.shape)
    data_to_compute = clean_data(data_to_compute)    
    print(data_to_compute.shape)
    data_to_compute = remove_outliers(data_to_compute)
    print(data_to_compute.shape)
    data_to_compute = standardize(data_to_compute)
    print(data_to_compute.shape)
    data_to_compute = remove_correlated_columns(data_to_compute)
    print(data_to_compute.shape)
    return data_to_compute.copy()

In [11]:
new_datas = clean_all(x_train)

(328135, 256)
(328135, 256)
(328135, 160)
(328135, 160)
(328135, 160)
(328135, 160)


  c /= stddev[:, None]
  c /= stddev[None, :]


(328135, 135)


In [38]:

max_iters = 500
initial_w = np.ones((new_datas.shape[1],1))
gamma = 0.1
print(initial_w.shape)
print(new_datas.shape)


(135, 1)
(328135, 135)


In [36]:
def compute_gradient(y, tx, w):
    """Computes the gradient at w.

    Args:
        y: shape=(N, )
        tx: shape=(N,2)
        w: shape=(2, ). The vector of model parameters.

    Returns:
        An array of shape (2, ) (same shape as w), containing the gradient of the loss at w.
    """
    y = y.reshape(-1,1)

    e = y - tx.dot(w)
    gradient = (-1/y.shape[0])*np.dot(tx.T,e)
    return gradient


In [34]:
def compute_mse(y, tx, w):
    """compute the loss by mse.
    Args:
        y: numpy array of shape (N,), N is the number of samples.
        tx: numpy array of shape (N,D), D is the number of features.
        w: weights, numpy array of shape(D,), D is the number of features.

    Returns:
        mse: scalar corresponding to the mse with factor (1 / 2 n) in front of the sum

    >>> compute_mse(np.array([0.1,0.2]), np.array([[2.3, 3.2], [1., 0.1]]), np.array([0.03947092, 0.00319628]))
    0.006417022764962313
    """
    y = y.reshape(-1,1)
    e = y - tx.dot(w)
    mse = e.T.dot(e) / (2 * len(e))
    return mse

In [44]:
def gradient_descent(y, tx, initial_w, max_iters, gamma):
    """The Gradient Descent (GD) algorithm.

    Args:
        y: numpy array of shape=(N, )
        tx: numpy array of shape=(N,2)
        initial_w: numpy array of shape=(2, ). The initial guess (or the initialization) for the model parameters
        max_iters: a scalar denoting the total number of iterations of GD
        gamma: a scalar denoting the stepsize

    Returns:
        losses: a list of length max_iters containing the loss value (scalar) for each iteration of GD
        ws: a list of length max_iters containing the model parameters as numpy arrays of shape (2, ), for each iteration of GD
    """
    # Define parameters to store w and loss
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        # compute gradient and loss
        grad = compute_gradient(y, tx, w)
        loss = compute_mse(y, tx, w)

        # update w by gradient
        w = w - gamma * grad
        # store w and loss
        ws.append(w)
        losses.append(loss)
        if n_iter % 20 == 0:
            print(
                "GD iter. {bi}/{ti}: loss={l}, w0={w0}, w1={w1}, w3={w3}".format(
                    bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1], w3= w[3]
                )
            )

    return w, losses

In [20]:
def mean_squared_error_gd(y, tx, initial_w,max_iters, gamma):
    #gradient descent
    losses, ws = gradient_descent(y, tx, initial_w, max_iters, gamma)

    #find the best w
    if gamma <= 2 :
        loss = losses[-1]
        w = ws[-1]
    else :
        loss = np.min(losses)
        w = ws[np.argmin(losses)]

    return w, loss

In [45]:
w_mse_gd,loss_gd = mean_squared_error_gd(y_train,new_datas,initial_w,max_iters,gamma)

GD iter. 0/499: loss=[[46.85457533]], w0=[0.92350205], w1=[1.], w3=[1.]
GD iter. 20/499: loss=[[1.13636405]], w0=[0.13669419], w1=[1.], w3=[1.]
GD iter. 40/499: loss=[[0.59984439]], w0=[0.02272274], w1=[1.], w3=[1.]
GD iter. 60/499: loss=[[0.51866159]], w0=[0.00506661], w1=[1.], w3=[1.]
GD iter. 80/499: loss=[[0.4978925]], w0=[0.00167864], w1=[1.], w3=[1.]
GD iter. 100/499: loss=[[0.49070465]], w0=[0.0008718], w1=[1.], w3=[1.]
GD iter. 120/499: loss=[[0.48749855]], w0=[0.00069102], w1=[1.], w3=[1.]
GD iter. 140/499: loss=[[0.48576556]], w0=[0.00070431], w1=[1.], w3=[1.]
GD iter. 160/499: loss=[[0.48470841]], w0=[0.00078138], w1=[1.], w3=[1.]
GD iter. 180/499: loss=[[0.48401859]], w0=[0.00087453], w1=[1.], w3=[1.]
GD iter. 200/499: loss=[[0.48355206]], w0=[0.00096535], w1=[1.], w3=[1.]
GD iter. 220/499: loss=[[0.48323055]], w0=[0.00104722], w1=[1.], w3=[1.]
GD iter. 240/499: loss=[[0.48300672]], w0=[0.00111845], w1=[1.], w3=[1.]
GD iter. 260/499: loss=[[0.48285004]], w0=[0.00117934], w1