In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

In [5]:
# Load the two raw input tables from the current working directory.
train = pd.read_csv("train.csv")
cddd = pd.read_csv("cddd.csv")


## Removing missing values 
We define a function to remove the rows containing missing values (NaN) from a dataset.

In [3]:
def NaN_remover(df):
    """
    Return a new DataFrame in which every row holding a NaN has been discarded.

    Parameters :
    - df (pd.DataFrame): input table, possibly containing missing values. It is left untouched.

    Returns :
    - pd.DataFrame : new table keeping only the rows of `df` that have no NaN in any column.
    """
    # dropna() without inplace already returns a new object, so no explicit copy is needed.
    return df.dropna(axis=0, how="any")

We also define a NaN checker function. 

In [4]:
def NaN_checker(df, pt=False) :
    """
    Check for the presence of NaN in a Dataframe.

    Parameters:
    - df (pd.dataframe): Dataframe potentially containing NaN.
    - pt (bool): when truthy, also print the result of the check (default is False).

    Returns :
    - nan_check (bool): True if at least one value of the Dataframe is NaN, False otherwise.
    """
    # Two reductions: first per column, then across columns. pandas hands back a
    # numpy boolean, so convert it to a plain Python bool as the docstring promises.
    nan_check = bool(df.isna().any().any())
    if pt:
        print("Il y a des NaN dans le DataFrame :", nan_check)
    return nan_check
    

Here, we apply the NaN_checker function to the train.csv and cddd.csv datasets. 

In [7]:

# Print whether each raw table contains missing values.
NaN_checker(train, pt=True)
NaN_checker(cddd, pt=True)

Il y a des NaN dans le DataFrame : False
Il y a des NaN dans le DataFrame : True


True

We see that there are some NaN values in the cddd dataset. We could either remove them or replace them with the column mean. For now, we simply remove the affected rows with the NaN_remover function.

In [8]:
# Keep only the rows of the cddd table that have no missing value.
cddd_without_NaN = NaN_remover(df=cddd)

## Removing constant predictors 
We define a function to remove the constant predictors from a dataset.

First of all, we need to define a function that separates the numerical columns from the other columns of a Dataframe.

In [5]:
def numerical_separator(df):
    """
    Split a DataFrame column-wise into its non-numerical part and its numerical part.

    Parameters:
    - df (pd.DataFrame) : table mixing numerical and non-numerical columns.

    Return:
    - tuple of two pd.DataFrame, in this order: the non-numerical columns of `df`,
      then the numerical columns of `df`.
    """
    number_dtype = ['number']
    # Mind the return order: non-numerical columns first, numerical columns second.
    return df.select_dtypes(exclude=number_dtype), df.select_dtypes(include=number_dtype)

In [6]:
def constant_predictors_remover(df) :
    """
    Remove constant numerical columns from a Dataframe.

    A numerical column is considered constant when it holds exactly one distinct
    non-NaN value. Non numerical columns are never removed, and numerical columns
    holding no non-NaN value at all are kept.

    Parameters :
    - df (pd.dataframe) : Dataframe potentially containing constant columns.

    Returns :
    - pd.dataframe : new Dataframe made of the non numerical columns of df followed
      by its non constant numerical columns.
    """
    # Split the non numerical and numerical columns of the Dataframe.
    non_numerical_columns, numerical_columns = numerical_separator(df)
    # Count distinct values rather than testing `std != 0`: the standard deviation
    # of a constant float column (e.g. 0.1 repeated three times) can come out as a
    # tiny non-zero number because of rounding in the mean, so an exact comparison
    # with 0 would fail to flag such a column as constant.
    distinct_values = numerical_columns.nunique()
    non_constant_columns = distinct_values[distinct_values != 1].index
    df_clean_const_numerical = numerical_columns.loc[:, non_constant_columns]
    return pd.concat([non_numerical_columns, df_clean_const_numerical], axis=1)

    

Here we apply the function that removes the constant columns to train.csv and cddd.csv.

In [7]:
train_clean_const = constant_predictors_remover(train)
cddd_clean_const = constant_predictors_remover(cddd_without_NaN)

# For each table: shape after cleaning, then shape before cleaning.
# Same shapes => there was no constant column in the Dataframe.
print(train_clean_const.shape, train.shape, sep="\n")  # train Dataframe
print(cddd_clean_const.shape, cddd_without_NaN.shape, sep="\n")  # cddd Dataframe


(3500, 1029)
(3500, 1029)
(1472, 513)
(1472, 513)


## Removing correlated predictors
We define a function to remove the perfectly correlated predictors from a dataset.

In [8]:
def correlation_remover(df, tol=1e-10) :
    """
    Remove the perfectly correlated numerical columns of a Dataframe.

    Within each group of perfectly (positively) correlated numerical columns, the
    first column is kept and the following ones are dropped. Non numerical columns
    are never removed.

    Parameters :
    - df (pd.dataframe) : Dataframe with some perfectly correlated columns.
    - tol (float) : two columns are treated as perfectly correlated when their
      correlation coefficient is at least 1 - tol (default is 1e-10).

    Returns :
    - pd.dataframe : new Dataframe made of the non numerical columns of df followed by
      its numerical columns, with only one column of each group of perfectly correlated
      columns remaining.
    """
    non_numerical_columns, numerical_columns = numerical_separator(df)
    correlation = numerical_columns.corr().to_numpy()
    # Strict upper triangle only (k=1): each pair of columns is examined once and
    # the diagonal (a column against itself) is ignored.
    upper_correlation = np.triu(correlation, k=1)
    # Compare with a tolerance instead of `== 1`: a coefficient computed in floating
    # point for two perfectly correlated columns may be 0.9999999999999999.
    # NaN coefficients compare False; errstate silences the warning that some numpy
    # versions emit for such comparisons.
    with np.errstate(invalid="ignore"):
        is_duplicate = upper_correlation >= 1 - tol
    # The column index of a flagged pair designates the later of the two columns.
    duplicate_positions = np.unique(np.where(is_duplicate)[1])
    df_clean_corr_numerical = numerical_columns.drop(
        columns=numerical_columns.columns[duplicate_positions]
    )
    return pd.concat([non_numerical_columns, df_clean_corr_numerical], axis=1)

Here we apply the function to the train.csv and cddd.csv datasets. 

In [9]:
train_clean_corr = correlation_remover(train)
cddd_clean_corr = correlation_remover(cddd_without_NaN)

# For each table: shape before cleaning, then shape after cleaning.
# Same shapes => no perfectly correlated predictors in the Dataframe.
print(train.shape, train_clean_corr.shape, sep="\n")  # train Dataframe
print(cddd_without_NaN.shape, cddd_clean_corr.shape, sep="\n")  # cddd Dataframe

(3500, 1029)
(3500, 1029)
(1472, 513)
(1472, 513)


## Standardization
We define a function to standardize our datasets.

In [10]:

def standardizer(df, target='RT'):
    """
    Standardize the numerical features of a DataFrame, leaving the target feature unscaled.

    Every numerical column except `target` is transformed with scikit-learn's
    StandardScaler. Non-numerical columns and the target column are passed through
    unchanged.

    Parameters:
    - df (pd.DataFrame): DataFrame to standardize.
    - target (str): Name of the target feature (default is 'RT'). It has no effect
      when df has no numerical column with that name.

    Returns:
    - pd.DataFrame: new DataFrame with a fresh 0..n-1 index whose columns are, in order:
      the non-numerical columns, the target (if present), the standardized numerical features.
    """
    non_numerical_columns, numerical_columns = numerical_separator(df)

    # Every piece gets the same fresh 0..n-1 index so that concat lines the rows up
    # by position (the scaler output below is built with a default index).
    pieces = [non_numerical_columns.reset_index(drop=True)]

    if target in numerical_columns.columns:
        # The target's index must be reset too: if it kept the original labels
        # (e.g. after rows were dropped), concat would align it on those labels and
        # shift it against the other pieces, producing NaN-filled rows.
        pieces.append(numerical_columns[target].reset_index(drop=True))
        numerical_columns = numerical_columns.drop(columns=[target])

    # Fitting the scaler on a table without any feature column raises an error, so
    # only scale when there is at least one feature left.
    if numerical_columns.shape[1] > 0:
        scaler = StandardScaler()
        scaled_numerical_data = pd.DataFrame(scaler.fit_transform(numerical_columns),
                                             columns=numerical_columns.columns)
        pieces.append(scaled_numerical_data)

    return pd.concat(pieces, axis=1)


Here we apply the function to train.csv and cddd.csv. 

In [11]:
# Standardize both tables; the default target 'RT' is left unscaled when present.
train_scaled = standardizer(df=train)
cddd_scaled = standardizer(df=cddd_without_NaN)

## Clean data saving
We save the obtained datasets in the repository. 

In [12]:
# Write each standardized table as a CSV file (without the index column) into the
# current working directory.
current_directory = os.getcwd()
for frame, file_name in ((train_scaled, "train_scaled.csv"), (cddd_scaled, "cddd_scaled.csv")):
    output_path = os.path.join(current_directory, file_name)
    frame.to_csv(output_path, index=False)