In [56]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import os 
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from scipy.stats import chi2
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error

def round_column_names(precision=1):
    """
    Arrondit les noms des colonnes numériques à une précision donnée
    et les convertit en entiers si l'arrondi donne un nombre entier.

    Args:
        df (pd.DataFrame): DataFrame avec les colonnes à renommer.
        precision (int): Précision de l'arrondi (par défaut : 1 décimale).

    Returns:
        pd.DataFrame: DataFrame avec les noms de colonnes arrondis.
    """
    # Vérifiez que df est bien un DataFrame
    df = pd.read_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/substances.csv')
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"L'entrée doit être un DataFrame, mais {type(df)} a été fourni.")

    # Nouveau dictionnaire pour mapper les anciens noms aux nouveaux
    new_columns = {}
    for col in df.columns:
        try:
            # Tenter de convertir le nom de la colonne en float pour arrondi
            rounded_col = round(float(col), precision)
            # Si l'arrondi donne un entier, convertir en entier
            if rounded_col.is_integer():
                new_columns[col] = str(int(rounded_col))
            else:
                new_columns[col] = str(rounded_col)
        except ValueError:
            # Conserver les colonnes non convertibles (par exemple, chaînes)
            new_columns[col] = col

    # Renommer les colonnes dans le DataFrame
    return df.rename(columns=new_columns)

def add_purity_column():
    """
    Ajoute une colonne 'PURITY' remplie de 0 au dataset substances.csv.

    Args:
        file_path (str): Chemin vers le fichier CSV d'entrée.
        output_path (str): Chemin pour sauvegarder le fichier CSV avec la colonne ajoutée.

    Returns:
        pd.DataFrame: DataFrame avec la colonne 'PURITY' ajoutée.
    """
    # Charger le fichier CSV
    data = round_column_names()

    # Ajouter la colonne 'PURITY' avec des valeurs remplies à 0
    data.insert(1, 'PURITY', 0)
    
    output_path = '/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/substances_modified.csv'

    # Sauvegarder le fichier mis à jour
    data.to_csv(output_path, index=False)

    return data

def add_measure_type_display():
    data = add_purity_column()
    data.insert(0, 'measure_type_display', 'Direct contact') 
    output_path = '/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/substances_modified.csv'

    # Sauvegarder le fichier mis à jour
    data.to_csv(output_path, index=False)

    return data

def add_substance_form_display():
    """
    Ajoute une colonne 'substance_form_display' remplie selon une distribution spécifiée
    et l'insère avant toute colonne existante nommée 'substance_form_display'.

    La distribution est :
    - 'Homogenized Powder' : 630/1250
    - 'Non Homogenized Powder' : 200/1400
    - 'Unspecified' : 420/1250
    """
    # Appel de la fonction existante pour récupérer les données
    data = add_measure_type_display()

    # Définir les catégories et leurs probabilités
    categories = ['Homogenized Powder', 'Non Homogenized Powder', 'Unspecified']
    probabilities = [630/1250, 200/1250, 420/1250]

    # Générer les valeurs aléatoires pour la nouvelle colonne
    new_column_values = np.random.choice(categories, size=len(data), p=probabilities)

    # Trouver l'index de la colonne 'substance_form_display' existante
    insert_position = data.columns.get_loc('measure_type_display')

    # Insérer la nouvelle colonne avant l'ancienne
    data.insert(insert_position, 'substance_form_display', new_column_values)

    # Chemin de sortie
    output_path = '/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/substances_modified.csv'

    # Sauvegarder le fichier mis à jour
    data.to_csv(output_path, index=False)

    print(f"Fichier sauvegardé avec la colonne 'substance_form_display' ajoutée avant la position existante : {output_path}")
    return data

def rename_and_modify_substance_column():
    """
    Extrait la colonne 'substance', la renomme en 'prod_substance', 
    remplace toutes les valeurs par 'Heroin', et sauvegarde le fichier mis à jour.

    Args:
        file_path (str): Chemin vers le fichier CSV d'entrée.
        output_path (str): Chemin pour sauvegarder le fichier modifié.

    Returns:
        pd.DataFrame: DataFrame avec la colonne modifiée.
    """
    # Charger le fichier CSV
    data = add_substance_form_display()

    # Vérifier si la colonne 'substance' existe
    if 'substance' not in data.columns:
        raise ValueError("La colonne 'substance' n'existe pas dans le fichier.")

    # Renommer la colonne en 'prod_substance' et remplacer toutes les valeurs par 'Heroin'
    data = data.rename(columns={'substance': 'prod_substance'})
    data['prod_substance'] = 'Heroin'

    # Sauvegarder le fichier modifié
    output_path = '/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/substances_modified.csv'
    data.to_csv(output_path, index=False)

    print(f"Fichier sauvegardé avec la colonne 'prod_substance' modifiée : {output_path}")
    return data

def device_serial_proba():
    """
    Calcule les probabilités de chaque catégorie dans la colonne 'device_serial'.

    Args:
        file_path (str): Chemin vers le fichier CSV.

    Returns:
        categories (list): Liste des catégories uniques dans 'device_serial'.
        probabilities (list): Liste des probabilités associées à chaque catégorie.
    """
    # Charger les données
    data, data2, data3 = preprocessing_v1()

    # Calculer la répartition et les probabilités
    device_serial_distribution = data['device_serial'].value_counts(normalize=True)
    categories = device_serial_distribution.index.tolist()
    probabilities = device_serial_distribution.values.tolist()

    return categories, probabilities

def add_device_serial():
    """
    Ajoute une colonne 'device_serial' remplie selon une distribution de probabilité
    et insère cette colonne avant 'substance_form_display'.

    Args:
        input_file (str): Chemin du fichier d'entrée.
        output_file (str): Chemin pour sauvegarder le fichier modifié.

    Returns:
        pd.DataFrame: DataFrame mis à jour.
    """
    # Charger les données existantes
    data = rename_and_modify_substance_column()

    # Obtenir les catégories et probabilités de 'device_serial'
    categories, probabilities = device_serial_proba()

    # Générer les valeurs aléatoires pour la nouvelle colonne
    new_column_values = np.random.choice(categories, size=len(data), p=probabilities)

    # Trouver la position de 'substance_form_display'
    if 'substance_form_display' in data.columns:
        insert_position = data.columns.get_loc('substance_form_display')
    else:
        raise ValueError("La colonne 'substance_form_display' n'existe pas dans le fichier.")

    # Insérer la nouvelle colonne avant 'substance_form_display'
    data.insert(insert_position, 'device_serial', new_column_values)

    # Sauvegarder le fichier mis à jour
    output_path = '/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/substances_modified.csv'
    data.to_csv(output_path, index=False)
    print(f"Fichier sauvegardé avec la colonne 'device_serial' ajoutée ")
    
    return data

def sample_name_proba():
    """
    Calcule les probabilités de chaque catégorie dans la colonne 'device_serial'.

    Args:
        file_path (str): Chemin vers le fichier CSV.

    Returns:
        categories (list): Liste des catégories uniques dans 'device_serial'.
        probabilities (list): Liste des probabilités associées à chaque catégorie.
    """
    # Charger les données
    data1, data2, data3 = preprocessing_v1()

    # Calculer la répartition et les probabilités
    sample_name_distribution = data1['sample_name'].value_counts(normalize=True)
    categories = sample_name_distribution.index.tolist()
    probabilities = sample_name_distribution.values.tolist()

    return categories, probabilities

def add_sample_name(): 
    """
    Ajoute une colonne 'device_serial' remplie selon une distribution de probabilité
    et insère cette colonne avant 'substance_form_display'.

    Args:
        input_file (str): Chemin du fichier d'entrée.
        output_file (str): Chemin pour sauvegarder le fichier modifié.

    Returns:
        pd.DataFrame: DataFrame mis à jour.
    """
    # Charger les données existantes
    data = add_device_serial()

    # Obtenir les catégories et probabilités de 'device_serial'
    categories, probabilities = sample_name_proba()

    # Générer les valeurs aléatoires pour la nouvelle colonne
    new_column_values = np.random.choice(categories, size=len(data), p=probabilities)

    # Trouver la position de 'device_serial'
    if 'device_serial' in data.columns:
        insert_position = data.columns.get_loc('device_serial')
    else:
        raise ValueError("La colonne 'device_serial' n'existe pas dans le fichier.")

    # Insérer la nouvelle colonne avant 'substance_form_display'
    data.insert(insert_position, 'sample_name', new_column_values)

    # Sauvegarder le fichier mis à jour
    output_path = '/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/substances_modified.csv'
    data.to_csv(output_path, index=False)
    print(f"Fichier sauvegardé avec la colonne 'sample_name' ajoutée ")
    return data        

def merge_csv_files():
    """
    Combine les lignes de deux fichiers CSV et sauvegarde le résultat dans un nouveau fichier.

    Args:
        train_file (str): Chemin vers le fichier train.csv.
        substances_file (str): Chemin vers le fichier substances.csv.
        output_file (str): Chemin pour sauvegarder le fichier combiné.

    Returns:
        pd.DataFrame: DataFrame combiné.
    """
    # Charger les deux fichiers CSV
    train_data, data2, data3 = preprocessing_v1()
    substances_data = pd.read_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/substances_modified.csv')
    

    # Combiner les données
    combined_data = pd.concat([train_data, substances_data], ignore_index=False)

    # Sauvegarder le fichier combiné
    output_file = '/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/train_final.csv'
    combined_data.to_csv(output_file, index=False)

    return combined_data
 
def remove_outliers_mahalanobis(data, threshold=0.99):
    """
    Remove multivariate outliers using Mahalanobis distance.
    
    Parameters:
    - data: DataFrame of numeric features.
    - threshold: Chi-squared threshold for outlier removal (default 0.99).
    
    Returns:
    - DataFrame without outliers.
    """
    cov_matrix = np.cov(data, rowvar=False)
    inv_cov_matrix = np.linalg.inv(cov_matrix)
    mean_vec = data.mean(axis=0)
    
    def mahalanobis(x):
        diff = x - mean_vec
        return np.sqrt(diff.T @ inv_cov_matrix @ diff)
    
    mahalanobis_distances = data.apply(mahalanobis, axis=1)
    chi2_threshold = chi2.ppf(threshold, df=data.shape[1])
    non_outliers = mahalanobis_distances <= np.sqrt(chi2_threshold)
    
    return data[non_outliers].reset_index(drop=True)

def preprocessing_v1(apply_one_hot=False, apply_scaling=False, apply_pca=False, apply_correlation=False, apply_remove_outliers=False, apply_variance_threshold=False, apply_random_forest=False, enable_categorical = False):
    train_data_og = pd.read_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/train_final.csv')
    test_data_og = pd.read_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/test.csv')
    train_data = train_data_og.copy()
    test_data = test_data_og.copy()
    train_data = train_data.drop(columns=['prod_substance'])
    test_data = test_data.drop(columns=['prod_substance'])
    train_data = train_data.drop(columns=['sample_name'])
    test_data = test_data.drop(columns=['sample_name'])
    non_wavelength_cols = ['device_serial', 'substance_form_display', 'measure_type_display']
    wavelength_cols = train_data.columns[5:]
    
    # Remove NaN values
    train_data = train_data.dropna()
    test_data = test_data.dropna()
    
    if apply_one_hot:
        # One Hot encoding 
        encoder = OneHotEncoder(drop='first',sparse_output=False, handle_unknown='ignore')
        X_train_encoded = encoder.fit_transform(train_data[non_wavelength_cols])
        X_test_encoded = encoder.transform(test_data[non_wavelength_cols])
        
        # Convert encoded features to DataFrame
        X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(non_wavelength_cols))
        X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(non_wavelength_cols))
        
        train_data_combined = pd.concat([pd.DataFrame(X_train_encoded_df), train_data[wavelength_cols].reset_index(drop=True)], axis=1)
        test_data_combined = pd.concat([pd.DataFrame(X_test_encoded_df), test_data[wavelength_cols].reset_index(drop=True)], axis=1)
    else:
        train_data_combined = train_data
        test_data_combined = test_data  
        
    if apply_remove_outliers:
        
        non_outlier_indices = remove_outliers_mahalanobis(train_data_combined[wavelength_cols]).index
        train_data_combined = train_data_combined.loc[non_outlier_indices].reset_index(drop=True)
        print(f"After Mahalanobis outlier removal, train data shape: {train_data_combined.shape}") 
            
    if apply_scaling:
         # Standardisers
        train_data_std = StandardScaler().fit(train_data_combined[wavelength_cols].values)

        # Standardise the data
        wavelength_train_scaled, wavelength_test_scaled = map(
            lambda data, std_mach: std_mach.transform(data),
            [
                train_data_combined[wavelength_cols].values,
                test_data_combined[wavelength_cols].values,
            ],
            [train_data_std, train_data_std],
        )     
        
        train_data_combined[wavelength_cols] = pd.DataFrame(wavelength_train_scaled, columns=wavelength_cols)
        test_data_combined[wavelength_cols] = pd.DataFrame(wavelength_test_scaled, columns=wavelength_cols)   
    
    if enable_categorical:
        # Convertir les colonnes en type `category` pour XGBoost
        for col in non_wavelength_cols:
            if col in train_data.columns:
                train_data[col] = train_data[col].astype('category')
            if col in test_data.columns:
                test_data[col] = test_data[col].astype('category')
    else:
        # Encodage des colonnes catégoriques avec LabelEncoder
        for col in non_wavelength_cols:
            if col in train_data.columns:
                le = LabelEncoder()
                train_data[col] = le.fit_transform(train_data[col].astype(str))
                test_data[col] = le.transform(test_data[col].astype(str)) 
        
    if apply_pca:
        # Perform PCA on scaled wavelength columns
        pca = PCA(n_components=5)
        wavelength_cols = train_data_combined.columns[54:]
        
        X_train_pca = pca.fit_transform(train_data_combined[wavelength_cols])
        X_test_pca = pca.transform(test_data_combined[wavelength_cols])

        # Combine PCA components with original data
        X_train_combined = pd.concat([train_data_combined.iloc[:, :54].reset_index(drop=True), 
                                      pd.DataFrame(X_train_pca, columns=[f'PC{i+1}' for i in range(5)])], axis=1)
        X_test_combined = pd.concat([test_data_combined.iloc[:, :54].reset_index(drop=True), 
                                     pd.DataFrame(X_test_pca, columns=[f'PC{i+1}' for i in range(5)])], axis=1)
        
        train_data_combined = X_train_combined
        test_data_combined = X_test_combined
        
    """ if apply_random_forest:
        # Apply Random Forest for feature selection
        wavelength_cols = train_data_combined.columns[50:] 
        rf = RandomForestRegressor(n_estimators=300, random_state=42)
        y_train = train_data['PURITY'].iloc[train_data_combined.index]
        rf.fit(train_data_combined[wavelength_cols], y_train)
        
        # Select features based on importance
        selector = SelectFromModel(rf, threshold="mean", prefit=True)
        train_data_combined = pd.DataFrame(selector.transform(train_data_combined[wavelength_cols]), 
                                           columns=train_data_combined[wavelength_cols].columns[selector.get_support()])
        test_data_combined = pd.DataFrame(selector.transform(test_data_combined[wavelength_cols]), 
                                          columns=test_data_combined[wavelength_cols].columns[selector.get_support()])
        print(f"Shape after Random Forest feature selection: {train_data_combined.shape}")  """
    
    if apply_variance_threshold:
        # Apply VarianceThreshold
        selector = VarianceThreshold(threshold=0.05)
        train_data_combined = pd.DataFrame(selector.fit_transform(train_data_combined), columns=train_data_combined.columns[selector.get_support(indices=True)])
        test_data_combined = pd.DataFrame(selector.transform(test_data_combined), columns=test_data_combined.columns[selector.get_support(indices=True)])
        print(f"Shape after VarianceThreshold: {train_data_combined.shape}")
    
    #Aussi tester Random forest à la placde de correlation matrix
    if apply_correlation:
    # Compute correlation matrix only for wavelength columns
        #wavelength_cols = train_data_combined.columns[50:] 
        #correlation_matrix = train_data_combined[wavelength_cols].corr()
        correlation_matrix = train_data_combined.corr()

        # Visualize correlation matrix
        plt.figure(figsize=(12, 10))
        sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False)
        plt.title("Correlation Matrix for All Features")
        plt.show()

        # Identify highly correlated features (e.g., |r| > 0.999)
        threshold_high = 0.9999

        print(f"Number of features before removing highly correlated features: {train_data_combined.shape[1]}")
        
        high_corr_pairs = [
            (i, j)
            for i in range(correlation_matrix.shape[0])
            for j in range(i + 1, correlation_matrix.shape[1])
            if abs(correlation_matrix.iloc[i, j]) > threshold_high
        ]
        
        features_to_drop = set()
        for i, j in high_corr_pairs:
            features_to_drop.add(correlation_matrix.columns[j])  # Arbitrarily drop the second feature in the pair

        # Remove the selected features
        train_data_combined = train_data_combined.drop(columns=list(features_to_drop))
        test_data_combined = test_data_combined.drop(columns=list(features_to_drop))
        
        #wavelength_cols = train_data_combined.columns[50:]
        
        print(f"Number of features after removing highly correlated features: {train_data_combined.shape[1]}")
        """ print("Highly correlated features:")
        for i, j in high_corr_pairs:
            print(f"{correlation_matrix.columns[i]} and {correlation_matrix.columns[j]}: {correlation_matrix.iloc[i, j]}") """
        
    if apply_random_forest:
        
        wavelength_cols = train_data_combined.columns[50:]
        X_train_rf = train_data_combined[wavelength_cols]
        y_train_rf = train_data['PURITY'].iloc[train_data_combined.index].squeeze()

        # Create and train random forest model
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train_rf, y_train_rf)

        # Perform feature selection using the specified threshold
        sfm = SelectFromModel(rf_model, threshold=0.0048, prefit=True)
        selected_features = sfm.get_support()
        selected_feature_names = X_train_rf.columns[selected_features]

        # Apply feature selection to train and test data
        train_data_selected = train_data_combined[selected_feature_names]
        test_data_selected = test_data_combined[selected_feature_names]

        # Add back non-wavelength columns if needed
        non_wavelength_cols = [col for col in train_data_combined.columns if col not in wavelength_cols]
        train_data_combined = pd.concat([train_data_combined[non_wavelength_cols], train_data_selected], axis=1)
        test_data_combined = pd.concat([test_data_combined[non_wavelength_cols], test_data_selected], axis=1)

        print(f"Selected {len(selected_feature_names)} features using Random Forest with threshold {0.0048}.")

        # Evaluate feature importances
        feature_importances = rf_model.feature_importances_

        plt.figure(figsize=(10, 6))
        plt.bar(range(len(feature_importances)), feature_importances)
        plt.title("Feature Importances")
        plt.xlabel("Feature Index")
        plt.ylabel("Importance Score")
        plt.show()

        """ # Test different thresholds
        thresholds = [0.0025, 0.003, 0.0035, 0.004, 0.0045, 0.0048, 0.005, 0.0052]
        cross_val_scores = []

        for threshold in thresholds:
            # Select features based on threshold
            sfm = SelectFromModel(rf_model, threshold=threshold, prefit=True)
            selected_features = sfm.get_support()
            selected_feature_names = X_train_rf.columns[selected_features]

            # Subset the dataset
            X_train_selected = X_train_rf[selected_feature_names]

            # Compute cross-validation scores
            scores = cross_val_score(rf_model, X_train_selected, y_train_rf, cv=5, scoring='r2')
            mean_score = scores.mean()
            cross_val_scores.append(mean_score)

            print(f"Threshold: {threshold}")
            print(f"Number of selected features: {len(selected_feature_names)}")
            print(f"Cross-validated R^2 score: {mean_score:.4f}")

        # Plot cross-validated R^2 scores vs. thresholds
        plt.figure(figsize=(8, 5))
        plt.plot(thresholds, cross_val_scores, marker='o')
        plt.title("Cross-Validated R^2 Score vs. Threshold")
        plt.xlabel("Threshold")
        plt.ylabel("Mean R^2 Score")
        plt.grid()
        plt.show() """


        """ # we obtain the names of the unwanted features
        dropped_feature_names = X_train_rf.columns[feature_indices]
        
        train_data_combined = train_data_combined.drop(columns=dropped_feature_names)
        test_data_combined = test_data_combined.drop(columns=dropped_feature_names) """

            
    # Add sample_name column back to the combined DataFrames
    train_data_combined.insert(0, 'sample_name', train_data_og['sample_name'])
    test_data_combined.insert(0, 'sample_name', test_data_og['sample_name'])
    y_train = train_data['PURITY'].iloc[train_data_combined.index]

    print(f"Shape of OG train data: {train_data_og.shape}")
    print(f"Shape of OG test data: {test_data_og.shape}")
    print(f"Shape of train data: {train_data_combined.shape}")
    print(f"Shape of test data: {test_data_combined.shape}")
    print(f"Shape of y_train: {y_train.shape}")
    
    train_data_combined.to_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/train_data_combined.csv', index=False)
    print('Submission file saved successfully.')
    test_data_combined.to_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/test_data_combined.csv', index=False)
    print('Submission file saved successfully.')
            
    return train_data_combined, test_data_combined, y_train

def preprocessing_v0(apply_one_hot=False, apply_scaling=False, apply_pca=False, apply_correlation=False, apply_remove_outliers=False, apply_variance_threshold=False, apply_random_forest=False, enable_categorical=False):
    train_data_og = pd.read_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/train_final.csv')
    test_data_og = pd.read_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/test.csv')
    train_data = train_data_og.copy()
    test_data = test_data_og.copy()
    train_data = train_data.drop(columns=['prod_substance'])
    test_data = test_data.drop(columns=['prod_substance'])
    
    non_wavelength_cols = ['device_serial', 'substance_form_display', 'measure_type_display']
    wavelength_cols = train_data.columns[5:]
    
    # Remove NaN values
    train_data = train_data.dropna()
    test_data = test_data.dropna()
    
    if apply_one_hot:
        # One Hot encoding 
        encoder = OneHotEncoder(drop='first',sparse_output=False, handle_unknown='ignore')
        X_train_encoded = encoder.fit_transform(train_data[non_wavelength_cols])
        X_test_encoded = encoder.transform(test_data[non_wavelength_cols])
        
        # Convert encoded features to DataFrame
        X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(non_wavelength_cols))
        X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(non_wavelength_cols))
        
        train_data_combined = pd.concat([pd.DataFrame(X_train_encoded_df), train_data[wavelength_cols].reset_index(drop=True)], axis=1)
        test_data_combined = pd.concat([pd.DataFrame(X_test_encoded_df), test_data[wavelength_cols].reset_index(drop=True)], axis=1)
    else:
        train_data_combined = train_data
        test_data_combined = test_data  
        
    if apply_remove_outliers:
        
        non_outlier_indices = remove_outliers_mahalanobis(train_data_combined[wavelength_cols]).index
        train_data_combined = train_data_combined.loc[non_outlier_indices].reset_index(drop=True)
        print(f"After Mahalanobis outlier removal, train data shape: {train_data_combined.shape}") 
            
    if apply_scaling:
         # Standardisers
        train_data_std = StandardScaler().fit(train_data_combined[wavelength_cols].values)

        # Standardise the data
        wavelength_train_scaled, wavelength_test_scaled = map(
            lambda data, std_mach: std_mach.transform(data),
            [
                train_data_combined[wavelength_cols].values,
                test_data_combined[wavelength_cols].values,
            ],
            [train_data_std, train_data_std],
        )     
        
        train_data_combined[wavelength_cols] = pd.DataFrame(wavelength_train_scaled, columns=wavelength_cols)
        test_data_combined[wavelength_cols] = pd.DataFrame(wavelength_test_scaled, columns=wavelength_cols)   
        
    if apply_pca:
        # Perform PCA on scaled wavelength columns
        pca = PCA(n_components=5)
        wavelength_cols = train_data_combined.columns[54:]
        
        X_train_pca = pca.fit_transform(train_data_combined[wavelength_cols])
        X_test_pca = pca.transform(test_data_combined[wavelength_cols])

        # Combine PCA components with original data
        X_train_combined = pd.concat([train_data_combined.iloc[:, :54].reset_index(drop=True), 
                                      pd.DataFrame(X_train_pca, columns=[f'PC{i+1}' for i in range(5)])], axis=1)
        X_test_combined = pd.concat([test_data_combined.iloc[:, :54].reset_index(drop=True), 
                                     pd.DataFrame(X_test_pca, columns=[f'PC{i+1}' for i in range(5)])], axis=1)
        
        train_data_combined = X_train_combined
        test_data_combined = X_test_combined
        
    """ if apply_random_forest:
        # Apply Random Forest for feature selection
        wavelength_cols = train_data_combined.columns[50:] 
        rf = RandomForestRegressor(n_estimators=300, random_state=42)
        y_train = train_data['PURITY'].iloc[train_data_combined.index]
        rf.fit(train_data_combined[wavelength_cols], y_train)
        
        # Select features based on importance
        selector = SelectFromModel(rf, threshold="mean", prefit=True)
        train_data_combined = pd.DataFrame(selector.transform(train_data_combined[wavelength_cols]), 
                                           columns=train_data_combined[wavelength_cols].columns[selector.get_support()])
        test_data_combined = pd.DataFrame(selector.transform(test_data_combined[wavelength_cols]), 
                                          columns=test_data_combined[wavelength_cols].columns[selector.get_support()])
        print(f"Shape after Random Forest feature selection: {train_data_combined.shape}")  """
    
    if apply_variance_threshold:
        # Apply VarianceThreshold
        selector = VarianceThreshold(threshold=0.05)
        train_data_combined = pd.DataFrame(selector.fit_transform(train_data_combined), columns=train_data_combined.columns[selector.get_support(indices=True)])
        test_data_combined = pd.DataFrame(selector.transform(test_data_combined), columns=test_data_combined.columns[selector.get_support(indices=True)])
        print(f"Shape after VarianceThreshold: {train_data_combined.shape}")
    
    #Aussi tester Random forest à la placde de correlation matrix
    if apply_correlation:
    # Compute correlation matrix only for wavelength columns
        #wavelength_cols = train_data_combined.columns[50:] 
        #correlation_matrix = train_data_combined[wavelength_cols].corr()
        correlation_matrix = train_data_combined.corr()

        # Visualize correlation matrix
        plt.figure(figsize=(12, 10))
        sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False)
        plt.title("Correlation Matrix for All Features")
        plt.show()

        # Identify highly correlated features (e.g., |r| > 0.999)
        threshold_high = 0.9999

        print(f"Number of features before removing highly correlated features: {train_data_combined.shape[1]}")
        
        high_corr_pairs = [
            (i, j)
            for i in range(correlation_matrix.shape[0])
            for j in range(i + 1, correlation_matrix.shape[1])
            if abs(correlation_matrix.iloc[i, j]) > threshold_high
        ]
        
        features_to_drop = set()
        for i, j in high_corr_pairs:
            features_to_drop.add(correlation_matrix.columns[j])  # Arbitrarily drop the second feature in the pair

        # Remove the selected features
        train_data_combined = train_data_combined.drop(columns=list(features_to_drop))
        test_data_combined = test_data_combined.drop(columns=list(features_to_drop))
        
        #wavelength_cols = train_data_combined.columns[50:]
        
        print(f"Number of features after removing highly correlated features: {train_data_combined.shape[1]}")
        """ print("Highly correlated features:")
        for i, j in high_corr_pairs:
            print(f"{correlation_matrix.columns[i]} and {correlation_matrix.columns[j]}: {correlation_matrix.iloc[i, j]}") """
        
    if apply_random_forest:
        
        wavelength_cols = train_data_combined.columns[50:]
        X_train_rf = train_data_combined[wavelength_cols]
        y_train_rf = train_data['PURITY'].iloc[train_data_combined.index].squeeze()

        # Create and train random forest model
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train_rf, y_train_rf)

        # Perform feature selection using the specified threshold
        sfm = SelectFromModel(rf_model, threshold=0.0048, prefit=True)
        selected_features = sfm.get_support()
        selected_feature_names = X_train_rf.columns[selected_features]

        # Apply feature selection to train and test data
        train_data_selected = train_data_combined[selected_feature_names]
        test_data_selected = test_data_combined[selected_feature_names]

        # Add back non-wavelength columns if needed
        non_wavelength_cols = [col for col in train_data_combined.columns if col not in wavelength_cols]
        train_data_combined = pd.concat([train_data_combined[non_wavelength_cols], train_data_selected], axis=1)
        test_data_combined = pd.concat([test_data_combined[non_wavelength_cols], test_data_selected], axis=1)

        print(f"Selected {len(selected_feature_names)} features using Random Forest with threshold {0.0048}.")

        # Evaluate feature importances
        feature_importances = rf_model.feature_importances_

        plt.figure(figsize=(10, 6))
        plt.bar(range(len(feature_importances)), feature_importances)
        plt.title("Feature Importances")
        plt.xlabel("Feature Index")
        plt.ylabel("Importance Score")
        plt.show()

        """ # Test different thresholds
        thresholds = [0.0025, 0.003, 0.0035, 0.004, 0.0045, 0.0048, 0.005, 0.0052]
        cross_val_scores = []

        for threshold in thresholds:
            # Select features based on threshold
            sfm = SelectFromModel(rf_model, threshold=threshold, prefit=True)
            selected_features = sfm.get_support()
            selected_feature_names = X_train_rf.columns[selected_features]

            # Subset the dataset
            X_train_selected = X_train_rf[selected_feature_names]

            # Compute cross-validation scores
            scores = cross_val_score(rf_model, X_train_selected, y_train_rf, cv=5, scoring='r2')
            mean_score = scores.mean()
            cross_val_scores.append(mean_score)

            print(f"Threshold: {threshold}")
            print(f"Number of selected features: {len(selected_feature_names)}")
            print(f"Cross-validated R^2 score: {mean_score:.4f}")

        # Plot cross-validated R^2 scores vs. thresholds
        plt.figure(figsize=(8, 5))
        plt.plot(thresholds, cross_val_scores, marker='o')
        plt.title("Cross-Validated R^2 Score vs. Threshold")
        plt.xlabel("Threshold")
        plt.ylabel("Mean R^2 Score")
        plt.grid()
        plt.show() """


        """ # we obtain the names of the unwanted features
        dropped_feature_names = X_train_rf.columns[feature_indices]
        
        train_data_combined = train_data_combined.drop(columns=dropped_feature_names)
        test_data_combined = test_data_combined.drop(columns=dropped_feature_names) """

    if enable_categorical:
        # Convertir les colonnes en type `category` pour XGBoost
        for col in non_wavelength_cols:
            if col in train_data.columns:
                train_data[col] = train_data[col].astype('category')
            if col in test_data.columns:
                test_data[col] = test_data[col].astype('category')
    else:
        # Encodage des colonnes catégoriques avec LabelEncoder
        for col in non_wavelength_cols:
            if col in train_data.columns:
                le = LabelEncoder()
                train_data[col] = le.fit_transform(train_data[col].astype(str))
                test_data[col] = le.transform(test_data[col].astype(str)) 
        
            
    # Add sample_name column back to the combined DataFrames
    #train_data_combined.insert(0, 'sample_name', train_data_og['sample_name'])
    #test_data_combined.insert(0, 'sample_name', test_data_og['sample_name'])
    y_train = train_data['PURITY'].iloc[train_data_combined.index]

    print(f"Shape of OG train data: {train_data_og.shape}")
    print(f"Shape of OG test data: {test_data_og.shape}")
    print(f"Shape of train data: {train_data_combined.shape}")
    print(f"Shape of test data: {test_data_combined.shape}")
    print(f"Shape of y_train: {y_train.shape}")
    
    train_data_combined.to_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/train_data_combined.csv', index=False)
    print('Submission file saved successfully.')
    test_data_combined.to_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/test_data_combined.csv', index=False)
    print('Submission file saved successfully.')
            
    return train_data_combined, test_data_combined, y_train
  
def submission_file(y_test_predicted):
    submission_reduced = pd.DataFrame({
        'ID': range(1, len(y_test_predicted) + 1),
        'PURITY': y_test_predicted
    })
    return submission_reduced

def preprocessing_vX(apply_one_hot=False, apply_scaling=True, enable_categorical=False):
    # Charger les fichiers train et test
    train_data = pd.read_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/train_final.csv')
    test_data = pd.read_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/test.csv')
    
    # Supprimer les colonnes inutiles
    non_wavelength_cols = ['sample_name', 'device_serial', 'substance_form_display', 'measure_type_display']
    train_data = train_data.drop(columns=['prod_substance'], errors='ignore')
    test_data = test_data.drop(columns=['prod_substance'], errors='ignore')
    
    # Identifier les colonnes des longueurs d'onde
    wavelength_cols = train_data.columns[6:]

    # Gérer les colonnes catégoriques
    if enable_categorical:
        # Convertir les colonnes en type `category` pour XGBoost
        for col in non_wavelength_cols:
            if col in train_data.columns:
                train_data[col] = train_data[col].astype('category')
            if col in test_data.columns:
                test_data[col] = test_data[col].astype('category')
    else:
        # Encodage des colonnes catégoriques avec LabelEncoder
        for col in non_wavelength_cols:
            if col in train_data.columns:
                le = LabelEncoder()
                train_data[col] = le.fit_transform(train_data[col].astype(str))
                test_data[col] = le.transform(test_data[col].astype(str))

    # Normalisation des colonnes numériques (longueurs d'onde)
    if apply_scaling:
        scaler = StandardScaler()
        train_data[wavelength_cols] = scaler.fit_transform(train_data[wavelength_cols])
        test_data[wavelength_cols] = scaler.transform(test_data[wavelength_cols])

    y_train = train_data['PURITY']
    train_data = train_data.drop(columns=['PURITY'])

    return train_data, test_data, y_train

def xg_boost():
    
    X_train, X_test, y_train = preprocessing_vX(apply_one_hot=False, apply_scaling=True, enable_categorical = True)

    #X_train = X_train.drop(columns=['sample_name'])
    #X_test = X_test.drop(columns=['sample_name'])
    target_column = 'PURITY'


    param_grid = {
        'colsample_bytree': np.linspace(0.5, 1, 5),
        'subsample': np.linspace(0.5, 1, 5),
        'max_depth': np.arange(2, 7, 1)
    }


    model = XGBRegressor(n_estimators=200, max_depth=10, learning_rate=0.1, random_state=42,enable_categorical=True)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Create submission DataFrame
    submission = submission_file(y_pred)
    
    # Save submission to CSV
    submission.to_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/sample_submission_XGB_substances.csv', index=False)
    print('Submission file saved successfully.')   


def linear_regression(feature_importance=False, apply_y_transformation=False):
    X_train, X_test, y_train = preprocessing_vX(apply_one_hot=True, apply_scaling=True)
    X_train = X_train.drop(columns=['sample_name'])
    X_test = X_test.drop(columns=['sample_name'])
    
    if feature_importance:
        X_train, X_test = calculate_feature_importance(X_train, y_train, X_test)
        
    if apply_y_transformation:
        y_train = apply_log_transformation(y_train)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    mse = mean_squared_error(y_train, y_train_pred)
    print('Training MSE:', mse)
    
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print('CV MSE:', -cv_scores.mean())
    
    # Predict on test data
    y_test_pred = model.predict(X_test)
    
    # Create submission DataFrame
    submission = submission_file(y_test_pred)
    
    # Save submission to CSV
    submission.to_csv('/Users/maelysclerget/Desktop/ML/bio322_project/epfl-bio-322-2024/sample_submission_LR.csv', index=False)
    print('Submission file saved successfully.')

def main():
    #add_substance_form_display()
    #rename_and_modify_substance_column()
    #device_serial_proba()
    #add_device_serial()
    #add_sample_name()
    #merge_csv_files()
    #preprocessing_v1()
    #xg_boost()
    linear_regression()
    
if __name__ == '__main__':
    main() 


ValueError: y contains previously unseen labels: '19.0278-P012'