La première partie du projet sera liée à l'étude des data


In [70]:
# def knn():
# PURITY log transform  
import pandas as pd
import numpy as np
import os 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt


def inspect_data_grouped_and_save(file_path):
    """
    Inspecte les données d'un fichier CSV en regroupant les colonnes
    par groupes de 5 pour les colonnes numériques et sauvegarde les graphiques
    dans un dossier 'data_visualisation'.
    
    Args:
        file_path (str): Chemin vers le fichier CSV à analyser.
    """
    # Charger les données
    data = pd.read_csv(file_path)
    
    # Créer un dossier pour sauvegarder les graphiques
    output_dir = 'data_visualisation'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Répertoire créé : {output_dir}")
    
    # Vérifier les colonnes catégoriques
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
    
    # Vérifier les colonnes numériques
    numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns
    
    # Regrouper les colonnes numériques par groupes de 5 et sauvegarder les graphiques
    for i in range(0, len(numerical_columns), 5):
        cols_group = numerical_columns[i:i+5]
        plt.figure(figsize=(12, 8))
        
        for col in cols_group:
            sns.kdeplot(data[col], label=col, fill=True, alpha=0.4)
        
        plt.title(f"Distribution groupée des colonnes : {', '.join(cols_group)}")
        plt.xlabel("Valeur")
        plt.ylabel("Densité")
        plt.legend(title="Colonnes")
        
        # Sauvegarder le graphique
        graph_path = os.path.join(output_dir, f"distribution_group_{i//5 + 1}.png")
        plt.savefig(graph_path)
        plt.close()
        print(f"Graphique sauvegardé : {graph_path}")
    
    # Répartition des colonnes catégoriques et sauvegarde des graphiques
    for col in categorical_columns:
        plt.figure(figsize=(8, 6))
        sns.countplot(x=col, data=data, palette='viridis')
        plt.title(f"Répartition des catégories dans {col}")
        plt.xlabel(col)
        plt.ylabel("Fréquence")
        plt.xticks(rotation=45)
        
        # Sauvegarder le graphique
        graph_path = os.path.join(output_dir, f"category_distribution_{col}.png")
        plt.savefig(graph_path)
        plt.close()
        print(f"Graphique sauvegardé : {graph_path}")

    # Inspecter les relations entre les colonnes numériques et une cible potentielle (ex. PURITY)
    if 'PURITY' in data.columns:
        for i in range(0, len(numerical_columns), 5):
            cols_group = numerical_columns[i:i+5]
            plt.figure(figsize=(12, 8))
            
            for col in cols_group:
                if col != 'PURITY':  # Ne pas tracer la cible elle-même
                    sns.scatterplot(x=data[col], y=data['PURITY'], label=col, alpha=0.7)
            
            plt.title(f"Relation entre PURITY et les colonnes : {', '.join(cols_group)}")
            plt.xlabel("Valeur des colonnes")
            plt.ylabel("PURITY")
            plt.legend(title="Colonnes")
            
            # Sauvegarder le graphique
            graph_path = os.path.join(output_dir, f"purity_relationship_group_{i//5 + 1}.png")
            plt.savefig(graph_path)
            plt.close()
            print(f"Graphique sauvegardé : {graph_path}")

    print("\nInspection des données terminée. Tous les graphiques ont été sauvegardés dans le dossier 'data_visualisation'.")
    
    
    



def preprocessing_v1(apply_one_hot=False, apply_scaling=True, apply_pca=False, apply_correlation=False, apply_remove_outliers=True):
    train_data_og = pd.read_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/train.csv')
    test_data_og = pd.read_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/test.csv')
    train_data = train_data_og.copy()
    test_data = test_data_og.copy()
    train_data = train_data.drop(columns=['prod_substance'])
    test_data = test_data.drop(columns=['prod_substance'])
    
    non_wavelength_cols = ['device_serial', 'substance_form_display', 'measure_type_display']
    wavelength_cols = train_data.columns[6:]
    
    if apply_one_hot:
        # One Hot encoding 
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        X_train_encoded = encoder.fit_transform(train_data[non_wavelength_cols])
        X_test_encoded = encoder.transform(test_data[non_wavelength_cols])
        
        # Convert encoded features to DataFrame
        X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(non_wavelength_cols))
        X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(non_wavelength_cols))
        
        train_data_combined = pd.concat([pd.DataFrame(X_train_encoded_df), train_data[wavelength_cols].reset_index(drop=True)], axis=1)
        test_data_combined = pd.concat([pd.DataFrame(X_test_encoded_df), test_data[wavelength_cols].reset_index(drop=True)], axis=1)
    else:
        train_data_combined = train_data
        test_data_combined = test_data
    
    # Add sample_name column back to the combined DataFrames
    train_data_combined.insert(0, 'sample_name', train_data_og['sample_name'])
    test_data_combined.insert(0, 'sample_name', test_data_og['sample_name'])
    
    # Remove NaN values
    train_data_combined = train_data_combined.dropna()
    test_data_combined = test_data_combined.dropna()
    
    y_train = train_data['PURITY'].iloc[train_data_combined.index]
    
    #NOTE: toujours mettre scaling avec remove_outliers
    
    if apply_scaling:
        # Standardize the data
        scaler = StandardScaler()
        wavelength_train_scaled = scaler.fit_transform(train_data_combined[wavelength_cols])
        wavelength_test_scaled = scaler.transform(test_data_combined[wavelength_cols])
        
        train_data_combined[wavelength_cols] = wavelength_train_scaled
        test_data_combined[wavelength_cols] = wavelength_test_scaled
    
    if apply_remove_outliers:
        # Remove outliers
        outliers_index = (np.abs(wavelength_train_scaled) > 3).any(axis=1)
        train_data_combined = train_data_combined[~outliers_index]
        y_train = y_train[~outliers_index]
        
        train_data_combined = train_data_combined.reset_index(drop=True)
        test_data_combined = test_data_combined.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        
    if apply_pca:
        # Perform PCA on scaled wavelength columns
        pca = PCA(n_components=5)
        wavelength_cols = train_data_combined.columns[54:]
        
        X_train_pca = pca.fit_transform(train_data_combined[wavelength_cols])
        X_test_pca = pca.transform(test_data_combined[wavelength_cols])

        # Combine PCA components with original data
        X_train_combined = pd.concat([train_data_combined.iloc[:, :54].reset_index(drop=True), 
                                      pd.DataFrame(X_train_pca, columns=[f'PC{i+1}' for i in range(5)])], axis=1)
        X_test_combined = pd.concat([test_data_combined.iloc[:, :54].reset_index(drop=True), 
                                     pd.DataFrame(X_test_pca, columns=[f'PC{i+1}' for i in range(5)])], axis=1)
        
        train_data_combined = X_train_combined
        test_data_combined = X_test_combined
        
    if apply_correlation:
    # Compute correlation matrix only for wavelength columns
        wavelength_cols = train_data_combined.columns[54:] #pq t'as mis ca
        correlation_matrix = train_data_combined[wavelength_cols].corr()

        # Visualize correlation matrix
        plt.figure(figsize=(12, 10))
        sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False)
        plt.title("Correlation Matrix for Wavelength Features")
        plt.show()

        # Identify highly correlated features (e.g., |r| > 0.999)
        threshold_high = 0.999
        threshold_low = 0.2

        high_corr_pairs = [
            (i, j)
            for i in range(correlation_matrix.shape[0])
            for j in range(i + 1, correlation_matrix.shape[1])
            if abs(correlation_matrix.iloc[i, j]) > threshold_high
        ]
        
        features_to_drop = set()
        for i, j in high_corr_pairs:
            features_to_drop.add(wavelength_cols[j])  # Arbitrarily drop the second feature in the pair

        # Remove the selected features
        train_data_combined = train_data_combined.drop(columns=list(features_to_drop))
        test_data_combined = test_data_combined.drop(columns=list(features_to_drop))
        
        low_corr_pairs = [
            (i, j)
            for i in range(correlation_matrix.shape[0])
            for j in range(i + 1, correlation_matrix.shape[1])
            if abs(correlation_matrix.iloc[i, j]) < threshold_low
        ]

        print("Highly correlated features:")
        for i, j in high_corr_pairs:
            print(f"{wavelength_cols[i]} and {wavelength_cols[j]}: {correlation_matrix.iloc[i, j]}")
        print("Low correlated features:")
        for i, j in low_corr_pairs:
            print(f"{wavelength_cols[i]} and {wavelength_cols[j]}: {correlation_matrix.iloc[i, j]}")
    
    
    return train_data_combined, test_data_combined, y_train

   
def calculate_feature_importance(X_train, y_train, X_test, threshold=0.25):
    # Calculate feature importance using a linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Print feature importance
    feature_importance = pd.Series(model.coef_, index=X_train.columns)
    feature_importance = feature_importance.abs().sort_values(ascending=False)
    wavelength_feature_importance_df = feature_importance.reset_index()
    wavelength_feature_importance_df.columns = ['Feature', 'Importance']
    wavelength_feature_importance_df.to_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/feature_importance_LR1.csv', index=False)
    print('Feature Importance saved successfully.')
    
    # Calculate stats threshold
    threshold_value = feature_importance.quantile(threshold)
    
    # Identify low-importance features
    low_importance_features = feature_importance[feature_importance < threshold_value].index
    print(f'Low importance features: {low_importance_features}')
    
    # Remove low-importance features
    X_train_reduced = X_train.drop(columns=low_importance_features)
    X_test_reduced = X_test.drop(columns=low_importance_features)
    
    return X_train_reduced, X_test_reduced

def submission_file(y_test_predicted):
    submission_reduced = pd.DataFrame({
        'ID': range(1, len(y_test_predicted) + 1),
        'PURITY': y_test_predicted
    })
    return submission_reduced


def linear_regression(feature_importance=False):
    X_train, X_test, y_train = preprocessing_v1(apply_one_hot=True, apply_correlation=True, apply_scaling=True, apply_remove_outliers=True)
    X_train = X_train.drop(columns=['sample_name'])
    X_test = X_test.drop(columns=['sample_name'])
    
    if feature_importance:
        X_train, X_test = calculate_feature_importance(X_train, y_train, X_test)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    mse = mean_squared_error(y_train, y_train_pred)
    print('Training MSE:', mse)
    
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print('CV MSE:', -cv_scores.mean())
    
    # Predict on test data
    y_test_pred = model.predict(X_test)
    
    # Create submission DataFrame
    submission = submission_file(y_test_pred)
    
    # Save submission to CSV
    submission.to_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/sample_submission_LR.csv', index=False)
    print('Submission file saved successfully.')

def polynomial_regression():
    X_train, X_test, y_train = preprocessing_v1(apply_one_hot=True, apply_scaling=True, remove_outliers=True)
    X_train = X_train.drop(columns=['sample_name'])
    X_test = X_test.drop(columns=['sample_name'])
    
    # Define the pipeline
    pipeline = Pipeline([
        ("polynomial", PolynomialFeatures()),
        ("regression", LinearRegression())
    ])
    
    # Define the parameter grid
    param_grid = {
        "polynomial__degree": np.arange(1, 4, 1)
    }
    
    # Initialize GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=10, 
                               scoring='neg_mean_squared_error', 
                               return_train_score=True)
    
    # Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    print("Best accuracy:", grid_search.best_score_)
    print("Best parameters:", grid_search.best_params_)
    
    # Predict on training data
    y_train_pred = best_model.predict(X_train)
    mse = mean_squared_error(y_train, y_train_pred)
    print('Training MSE:', mse)
    
    # Cross-validation score
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print('CV MSE:', -cv_scores.mean())
    
    # Predict on test data
    y_test_pred = best_model.predict(X_test)
    
    # Create submission DataFrame
    submission = submission_file(y_test_pred)
    
    # Save submission to CSV
    submission.to_csv('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/sample_submission_POLY.csv', index=False)
    print('Submission file saved successfully.')


def logistic_regression_with_preprocessed_data():
    """
    Implémente une régression logistique sur des données prétraitées
    avec preprocessing_v1. Évalue le modèle et génère un fichier
    de soumission.
    """
    # Appliquer le prétraitement
    X_train, X_test, y_train = preprocessing_v1(
        apply_one_hot=True,
        apply_scaling=True,
        apply_remove_outliers=True,
        apply_pca=False,  # Désactiver PCA pour garder toutes les dimensions
        apply_correlation=False  # Désactiver le nettoyage par corrélation
    )

    # Supprimer la colonne 'sample_name' inutilisée
    X_train = X_train.drop(columns=['sample_name'], errors='ignore')
    X_test = X_test.drop(columns=['sample_name'], errors='ignore')

    # Binariser la cible pour une tâche de classification binaire
    threshold = 0.5  # Seuil pour séparer les classes
    y_train_binary = (y_train > threshold).astype(int)  # 0 si ≤ seuil, 1 si > seuil

    # Initialiser et entraîner le modèle de régression logistique
    model = LogisticRegression(max_iter=1000, solver='lbfgs')
    model.fit(X_train, y_train_binary)

    # Évaluer les performances sur les données d'entraînement
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train_binary, y_train_pred)
    print("Training Accuracy:", train_accuracy)
    print("\nClassification Report (Training):")
    print(classification_report(y_train_binary, y_train_pred))

    # Calculer l'AUC (Area Under Curve)
    y_train_proba = model.predict_proba(X_train)[:, 1]
    auc = roc_auc_score(y_train_binary, y_train_proba)
    print(f"Training AUC: {auc}")

    # Prédictions sur l'ensemble de test
    y_test_pred = model.predict_proba(X_test)[:, 1]  # Probabilités pour la classe positive

    # Générer le fichier de soumission
    submission = submission_file(y_test_pred)
    output_dir = '/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024'
    submission_path = os.path.join(output_dir, 'sample_submission_Logistic.csv') 
    submission.to_csv(submission_path, index=False)
    print(f"Submission file saved successfully ")
    
    

def main():
    #linear_regression(feature_importance=True)
    #polynomial_regression()
    #logistic_regression_with_preprocessed_data()
    inspect_data_grouped_and_save('/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024/train.csv')
    
    

if __name__ == '__main__':
    main() 

Graphique sauvegardé : data_visualisation/distribution_group_1.png
Graphique sauvegardé : data_visualisation/distribution_group_2.png
Graphique sauvegardé : data_visualisation/distribution_group_3.png
Graphique sauvegardé : data_visualisation/distribution_group_4.png
Graphique sauvegardé : data_visualisation/distribution_group_5.png
Graphique sauvegardé : data_visualisation/distribution_group_6.png
Graphique sauvegardé : data_visualisation/distribution_group_7.png
Graphique sauvegardé : data_visualisation/distribution_group_8.png
Graphique sauvegardé : data_visualisation/distribution_group_9.png
Graphique sauvegardé : data_visualisation/distribution_group_10.png
Graphique sauvegardé : data_visualisation/distribution_group_11.png
Graphique sauvegardé : data_visualisation/distribution_group_12.png
Graphique sauvegardé : data_visualisation/distribution_group_13.png
Graphique sauvegardé : data_visualisation/distribution_group_14.png
Graphique sauvegardé : data_visualisation/distribution_gr


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x=col, data=data, palette='viridis')


Graphique sauvegardé : data_visualisation/category_distribution_sample_name.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x=col, data=data, palette='viridis')


Graphique sauvegardé : data_visualisation/category_distribution_device_serial.png
Graphique sauvegardé : data_visualisation/category_distribution_substance_form_display.png
Graphique sauvegardé : data_visualisation/category_distribution_measure_type_display.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x=col, data=data, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x=col, data=data, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x=col, data=data, palette='viridis')


Graphique sauvegardé : data_visualisation/category_distribution_prod_substance.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_1.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_2.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_3.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_4.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_5.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_6.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_7.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_8.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_9.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_10.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_11.png
Graphique sauvegardé : data_visualisation/purity_relationship_group_12.png
Graphique sauvegardé : dat