In [16]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from module import clean, enrich, PrincipalComponentAnalysis

In [17]:
#PRETRAITEMENT

# Load data
TrainingData = pd.read_csv('train.csv')
TrainingData = clean(TrainingData)
TrainingData = enrich(TrainingData) 
TrainingData = PrincipalComponentAnalysis(TrainingData, 31, 'RT')

# Split data into training and testing sets
train_data = TrainingData.iloc[:1000]
test_data = TrainingData.iloc[1001:]

# Train Random Forest regression model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
X_train = train_data.drop(['Compound', 'SMILES', 'Lab', 'RT'], axis=1)
y_train = train_data['RT']
rf.fit(X_train, y_train)

# Predict on the test set
X_test = test_data.drop(['Compound', 'SMILES', 'Lab', 'RT'], axis=1)
y_true = test_data['RT']
y_pred = rf.predict(X_test) #numpy array of predicted rt

# POST PROCESSING OF THE FOUND RT

## Dealing with the lab bias 

The retention time depends on the exact configuration of the chromatography within a particular laboratory : RT have a lab-specific biased 

We can evaluate and simulate the supposed bias introduced by the lab's measurments 


## $I$. Compute Lab mean bias

### 0. Chech if training data contains coumpounds measured by different labs

In [19]:
# Create a Mask for Duplicates: compounds duplicates have the value True
duplicate_mask = TrainingData.duplicated(subset='Compound', keep=False)

print("Duplicate Mask:")
print(duplicate_mask)

Duplicate Mask:
0       False
1       False
2        True
3       False
4        True
        ...  
3495     True
3496     True
3497    False
3498    False
3499     True
Length: 3500, dtype: bool


There are true values : TrainingData dataframe contains **duplicates** of certain coumpounds with different RT (measured by different labs).

We want to compute the mean RT for each coumpounds across all labs.

### 1. For each coumpound, we calculate the average RT across all labs (idea of the true RT of this molecule ) ON ENLEVE LES OUTLIERS ?


In [20]:
def mean_RT_for_duplicates(data):
    """
    Calculate the mean RT for compounds with duplicates and add a 'mean_RT' column to the DataFrame.

    Parameters:
    - data (pandas.DataFrame): Input DataFrame containing 'Compound' and 'RT' columns.

    Returns:
    - pandas.DataFrame: DataFrame with an additional 'mean_RT' column.
    """
    # Compute mean RT for each duplicate group
    mean_RT_values = data.groupby('Compound')['RT'].transform('mean')

    # Add mean_RT column to the DataFrame
    data['mean_RT'] = mean_RT_values

    return data

mean_RT_for_duplicates(TrainingData)
TrainingData.head()
    

Unnamed: 0,Compound,SMILES,Lab,PC1,PC2,PC3,PC4,PC5,PC6,PC7,...,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,RT,mean_RT
0,Hydroxytriazolam,OCc1nnc2n1-c1ccc(Cl)cc1C(c1ccccc1Cl)=NC2,CFSRE,-2.987115,-2.417335,23.823439,-10.65998,-5.286818,-2.246669,-4.166992,...,0.103468,1.018187,-1.127222,0.52151,0.244991,-0.170214,-0.721775,2.092839,7.02,7.02
1,5-MeO-DIPT,COc1ccc2[nH]cc(CCN(C(C)C)C(C)C)c2c1,Aarhus,-5.128864,2.157512,0.036157,-0.901335,10.5971,-3.408461,1.32395,...,-0.86773,-6.775252,1.58192,-5.822869,-2.67856,0.905344,-0.093351,-5.075549,4.45,4.45
2,MDMA,CNC(C)Cc1ccc2c(c1)OCO2,Ghent University,-15.090985,-7.167238,-6.118548,3.114494,2.83234,2.070118,-0.5702,...,-0.818473,-3.405627,-5.676069,2.156555,0.175257,2.063924,-1.607796,-3.224355,3.14,3.294
3,Despropionyl N-Benzyl para-Fluoro Norfentanyl,Fc1ccc(NC2CCN(Cc3ccccc3)CC2)cc1,San Francisco OCME,-4.366883,8.986132,-2.599326,2.029549,2.226723,-5.966584,2.230436,...,-4.293212,0.769573,0.417959,-5.010496,2.670682,-2.479414,-1.551006,0.044831,5.95,5.95
4,N-Ethylpentylone,CCCC(NCC)C(=O)c1ccc2c(c1)OCO2,Ghent University,-9.934253,-6.131078,-6.961427,1.916573,-5.864476,13.444401,-5.42082,...,-0.650341,1.509406,2.435432,-0.194201,0.138881,-1.839154,1.674439,2.814727,4.21,5.65


### 2. Calculate each labs mean bias 

a. For each measurement we calculate the Lab-bias compared to the mean RT

Bias (Lab) = Measured RT (Lab) − Mean RT (Compound)


b. For each laboratory, we calculate the mean bias across all its measures : this express the lab's measure tendency

In [21]:
def lab_bias_df(data) : #on trining
    """
    Calculate lab-specific biases in retention time.

    This function calculates lab-specific biases by first computing the mean
    retention time for compounds with duplicates in the provided dataset.
    It then calculates the lab-specific bias for each data point by subtracting
    the mean retention time from the actual retention time. 
    Finally, the function computes the lab-specific mean bias and 
    creates a new DataFrame containing every labs and their mean bias.

    Parameters:
    - data (pandas.DataFrame): Input (train) DataFrame containing 'RT', 'Lab', and other relevant columns.

    Returns:
    - pandas.DataFrame: DataFrame containing Labs and their lab-specific biases.
    """
    #prerequisite dataset treatment
    mean_RT_for_duplicates(data) #adds column mean rt to data
    
    #Calculate Lab-Specific Bias
    data['Bias'] = data['RT'] - data['mean_RT']
    
    # Calculate mean bias for each lab :pd serie
    mean_bias = data.groupby('Lab')['Bias'].mean().reset_index()

    # Create a new DataFrame with unique Lab values and their corresponding mean bias
    lab_bias_df = pd.DataFrame({
        'Lab': mean_bias['Lab'],
        'LabMeanBias': mean_bias['Bias']
    })
    
    return lab_bias_df

In [22]:
lab_bias_df(train_data).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Lab          24 non-null     object 
 1   LabMeanBias  24 non-null     float64
dtypes: float64(1), object(1)
memory usage: 512.0+ bytes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['mean_RT'] = mean_RT_values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Bias'] = data['RT'] - data['mean_RT']


## $II$. Adding the bias in the model's RT output 

In [23]:
def unbiased_RT(RTdf, test, lab_bias): # to use on the final prediction (dataframe containing only the guessed rt)
    """
    Compute the corrected Retention Time (RT) without the lab bias. if the lab is unknown, the rt is not modified.

    Parameters:
    - RTdf (pandas.DataFrame): RT predictions from the test set (dataframe containing only the guessed rt)
    - test (pandas.DataFrame): test dataframe
    - lab_bias (pandas.DataFrame): DataFrame containing Labs and their lab_mean_bias (calculated from train)

    Returns:
    - pandas.DataFrame: new df containing only the corrected RT
    """
    # create a new column 'lab_bias' in test, that contains the lab's mean bias (found in 'lab_bias' df),
    # if the lab is unknown, consider its mean bias as 0
    test['lab_bias'] = RTdf['Lab'].map(lab_bias.set_index('Lab')['LabMeanBias']).fillna(0)
    test.head()
    # create new df 'ordered_lab_bias' only containing the 'lab bias' column of test
    ordered_lab_bias = test['lab_bias']
    ordered_lab_bias.head()
    # compute the corrected RT without the lab bias
    RTdf['Corrected_RT'] = RTdf['RT'] + ordered_lab_bias
    RTdf.head()
    RTdf = RTdf.drop('RT', axis=1)
    
    return RTdf


test des fonction : ici sur l'exemple du random forest

In [29]:
# Remove lab bias 
lab_bias = lab_bias_df(TrainingData) #df containing labs and labb bias 
y_pred_df = pd.DataFrame({'Lab': test_data['Lab'], 'RT': y_pred})#df containing the labs and predicted RT from test

y_pred_df = unbiased_RT(y_pred_df, test_data, lab_bias) #df containing only the corrected rt 

y_pred_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['lab_bias'] = RTdf['Lab'].map(lab_bias.set_index('Lab')['LabMeanBias']).fillna(0)


Unnamed: 0,Lab,Corrected_RT
1001,CFSRE,9.558090
1002,Mainz,9.118091
1003,Finnish Customs Laboratory,7.928069
1004,Mainz,13.113181
1005,Copenhagen,8.434689
...,...,...
3495,CFSRE,7.687420
3496,Adelaide,4.229680
3497,University Hospital of Northern Norway,11.142313
3498,University of Athens,5.577822


In [None]:

# Evaluate model performance using mean squared error
mse = mean_squared_error(test_data['RT'], y_pred_df['Corrected_RT'])
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")


## apply this on real test data and save final file for evaluation 

### apply on real test.csv

final form for submission : df containing only the predicted RT

In [33]:
# Load data
TrainingData = pd.read_csv('train.csv')
TrainingData = clean(TrainingData)
TrainingData = enrich(TrainingData) 
TrainingData = PrincipalComponentAnalysis(TrainingData, 31, 'RT')
train_data = TrainingData

TestData = pd.read_csv('test.csv')
TestData = clean(TestData) # ATTENTION CHECK QUON DE SUPPRIME PAS DE LIGNES 
TestData = enrich(TestData)
TrainingData = PrincipalComponentAnalysis(TestData, 31, 'RT') #PAS POSSIBLE D'AVOIR RT
test_data = TestData

# Train Random Forest regression model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
X_train = train_data.drop(['Compound', 'SMILES', 'Lab', 'RT'], axis=1) #imput data
y_train = train_data['RT'] #target data
rf.fit(X_train, y_train) #build a forest of trees from X and y

# Predict on the test set
X_test = test_data.drop(['Compound', 'SMILES', 'Lab'], axis=1)
y_pred = rf.predict(X_test) #numpy array of pre

KeyError: 'RT'

In [30]:

#final form of submission 
y_pred_df.rename(columns={'Correcte_RT': 'RT'}, inplace=True) #rename the column  'RT'
y_pred_df.drop('Lab', axis=1, inplace=True)
y_pred_df


Unnamed: 0,Corrected_RT
1001,9.558090
1002,9.118091
1003,7.928069
1004,13.113181
1005,8.434689
...,...
3495,7.687420
3496,4.229680
3497,11.142313
3498,5.577822
