In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

In [2]:
data = pd.read_excel(r"C:\Users\MGUELMOUSSI\OneDrive - Dehon Group\inventec\Inventec_local\IA substitution PFAS\030723_BDD_prototype2_PFAS.xlsx")
data = data.loc[:,['Inchi', 'PFAS oui/non', "Point d'ébullition", 'viscosité cinematique en Cst', 'tension de surface (dyn/cm)', 'GWP', 'toxicité']]
data.head()

Unnamed: 0,Inchi,PFAS oui/non,Point d'ébullition,viscosité cinematique en Cst,tension de surface (dyn/cm),GWP,toxicité
0,1S/C2H2Cl2/c3-1-2-4/h1-2H/b2-1+,0,48.0,0.32,14.0,0.0,4
1,"1S/C5H3F9O/c1-15-5(13,14)2(6,3(7,8)9)4(10,11)1...",1,61.0,0.3,14.0,320.0,5
2,"1S/C6H5F9O/c1-2-16-6(14,15)3(7,4(8,9)10)5(11,1...",1,76.0,0.43,14.0,57.0,5
3,"1S/C6F12O/c7-2(4(10,11)12,5(13,14)15)1(19)3(8,...",1,49.0,0.4,11.0,1.0,5
4,"1S/C5F12O/c6-1(7,2(8,9)10)4(14,15)18-5(16,17)3...",1,170.0,1.81,17.0,7800.0,5


In [3]:
def custom_encode(inchi_string):
    # Split the InChI string into characters and numbers
    tokens = []
    current_token = inchi_string[0]
    for char in inchi_string[1:]:
        if char.isalpha() == current_token[0].isalpha():
            current_token += char
        else:
            tokens.append(current_token)
            current_token = char
    tokens.append(current_token)
    
    # Encode the tokens into numerical values
    encoded_tokens = []
    for token in tokens:
        if token[0].isalpha():
            encoded_tokens.append(ord(token[0]))
        else:
            # Handle characters that are not alphabetic or numeric
            encoded_tokens.extend([ord(char) for char in token])
    
    return encoded_tokens

encoded_inchi = custom_encode('1S/C2H2Cl2/c3-1-2-4/h1-2H/b2-1+')
type(encoded_inchi)

list

In [4]:
def custom_decode(encoded_tokens):
    # Initialize variables
    decoded_string = ""
    current_token = ""

    # Iterate through the encoded tokens
    for i, token in enumerate(encoded_tokens):
        if token >= ord('A') and token <= ord('Z'):
            # If the token is in uppercase, it represents a character
            current_token += chr(token)
        else:
            # If the token is a number (or other ASCII value), it's part of the previous character
            current_token += chr(token)
        
        # Check if this is the last token or if the next token is a digit
        if i == len(encoded_tokens) - 1 or (i < len(encoded_tokens) - 1 and encoded_tokens[i + 1] >= ord('0') and encoded_tokens[i + 1] <= ord('9')):
            # Add the completed token to the decoded string
            decoded_string += current_token
            current_token = ""
    
    return decoded_string

decoded_string = custom_decode(encoded_inchi)
print("Original InChI:", '1S/C2H2Cl2/c3-1-2-4/h1-2H/b2-1+')
print("Decoded InChI:", decoded_string)

Original InChI: 1S/C2H2Cl2/c3-1-2-4/h1-2H/b2-1+
Decoded InChI: 1S/C2H2C2/c3-1-2-4/h1-2H/b2-1+


In [10]:
len('1S/C11H3F19O2/c1-31-7(3(13,10(25,26)27)11(28,29)30)5(16,17)4(14,15)6(18,32-7)2(12,8(19,20)21)9(22,23)24/h1H3')

108

In [5]:
df = pd.DataFrame()

# Apply custom_encode to the "Inchi" column and store the results in inchi_df
df['Encoded_Inchi'] = data['Inchi'].apply(custom_encode)

max_columns = max(len(row) for row in df['Encoded_Inchi'])

# Expand the "Encoded_Inchi" column into separate columns with NaN for missing values
inchi_df = df['Encoded_Inchi'].apply(lambda x: x + [None] * (max_columns - len(x)))
inchi_df = pd.DataFrame(inchi_df.tolist(), columns=[f'Column_{i+1}' for i in range(max_columns)])

inchi_df = inchi_df.fillna(0)
inchi_df.head()

Unnamed: 0,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,Column_10,...,Column_99,Column_100,Column_101,Column_102,Column_103,Column_104,Column_105,Column_106,Column_107,Column_108
0,49,83,47,67,50,72,50,67,50,47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,83,47,67,53,72,51,70,57,79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,83,47,67,54,72,53,70,57,79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49,83,47,67,54,70,49,50,79,47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,49,83,47,67,53,70,49,50,79,47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
X = data.drop(columns=["Inchi"])
y = inchi_df

In [15]:
inchi_df.columns

Index(['Column_1', 'Column_2', 'Column_3', 'Column_4', 'Column_5', 'Column_6',
       'Column_7', 'Column_8', 'Column_9', 'Column_10',
       ...
       'Column_99', 'Column_100', 'Column_101', 'Column_102', 'Column_103',
       'Column_104', 'Column_105', 'Column_106', 'Column_107', 'Column_108'],
      dtype='object', length=108)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Choose a model (Random Forest Regressor)
model = RandomForestRegressor()

model.fit(X_train, y_train)

RandomForestRegressor()

In [18]:
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 7.015034722222223


In [24]:
test_df = pd.read_excel(r"C:\Users\MGUELMOUSSI\OneDrive - Dehon Group\inventec\Inventec_local\IA substitution PFAS\test_substitution.xlsx")
test_df = test_df.loc[:,['PFAS oui/non', "Point d'ébullition", 'viscosité cinematique en Cst', 'tension de surface (dyn/cm)', 'GWP', 'toxicité']]
test_df

Unnamed: 0,PFAS oui/non,Point d'ébullition,viscosité cinematique en Cst,tension de surface (dyn/cm),GWP,toxicité
0,0,185.0,0.41,22.3,120,1
1,0,92.8,0.37,23.0,4600,4


In [38]:
test_pred = model.predict(test_df)

In [39]:
decoded_values = []
for row in test_pred:
    decoded_row = [custom_decode([int(val) for val in row])]
    decoded_values.append(decoded_row)

# Convert the result to a NumPy array
decoded_values_array = np.array(decoded_values)  
decoded_values_array


array([['1S/C3=@@;5667.85-("\x19\x1c\x17\x1c\x19\x19\x1d\x14\x16\x15\x12\x14\x12\x12\x14\x13\x12\x14\x13\x11\x14\x12\x12\x12\x15\x13\x13\x15\x11\x12\x13\x11\x13\x11\x12\x14\x12\x13\x14\x12\x12\x11\x11\x13\x12\x12\x14\x11\x14\x12\x12\x14\x10\x11\x12\x11\x11\x10\x12\x0f\x19\x12\x14\x13\x0f\x11\x16\x10\n\n\x08\n\n\x08\x0b\x08\n\n\t\n\n\x08\n\n\t\x15\n\x0f\n'],
       ["1S/C2D8E4?6C3-2-4),''%:'-,\x11\x11\x10\x0c\t\t\x08\n\x08\t\t\x08\x08\x08\x08\x08\x08\t\x08\x08\x08\x08\n\x08\x08\t\x07\x07\x07\x06\x07\x07\x07\x07\x06\x06\x07\x06\x06\x07\x06\n\x06\x07\x07\x03\x03\x04\x03\x03\x03\x04\x03\x05\x04\x04\x04\x03\x03\x05\x03\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x05\x02\x03\x02"]],
      dtype='<U108')

In [19]:
# Perform cross-validation (e.g., 5-fold cross-validation)
scores = cross_val_score(model, X, y, scoring="neg_mean_absolute_error", cv=5)
mae_scores = -scores  # Convert negative scores to positive MAE

In [20]:
# Calculate the mean MAE from cross-validation
mean_mae = mae_scores.mean()
print("Mean Cross-Validated MAE:", mean_mae)

Mean Cross-Validated MAE: 8.587084986772485
