In [1]:
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import os

In [2]:


class MoleculeProcessor:
    """Derive model input features for one molecule given as a SMILES string.

    Attributes:
        mol: RDKit molecule parsed from the SMILES passed to ``__init__``.
        dtx: Identifier used as the row label of the fingerprint frame.
    """

    def __init__(self, smiles, dtx):
        """Parse ``smiles`` and remember the identifier ``dtx``.

        Raises:
            ValueError: if RDKit cannot parse ``smiles`` (``MolFromSmiles``
                returned ``None``), so the failure surfaces here rather than
                as an opaque error inside a later method call.
        """
        self.mol = Chem.MolFromSmiles(smiles)
        if self.mol is None:
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
        self.dtx = dtx

    def chain_length(self, ch=30):
        """Return the longest run of consecutive ``C(F)(F)`` units in the molecule.

        Patterns of 1, 2, ..., ``ch - 1`` repeated units are probed in order and
        the search stops at the first length that does not match.

        Args:
            ch: Exclusive upper bound on the number of repeated units probed.

        Returns:
            The largest probed length that matched. 0 when even a single unit
            is absent or when ``ch <= 1``; ``ch - 1`` when every probed length
            matched (the true chain may be longer than the probe limit).
        """
        unit = 'C(F)(F)'
        longest = 0
        for n_units in range(1, ch):
            pattern = Chem.MolFromSmarts(unit * n_units)
            if not self.mol.HasSubstructMatch(pattern):
                # First miss decides the answer; longer patterns need no probing.
                break
            longest = n_units
        return longest

    def get_morgan_fingerprint(self, radius = 3, nBits=1024):
        """Return the Morgan bit fingerprint as a one-row DataFrame.

        Args:
            radius: Morgan fingerprint radius handed to RDKit.
            nBits: Length of the bit vector, i.e. the number of columns.

        Returns:
            DataFrame with a single row labelled ``self.dtx`` and columns
            ``mrgn_0`` ... ``mrgn_{nBits-1}`` holding the bit values.
        """
        # NOTE(review): recent RDKit releases reportedly deprecate this call in
        # favour of rdFingerprintGenerator; kept as-is so the features stay
        # identical to what the pickled model was given - confirm before switching.
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(self.mol, radius, nBits=nBits)
        mgrn_df = pd.DataFrame([np.array(bit_vector)])
        mgrn_df.columns = ['mrgn_%d' % i for i in mgrn_df.columns]
        mgrn_df.index = [self.dtx]
        return mgrn_df

class CategoryManager:
    """Builds single-row model inputs, accepting only a known set of categories."""

    def __init__(self, categories):
        # Collection of category labels that make_df is willing to accept.
        self.categories = categories

    def make_df(self, smiles, dtx, cat='others'):
        """
        Construct the appropriate input df to facilitate RF predictions.

        The frame holds the Morgan fingerprint columns for ``smiles`` followed
        by a ``category`` column (categorical dtype, set to ``cat``) and a
        ``chain_length`` column; its single row is labelled ``dtx``.

        Raises:
            ValueError: if ``cat`` is not one of ``self.categories``.
        """
        if cat in self.categories:
            processor = MoleculeProcessor(smiles, dtx)
            features = processor.get_morgan_fingerprint()
            n_chain = processor.chain_length()
            features = features.assign(category=cat, chain_length=n_chain)
            return features.astype({'category': 'category'})
        raise ValueError(f"Category {cat} is not recognized.")
    


# Category labels accepted by CategoryManager.make_df.
categories = [
    'Aromatic PFASs',
    'HFCs',
    'Other PFASs',
    'Other PFASs, cyclic',
    'PASF-based substances',
    'PFAA precursors',
    'PFAA precursors, cyclic',
    'PFAAs',
    'PFAAs, cyclic',
    'PolyFCA derivatives',
    'Polyfluoroalkanes',
    'Polyfluoroalkyl acids',
    'Polyfluoroalkyl acids, cyclic',
    'Si PFASs',
    'n:2 fluorotelomer-based substances',
    'others',
    'others, cyclic',
    'unclassified',
]




In [53]:
class CategoryPrediction:
    """Wraps a fitted model plus the lookup that maps its output to a category.

    Attributes:
        rev_dict: Mapping from a raw model prediction to the terminal category.
        rf_model: Object exposing ``predict(df)``.
    """

    def __init__(self, rev_dict='reverse_dict2.pkl', rf_model='final_model_v2.sav' ):
        """Load the lookup and the model.

        Args:
            rev_dict: Path to a pickled lookup, or the already-loaded lookup.
            rf_model: Path to a pickled model, or the already-loaded model.

        Relative paths are opened as given, i.e. relative to the current
        working directory. Already-loaded objects are stored unchanged, so the
        class works whether the caller holds paths or unpickled objects.
        """
        self.rev_dict = self._load_if_path(rev_dict)
        self.rf_model = self._load_if_path(rf_model)

    @staticmethod
    def _load_if_path(source):
        """Unpickle ``source`` when it is a filesystem path; otherwise return it as-is."""
        if isinstance(source, (str, bytes, os.PathLike)):
            # SECURITY: pickle.load can execute arbitrary code - only point
            # this at files from a trusted origin.
            # The context manager closes the handle even if unpickling fails.
            with open(source, 'rb') as handle:
                return pickle.load(handle)
        return source

    def make_prediction(self, df):
        '''
        Make prediction using the RF model and convert back into the original terminal categories

        Only the first element returned by ``predict`` is used, so ``df`` is
        expected to describe a single molecule.
        '''
        pred = self.rf_model.predict(df)[0]
        term_cat = self.rev_dict[pred]

        return term_cat


In [36]:
# Peek at the pickled reverse-lookup mapping. Opening it in a `with` block
# guarantees the file handle is closed once the object has been read.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('reverse_dict2.pkl', 'rb') as file:
    reverse_lookup = pickle.load(file)
reverse_lookup

In [49]:
# Paths to the pickled artefacts. They are assigned before being echoed so
# the cell also runs on a fresh kernel (Restart & Run All).
rev_dict = 'reverse_dict2.pkl'
rf_model = 'final_model_v2.sav'
rev_dict

'reverse_dict2.pkl'

In [11]:
# Unpickle the model inside a `with` block so the file handle is closed.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
# NOTE(review): this rebinds `rf_model`, which another cell sets to the path
# string 'final_model_v2.sav'; after this cell it holds the unpickled object.
with open('final_model_v2.sav', 'rb') as file1:
    rf_model = pickle.load(file1)

In [39]:
# Manager restricted to the category labels listed in `categories`.
category_manager = CategoryManager(categories)

# Build the one-row feature frame for an example molecule.
df = category_manager.make_df(
    smiles='CCCN(CCNC(=O)c1ccc(Cc2ccc(C(O)=O)cc2)cc1)S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F',
    dtx='123',
    cat="Aromatic PFASs",
)



In [40]:
# Display the single-row feature frame returned by make_df
# (fingerprint columns, then `category` and `chain_length`).
df

Unnamed: 0,mrgn_0,mrgn_1,mrgn_2,mrgn_3,mrgn_4,mrgn_5,mrgn_6,mrgn_7,mrgn_8,mrgn_9,...,mrgn_1016,mrgn_1017,mrgn_1018,mrgn_1019,mrgn_1020,mrgn_1021,mrgn_1022,mrgn_1023,category,chain_length
123,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,Aromatic PFASs,8


In [54]:
# Predictor built from the lookup and model currently bound to these names.
catpred = CategoryPrediction(rev_dict=rev_dict, rf_model=rf_model)



In [55]:
# Predict the terminal category for the example feature frame.
catpred.make_prediction(df=df)

"('Aromatic PFASs', 'gte7', nan, nan)"