In [1]:
import pandas as pd
from rdkit import Chem

In [2]:
def LoadDataframeFomCSV(path):
    try:
        df = pd.read_csv(path)
    except:
        return None, "Can't load csv file from: " + path
    try:
        base_df = df[["SMILES", "pIC50"]]
        return base_df, "Dataframe loaded successfully"
    except:
        return None, "Given csv file does not contain nessesary columns: SMILES and pIC50. Check your data for spelling mistakes."

In [3]:
base_df, err = LoadDataframeFomCSV("wrog/path.csv")
err

"Can't load csv file from: wrog/path.csv"

In [4]:
base_df, err = LoadDataframeFomCSV("data/processed/smiles_aggregated.csv")
err

'Given csv file does not contain nessesary columns: SMILES and pIC50. Check your data for spelling mistakes.'

In [5]:
base_df, err = LoadDataframeFomCSV("data/processed/smiles_aggregated_correct_columns.csv")
err

'Dataframe loaded successfully'

In [6]:
def Validate(df):
    return df['SMILES'].map(lambda x: Chem.MolFromSmiles(x) != None)

def ToCanonicalSmiles(df):
    df['SMILES'] = df['SMILES'].map(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
    return df

In [7]:
def ValidateSmiles(base_df):
    valid = Validate(base_df)
    invalid_rows = valid.index[valid == False].tolist()
    base_df = base_df[valid]
    ToCanonicalSmiles(base_df)
    duplicate_rows = base_df.index[base_df.duplicated('SMILES')].tolist()
    base_df = base_df.drop_duplicates('SMILES')
    na_rows = base_df.index[base_df['pIC50'].isna()].tolist()
    base_df = base_df.dropna()
    return base_df, [invalid_rows, duplicate_rows, na_rows]

In [8]:
# invalidate 2 molecules and add na for testing
base_df.loc[0,'SMILES'] = 'Cc1ccccc1-c1ccc2nc(N)c(C[C@@H](C)C(=O)N[C@@HC(C)(C)C3)cc2c1'
base_df.loc[4,'SMILES'] = 'Cc1ccccc1-c1ccc2nc(N)c@@H](C)C(=O)N[C@@HC(C)(C)C3)cc2c1'
base_df.loc[44,'pIC50'] = None
ValidateSmiles(base_df)

[10:39:54] SMILES Parse Error: syntax error while parsing: Cc1ccccc1-c1ccc2nc(N)c(C[C@@H](C)C(=O)N[C@@HC(C)(C)C3)cc2c1
[10:39:54] SMILES Parse Error: Failed parsing SMILES 'Cc1ccccc1-c1ccc2nc(N)c(C[C@@H](C)C(=O)N[C@@HC(C)(C)C3)cc2c1' for input: 'Cc1ccccc1-c1ccc2nc(N)c(C[C@@H](C)C(=O)N[C@@HC(C)(C)C3)cc2c1'
[10:39:54] SMILES Parse Error: syntax error while parsing: Cc1ccccc1-c1ccc2nc(N)c@@H](C)C(=O)N[C@@HC(C)(C)C3)cc2c1
[10:39:54] SMILES Parse Error: Failed parsing SMILES 'Cc1ccccc1-c1ccc2nc(N)c@@H](C)C(=O)N[C@@HC(C)(C)C3)cc2c1' for input: 'Cc1ccccc1-c1ccc2nc(N)c@@H](C)C(=O)N[C@@HC(C)(C)C3)cc2c1'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SMILES'] = df['SMILES'].map(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))


(                                                 SMILES     pIC50
 1     CCCO[C@H]1C[NH2+][C@@H]([C@@H](O)[C@H](Cc2cc(F...  8.853872
 2     CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...  8.698970
 3     CCOC[C@@H](Oc1cc(C[C@@H]2CS(=O)(=O)C[C@H]([NH2...  8.698970
 5     CC(C)(C)c1cccc(C[NH2+][C@H]2CS(=O)C[C@@H](Cc3c...  8.698970
 6     CCCCCS(=O)(=O)C[C@@H](NC(=O)c1cccnc1)C(=O)N[C@...  8.698970
 ...                                                 ...       ...
 1514          CCC(CC(=O)NCC1CCOCC1)n1c(N)nc2cc(Cl)ccc21  3.000000
 1515          CCC(CC(=O)NCc1ccccn1)n1c(N)nc2cc(Cl)ccc21  3.000000
 1516                  Cn1c(N)nc(C2CC2c2cccc(Br)c2)cc1=O  2.953115
 1517           Cn1c(N)nc(C2CC2c2cccc(-c3ccccc3)c2)cc1=O  2.733298
 1518                  Nc1nc2cc(Cl)ccc2n1CCCC(=O)NCC1CC1  2.544546
 
 [1507 rows x 2 columns],
 [[0, 4], [95, 246, 328, 419, 469, 669, 807, 1318, 1485], [44]])