# Cleaning DataSet

In [1]:
import pandas as pd
import re
import numpy as np

## Text File to DataFrame

In [2]:
# Sanity-check the extraction regex on one sample record: group 1 captures the
# drug id, group 2 the SMILES string that follows the DRUGSMIL tag.
text_to_search = 'D00AAN\tDRUGSMIL\tC1CCN2CCC3C(=CC(CCC=CC1)(C4C3(C2)CC5N4CCCC(=O)CC5)O)C6=NC=CC7=C6NC8=C7C=CC=C8OS(=O)(=O)C9=CC=C(C=C9)Cl'
pattern = r"(\S*)\tDRUGSMIL\t(\S*)"
re.compile(pattern).findall(text_to_search)

[('D00AAN',
  'C1CCN2CCC3C(=CC(CCC=CC1)(C4C3(C2)CC5N4CCCC(=O)CC5)O)C6=NC=CC7=C6NC8=C7C=CC=C8OS(=O)(=O)C9=CC=C(C=C9)Cl')]

In [3]:
# Read the whole raw TTD drug-id text file into a single string.
filepath = "../lewagon-ddi/raw_data/ttd_drug_ids.txt"
with open(filepath, encoding="utf-8") as raw_file:
    drug_str = raw_file.read()

In [4]:
# Cut the file content at every long underscore rule and report how many
# sections that yields.
section_rule = "_______________________________________________________________________"
drug_list = drug_str.split(section_rule)
len(drug_list)

3

In [5]:
# Keep only the third section of the split file as the data payload.
# NOTE(review): presumably the first two sections are header/licence text — confirm against the raw file.
actual_data = drug_list[2]

In [6]:
# Column-oriented accumulator: one (independent) empty list per output column.
drug_dict = {column: [] for column in ("DRUG__ID", "DRUGSMIL")}

In [7]:
# Break the payload into lines, then discard the two kinds of filler rows
# (a bare pair of tabs, and an underscore followed by two tabs).
data2 = actual_data.split('\n')
data3 = [row for row in data2 if row != '\t\t']
data4 = [row for row in data3 if row != '_\t\t']

In [8]:
# Extract (drug id, SMILES) pairs from the lines that carry a DRUGSMIL field.
# The two lists are rebuilt from scratch and then stored in drug_dict, so
# re-running this cell no longer appends every record a second time.
pattern = r"(\S*)\tDRUGSMIL\t(\S*)"
smiles_re = re.compile(pattern)  # compiled once instead of once per line

drug_ids, drug_smiles = [], []
for i in data4:
    drugid = smiles_re.findall(i)
    if drugid:
        # Only the first match on a line is used.
        drug_id, drug_mil = drugid[0]
        drug_ids.append(drug_id)
        drug_smiles.append(drug_mil)

drug_dict["DRUG__ID"] = drug_ids
drug_dict["DRUGSMIL"] = drug_smiles

In [9]:
# Number of drug ids extracted; should equal the SMILES count checked in the next cell.
len(drug_dict["DRUG__ID"])

21083

In [10]:
# Number of SMILES strings extracted (the loop appends one per drug id).
len(drug_dict["DRUGSMIL"])

21083

In [11]:
# Turn the two parallel lists into a two-column frame (drug id, SMILES).
id_df = pd.DataFrame(data=drug_dict)

In [12]:
# Rename the id column to 'DrugID', the key used later to merge with the TTD table.
# An exact-label rename replaces the substring replace that ran over every
# column name, and it returns a new frame instead of mutating the column index.
id_df = id_df.rename(columns={'DRUG__ID': 'DrugID'})

## TTD Excel to DataFrame

In [13]:
# Load the TTD spreadsheet; index_col=0 makes its first column the index.
ttd_xlsx_path = "../lewagon-ddi/raw_data/ttd_database.xlsx"
df = pd.read_excel(ttd_xlsx_path, index_col=0)

In [14]:
# index_col=0 at load time turned the first spreadsheet column into a named
# index; move it back into a regular column.
# The guard keeps this cell idempotent: once the index is the default unnamed
# one, another reset_index would insert a spurious "index" column.
if df.index.name is not None:
    df = df.reset_index(drop=False)

In [15]:
# Derive the working table without the Highest_status column and preview it.
tdd_df = df.drop('Highest_status', axis='columns')
tdd_df.head()

Unnamed: 0,TargetID,DrugID,MOA
0,T71390,D07OAC,Inhibitor
1,T70309,D07OAC,Inhibitor
2,T97071,D0Y6UB,Inhibitor
3,T86679,D0J2UW,Agonist
4,T19229,D0UZ9U,Modulator


In [16]:
# Raw MOA labels grouped by the canonical class they are collapsed into.
# Labels are matched verbatim by Series.replace / Series.isin, which is why
# case variants, misspellings and labels with a trailing space are listed
# explicitly.

# -> 'Antagonist'
# NOTE(review): the 'Agonis; ...' entries look truncated — presumably they mirror the raw data; verify.
anta_list = ['Inhibitor', 'Antagonist', 'Blocker', 'Inhibitor (gating inhibitor)', 
             'Blocker (channel blocker)', 'Disrupter', 'Suppressor', 'Inactivator', 'Inverse agonist',
             'Inhibitor; Antagonist; Blocker', 'antagonist', 'Antagonist (gating inhibitor)','Antagonist (channel blocker)',
            'Antagonist; Antagonist; Antagonist','Agonis; Antagonist','Agonis; Inverse agonist']
# -> 'Agonist'
ago_list = ['Agonist', 'Activator', 'Stimulator', 'Immunostimulant', 'Enhancer', 'Inducer', 'Regulator (upregulator)', 
            'Cofactor', 'Partial agonist', 'Co-agonist', 'Stimulator ','Agonist ','Modulator (Agonist)']
# -> 'Modulator'
mod_list = ['Modulator', 'Binder', 'Binder (minor groove binder)', 'Immunomodulator', 'Modulator (allosteric modulator)',
            'Immunomodulator (Immunostimulant)','Regulator', 'Immunomodulator ','Modulator (minor groove binder)',
           'Modulator (upregulator)','Modulator ']
# -> 'CART'
cart_list = ['CAR-T-Cell-Therapy', 'CAR-T-Cell-Therapy(Dual specific)','CART(Dual specific)']
# Rows carrying any of these labels are removed from the table altogether.
drop_list = ['Chelator', 'Reactivator', 'Intercalator', 'Antisense', 'Immune response agent', 'Stabilizer','Stablizer',
             'Opener', 'Breaker', 'Degrader', 'Replacement', 'Antisense ','.']

In [17]:
# Normalise the free-text MOA labels into a handful of canonical classes.
# Rows whose label is in drop_list are removed first.  The explicit .copy()
# makes the filtered frame independent of the frame it was sliced from, so the
# column assignments below do not raise SettingWithCopyWarning.
tdd_df = tdd_df[~tdd_df['MOA'].isin(drop_list)].copy()

# Same replacements, in the same order, as the former four copy-pasted lines.
# Labels that appear in none of the lists are left unchanged.
for variants, label in (
    (anta_list, 'Antagonist'),
    (ago_list, 'Agonist'),
    (mod_list, 'Modulator'),
    (cart_list, 'CART'),
):
    tdd_df['MOA'] = tdd_df['MOA'].replace(variants, label)

In [18]:
# Distinct MOA labels left after normalisation. A label that appears in none of
# the mapping lists passes through unchanged and shows up here as-is.
tdd_df['MOA'].unique()

array(['Antagonist', 'Agonist', 'Modulator', 'CART', 'Ligand'],
      dtype=object)

In [19]:
# Number of distinct values in each column of the cleaned table.
tdd_df.nunique()

TargetID     2890
DrugID      30276
MOA             5
dtype: int64

In [20]:
# Row count per MOA class, to see how the classes are balanced.
tdd_df['MOA'].value_counts()

Antagonist    33501
Modulator      5386
Agonist        3108
CART            427
Ligand          208
Name: MOA, dtype: int64

## Merge the two tables on DrugID to attach SMILES

In [21]:
# Right join on DrugID: every row of id_df (and therefore every SMILES string)
# is kept; drugs without a match in tdd_df get NaN in the tdd_df columns.
drug_df = tdd_df.merge(id_df, how='right', on='DrugID')

In [22]:
# Inspect the merged frame. Rows with missing TargetID/MOA are drugs from id_df
# that had no match in tdd_df.
drug_df

Unnamed: 0,TargetID,DrugID,MOA,DRUGSMIL
0,T70977,D00AAN,Antagonist,C1CCN2CCC3C(=CC(CCC=CC1)(C4C3(C2)CC5N4CCCC(=O)...
1,T80896,D00AAU,Antagonist,CCC(C1=CC(=CC=C1)O)C(CC)C2=CC(=CC=C2)O
2,T89534,D00AAU,Antagonist,CCC(C1=CC(=CC=C1)O)C(CC)C2=CC(=CC=C2)O
3,,D00ABO,,C1CN(CCN1)C(=O)C2=CC=C(C=C2)C=CC3=NNC4=CC=CC=C43
4,T58716,D00ACC,Antagonist,C1CC2CCC(C3=CC=CC(=C23)C1)N4CCC5(CC4)C(=O)NCN5...
...,...,...,...,...
30364,T26623,D0SY2M,Antagonist,C1=CC(=CC=C1C2=COC3=C(C2=O)C=CC(=C3)O)O
30365,T13260,D0SY6C,Antagonist,C1=CC(=CC=C1C2=CC(=O)C3=C(O2)C=C(C=C3)O)O
30366,T40954,D0SY6J,Antagonist,CC1=C(SC(=N1)C2=CC(=C3C(=C2)C=CN3C)[N+](=O)[O-...
30367,T28330,D0SY6T,Antagonist,CC1=CC(=CC=C1)NC(=O)NCC=C(C2=CC=CC=C2)C3=CC=CC=C3


In [23]:
# Count rows without a SMILES string.
drug_df['DRUGSMIL'].isna().sum()

0

In [24]:
# Count rows without a target, i.e. drugs from id_df that found no match in tdd_df.
drug_df['TargetID'].isna().sum()

1972

In [25]:
# Discard every row that has a missing value in any column.
drug_df = drug_df.dropna(axis='index', how='any')

In [26]:
# Inspect the frame after the incomplete rows were dropped (the original row labels are kept).
drug_df

Unnamed: 0,TargetID,DrugID,MOA,DRUGSMIL
0,T70977,D00AAN,Antagonist,C1CCN2CCC3C(=CC(CCC=CC1)(C4C3(C2)CC5N4CCCC(=O)...
1,T80896,D00AAU,Antagonist,CCC(C1=CC(=CC=C1)O)C(CC)C2=CC(=CC=C2)O
2,T89534,D00AAU,Antagonist,CCC(C1=CC(=CC=C1)O)C(CC)C2=CC(=CC=C2)O
4,T58716,D00ACC,Antagonist,C1CC2CCC(C3=CC=CC(=C23)C1)N4CCC5(CC4)C(=O)NCN5...
5,T52921,D00ACC,Agonist,C1CC2CCC(C3=CC=CC(=C23)C1)N4CCC5(CC4)C(=O)NCN5...
...,...,...,...,...
30364,T26623,D0SY2M,Antagonist,C1=CC(=CC=C1C2=COC3=C(C2=O)C=CC(=C3)O)O
30365,T13260,D0SY6C,Antagonist,C1=CC(=CC=C1C2=CC(=O)C3=C(O2)C=C(C=C3)O)O
30366,T40954,D0SY6J,Antagonist,CC1=C(SC(=N1)C2=CC(=C3C(=C2)C=CN3C)[N+](=O)[O-...
30367,T28330,D0SY6T,Antagonist,CC1=CC(=CC=C1)NC(=O)NCC=C(C2=CC=CC=C2)C3=CC=CC=C3


In [27]:
# Number of distinct SMILES strings that have at least one target annotation.
drug_df['DRUGSMIL'].nunique()

18510

## TWOSIDES DataSet

In [28]:
# Load the TWOSIDES table, using the first CSV column as the row index.
twosides_path = "../lewagon-ddi/raw_data/twosides.csv"
twoside_df = pd.read_csv(twosides_path, index_col=0)

In [29]:
# Preview the first rows of the TWOSIDES table.
twoside_df.head()

Unnamed: 0,Drug1_ID,Drug1,Drug2_ID,Drug2,Y
0,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,767
1,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,25
2,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,85
3,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,735
4,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,959


In [30]:
# Distinct values of the Drug1 column (SMILES strings, per the preview of twoside_df).
drug1_unique = twoside_df['Drug1'].unique()

In [31]:
# Distinct values of the Drug2 column (SMILES strings, per the preview of twoside_df).
drug2_unique = twoside_df['Drug2'].unique()

In [32]:
# Stack both arrays end to end; a value present on both sides appears twice
# here and is de-duplicated in the next cell.
drug_unique = np.concatenate((drug1_unique, drug2_unique))

In [33]:
# De-duplicate across both sides and sort; note that drug_unique becomes a plain list here.
drug_unique = sorted(set(drug_unique))

In [34]:
# Keep only the rows whose SMILES string also occurs in TWOSIDES.
# The comparison is an exact string match on the SMILES text.
in_twosides = drug_df['DRUGSMIL'].isin(drug_unique)
drug_df2 = drug_df.loc[in_twosides]

In [35]:
# Distinct SMILES strings shared between the two data sets.
drug_df2['DRUGSMIL'].nunique(dropna=False)

452

In [36]:
# Distinct drug ids behind those SMILES strings.
drug_df2['DrugID'].nunique(dropna=False)

517

In [37]:
# Distinct targets hit by the retained drugs.
drug_df2['TargetID'].nunique(dropna=False)

313

In [38]:
# Inspect the rows that survived the TWOSIDES filter.
drug_df2

Unnamed: 0,TargetID,DrugID,MOA,DRUGSMIL
23,T23459,D00AOJ,Modulator,CCCCCCCCCCCCCCCCCCCCCCO
51,T73726,D00AXJ,Modulator,C1=CC(=C(C=C1Cl)Cl)COC(CN2C=CN=C2)C3=C(C=C(C=C...
57,T18950,D00BCG,Agonist,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NN...
59,T09423,D00BCP,Antagonist,CCCNC(=O)NS(=O)(=O)C1=CC=C(C=C1)Cl
195,T11822,D00DKK,Agonist,CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C
...,...,...,...,...
30251,T27602,D0SH3I,Antagonist,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...
30252,T69707,D0SH3I,Modulator,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...
30327,T52522,D0SS4P,Agonist,CC(C)(C)NCC(C1=NC(=C(C=C1)O)CO)O
30352,T19229,D0SV8E,Modulator,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...


In [39]:
# Spread the MOA labels into one column per TargetID while keeping drug_df2's
# row index: each row holds its MOA in the column of its own target and NaN
# in every other column.
drug_df3 = drug_df2.pivot(columns=['TargetID'], values = 'MOA')

In [40]:
# Everything except the two columns that were just pivoted out.
drug_df4 = drug_df2.drop(['TargetID', 'MOA'], axis='columns')

In [41]:
# Put the identifier columns and the per-target MOA columns side by side,
# aligned on the row index.
frames = [drug_df4, drug_df3]
drug_df5 = pd.concat(frames, axis='columns')

In [42]:
# Inspect the pivoted table: mostly NaN, since each row fills only one target column.
drug_df3

TargetID,T00145,T00158,T00216,T00884,T01318,T01777,T02532,T02551,T02653,T02777,...,T98933,T99009,T99204,T99455,T99524,T99616,T99685,T99799,T99840,T99954
23,,,,,,,,,,,...,,,,,,,,,,
51,,,,,,,,,,,...,,,,,,,,,,
57,,,,,,,,,,,...,,,,,,,,,,
59,,,,,,,,,,,...,,,,,,,,,,
195,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30251,,,,,,,,,,,...,,,,,,,,,,
30252,,,,,,,,,,,...,,,,,,,,,,
30327,,,,,,,,,,,...,,,,,,,,,,
30352,,,,,,,,,,,...,,,,,,,,,,


In [43]:
# Collapse the table to one row per SMILES string.  Missing cells are turned
# into '' first so that every column can be string-joined.
# NOTE(review): the distinct values of a group are concatenated with no
# separator, so two different MOA labels for the same SMILES/target would fuse
# into a single string — confirm this is intended.
def _join_unique(values):
    """Concatenate the distinct values of one group into a single string."""
    return ''.join(values.unique())


drug_df6 = drug_df5.fillna('').groupby('DRUGSMIL').agg(_join_unique)

In [44]:
# groupby('DRUGSMIL') left the SMILES string as the index; move it back into a
# regular column.  The guard keeps this cell idempotent: without it, a re-run
# would insert a stray "index" column that then leaks into drug_df7.
if drug_df6.index.name == 'DRUGSMIL':
    drug_df6 = drug_df6.reset_index(drop=False)

In [45]:
# The joined DrugID strings are not needed in the final table; keep SMILES plus
# the per-target columns.
drug_df7 = drug_df6.drop('DrugID', axis='columns')

In [46]:
# Final table: one row per SMILES string, one column per target, with '' where
# no MOA is recorded.
drug_df7

Unnamed: 0,DRUGSMIL,T00145,T00158,T00216,T00884,T01318,T01777,T02532,T02551,T02653,...,T98933,T99009,T99204,T99455,T99524,T99616,T99685,T99799,T99840,T99954
0,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,,,,,,,,,,...,,,,,,,,,,
1,C#CCNC1CCC2=CC=CC=C12,,,,,,,,,,...,,,,,,,,,,
2,C(=O)(N)NO,,,,,,,,,,...,,,,,,,,,,
3,C(=O)(O)[O-].[Na+],,,,,,,,,,...,,,,,,,,,,
4,C(C(C(C(C(CO)O)O)O)O)O,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,[Cl-].[K+],,,,,,,,,,...,,,,,,,,,,
448,[N]=O,,,,,,,,,,...,,,,,,,,,,
449,[Na+].[Cl-],,,,,,,,,,...,,,,,,,,,,
450,[O-2].[O-2].[O-2].[As+3].[As+3],,,,,,,,,,...,,,,,,,,,,
