# Drug to Drug Interaction - Control File Processor

> Use this file to create control files from the data source.

Control files are used to test the models.

In [None]:
import numpy as np
import pandas as pd

# Load the interaction records that every later cell derives its frames from.
df = pd.read_csv(filepath_or_buffer='./data/Neuron_input.csv')

In [None]:
# Build one representative record per distinct 'Y' value and keep at most the
# first 500 of them. GroupBy.first() takes the first non-null value of each
# column within a group, which matches the group's first row only when that
# row has no missing values.
df_group = df.groupby('Y', as_index=False).first()
df_group = df_group.head(500)

# Reference table of approved drugs: tab-delimited and read without a header
# row, so its columns are addressed by position.
df_drugs = pd.read_csv('./data/Approved_drug_Information.txt', sep='\t', header=None)

# Lookup table keyed by drug id: positional column 0 is labelled 'drug_id'
# and column 1 'name'; the id then becomes the index so it can be joined on.
df_drugs_names = (
    df_drugs[[0, 1]]
    .rename(columns={0: 'drug_id', 1: 'name'})
    .set_index('drug_id')
)

# Attach a drug name for each side of the pair. Assuming df_group has no
# 'name' column of its own, the first join adds a plain 'name' column (the
# suffix only applies on a clash), the second join's clash with it yields
# 'name_2', and 'name' is finally renamed to 'name_1'.
df_join = (
    df_group
    .join(df_drugs_names, on='Drug1_ID', rsuffix='_1')
    .join(df_drugs_names, on='Drug2_ID', rsuffix='_2')
    .rename(columns={'name': 'name_1'})
)
print(df_join.columns)


In [None]:

# Discard every row that has a missing value in any column.
df_cleaned = df_join.dropna()
print(df_cleaned.head())

# Keep only the rows where both drug names are present. After the dropna()
# above no nulls remain, so this mask retains every row; it is kept as an
# explicit guard on the two name columns.
has_both_names = df_cleaned['name_1'].notnull() & df_cleaned['name_2'].notnull()
df_cleaned = df_cleaned[has_both_names]
print(df_cleaned.head())


In [None]:
# Remove exact duplicate rows before building the test set.
df_unique_set = df_cleaned.drop_duplicates()

# Lowercase every column label (spaces are left untouched).
df_unique_set.columns = [column.lower() for column in df_unique_set.columns]

# Rename the lowercased 'y' column to 'ddi_type'.
df_unique_set = df_unique_set.rename(columns={'y': 'ddi_type'})

print(df_unique_set.head(10))

# Build the test cases as (name, structure) pairs for each drug.
# The previous version selected ['name_1', 'drug1', 'drug2', 'name_2'] and
# relabelled them positionally, which put the 'drug2' column under the
# 'drug2' header and the second drug's name under 'smiles_2' -- the reverse
# of how the first drug was labelled. An explicit mapping keeps each label
# attached to the intended column.
# NOTE(review): assumes 'drug1'/'drug2' hold the SMILES strings, as the
# existing 'smiles_1' label implies -- confirm against the source data.
df_test_cases = df_unique_set[['name_1', 'drug1', 'name_2', 'drug2']].rename(
    columns={
        'name_1': 'drug1',
        'drug1': 'smiles_1',
        'name_2': 'drug2',
        'drug2': 'smiles_2',
    }
)

# Save the test cases without the index column.
df_test_cases.to_csv('./data/test_cases_complete.csv', index=False)


In [None]:
# Filter the unique set down to rows with a positive 'ssp' value and save
# them as the control features file.
#
# The 'ssp' column is not created anywhere in this notebook: the step that
# would compute it is disabled because it needs a `predictor` object that is
# not defined here. For reference, the disabled step was:
# df_unique_set['ssp'] = df_unique_set.copy().apply(lambda row: predictor.calculate_ssp(row['drug1'], row['drug2']), axis=1)

# Fail with an actionable message instead of a bare KeyError('ssp') when the
# column has not been added yet.
if 'ssp' not in df_unique_set.columns:
    raise KeyError(
        "df_unique_set has no 'ssp' column; compute it for each row "
        "(see the disabled predictor.calculate_ssp step) before running this cell"
    )

# select all the rows with ssp > 0
df_ssp = df_unique_set.loc[df_unique_set['ssp'] > 0]
print(df_ssp.head(10))

# save this file to a csv file
df_ssp.to_csv('./data/control_features.csv', index=False)
