# This is the first pre-processing file that takes Moonshot data and creates distinct datasets for Acrylamide, Non-Covalent and PLPro series

In [None]:
import pandas as pd
import numpy as np
import pdb

### PLPro Preprocessing

In [None]:
# Each PLpro export labels its IC50 column differently, so each file gets its
# own source-name -> common-name mapping. The mapping keys double as the list
# of columns to keep (dict order is preserved, so SMILES stays first).
pl1_rename = {
    'SMILES_acid': 'SMILES',
    'PLpro Protease Assay Dose-Response: IC50 (uM) [Target: Plpro, JIRA: 11980, Readout: FI, Iincubation time: 120]': 'IC50',
}
pl2_rename = {
    'SMILES_acid': 'SMILES',
    'PLpro Protease Assay Dose-Response: IC50 (uM)': 'IC50',
}

# Load only the two columns of interest from each file ...
PL_1 = pd.read_csv('PLpro_data_1.csv')[list(pl1_rename)]
PL_2 = pd.read_csv('PLpro_data_2.csv')[list(pl2_rename)]

# ... and give both frames the same ('SMILES', 'IC50') column names.
PL_1 = PL_1.rename(columns=pl1_rename)
PL_2 = PL_2.rename(columns=pl2_rename)

In [None]:
def IC50(row):
    """Convert the raw ``'IC50'`` entry of a record into a float.

    Args:
        row: Mapping-like record (for example a pandas Series produced by
            ``DataFrame.iterrows``) that has an ``'IC50'`` key.

    Returns:
        ``np.inf`` when the entry is missing (NaN/None) or its text contains
        a ``'>'`` qualifier; otherwise the entry converted with ``float``.

    Raises:
        ValueError: If the entry is present, has no ``'>'`` and cannot be
            parsed by ``float``.
    """
    value = row['IC50']
    # ``value is np.nan`` only matches the np.nan singleton; a NaN coming out
    # of a float column (np.float64) or float('nan') is a different object and
    # would slip through to float() and come back as NaN. pd.isna catches all
    # missing-value flavours.
    if pd.isna(value):
        return np.inf
    # A '>' qualifier marks the value as above the measured range.
    if '>' in str(value):
        return np.inf
    return float(value)

In [None]:
# Both PLpro frames get identical treatment, so handle them in one loop.
for frame in (PL_1, PL_2):
    # Parse the raw IC50 entries (missing or '>'-qualified values become np.inf).
    frame['IC50'] = [IC50(row) for _, row in frame.iterrows()]
    # A compound is flagged active when its IC50 is below 90. The vectorised
    # comparison gives the same booleans as a row-wise apply, and also yields
    # a proper (empty) boolean column when the frame has no rows.
    frame['activity'] = frame['IC50'] < 90

In [None]:
# Write each processed PLpro frame to its own CSV, without the index column.
for frame, out_path in (
    (PL_1, 'known_PLPro1_activity.csv'),
    (PL_2, 'known_PLPro2_activity.csv'),
):
    frame.to_csv(out_path, index=False)

## Acrylamide / Noncovalent preprocessing

### For noncovalent data with no single inhibition data (used in publication)

In [None]:
# Load the older noncovalent table and expose its averaged IC50 as 'IC50'.
df = pd.read_csv('old_noncovalent_activity.smi')
df = df.rename(columns={'f_avg_IC50': 'IC50'})

# A missing IC50 is represented as +infinity.
df['IC50'] = df['IC50'].fillna(np.inf)

# Drop the compound IDs listed here before saving.
bad_smiles = ['JAG-UCB-a3ef7265-20']
df = df.loc[~df['CID'].isin(bad_smiles)]

# NOTE(review): the acrylamide/noncovalent extraction further down this file
# writes to the same 'known_noncovalent_activity.csv' path, so running both
# sections overwrites this output - confirm which one is wanted.
df.to_csv('known_noncovalent_activity.csv', index=False)

### For covalent data with single inhibition and IC50 values

#### First we create a flag (-1, 0, 1) depending on whether the compound data is (useless, inactive, active)

In [None]:
def activity_class(row):
    """Return the activity flag (1, 0 or -1) for one compound record.

    The record is expected to use ``np.inf`` for a missing
    ``'f_avg_IC50'`` and ``-np.inf`` for a missing
    ``'f_inhibition_at_50_uM'``.

    Args:
        row: Mapping-like record with ``'f_avg_IC50'`` and
            ``'f_inhibition_at_50_uM'`` keys.

    Returns:
        1  if both values are present and the IC50 is below 50;
        0  if both are present and the IC50 is not below 50, or if the IC50
           is missing and the inhibition is below 50;
        -1 in every other case.
    """
    ic50 = row['f_avg_IC50']
    inhibition = row['f_inhibition_at_50_uM']

    # Without an inhibition value the record is discarded.
    # NOTE(review): this also discards records that do have an IC50 but no
    # inhibition value - confirm that is intended.
    if inhibition == -np.inf:
        return -1

    # IC50 available: it alone decides active (1) versus inactive (0).
    if ic50 != np.inf:
        return 1 if ic50 < 50 else 0

    # No IC50: inhibition below 50 counts as inactive; 50 or more is ignored.
    return 0 if inhibition < 50 else -1

### Now pull in the master data (both noncovalent and acrylamide) and begin cleaning

In [None]:
# Load the combined (noncovalent + acrylamide) activity table.
master_df = pd.read_csv('known_master_activity.csv')

# Encode missing measurements with the sentinels activity_class checks for:
# +inf for a missing IC50, -inf for a missing inhibition value.
master_df['f_avg_IC50'] = master_df['f_avg_IC50'].fillna(np.inf)
master_df['f_inhibition_at_50_uM'] = master_df['f_inhibition_at_50_uM'].fillna(-np.inf)

# Flag every record as active (1), inactive (0) or to-be-dropped (-1).
master_df['activity'] = [activity_class(record) for _, record in master_df.iterrows()]

## Remove compounds with bad SMILES (listed by compound ID) and useless data (activity flag -1)

In [None]:
# Compound IDs (matched against 'canonical_CID') to exclude.
bad_smiles = [
    'ALP-POS-c59291d4-5',
    'JAG-UCB-a3ef7265-20',
    'LON-WEI-babf2c61-3',
    'ALP-POS-c59291d4-1',
    'LON-WEI-ff7b210a-4',
    'LON-WEI-ff7b210a-5',
]

# Keep a row only if its ID is not excluded and its activity flag is 0 or 1
# (the -1 flag marks rows to drop).
is_excluded = master_df['canonical_CID'].isin(bad_smiles)
has_usable_flag = master_df['activity'] > -1
master_df = master_df[~is_excluded & has_usable_flag]

### Extract noncovalent and acrylamide data from master file

In [None]:
# Columns carried into both output tables; 'f_avg_IC50' is exposed as 'IC50'.
kept_columns = ['SMILES', 'canonical_CID', 'activity', 'f_avg_IC50', 'f_inhibition_at_50_uM']

# Noncovalent: flagged as neither acrylamide nor chloroacetamide.
is_noncovalent = master_df['acrylamide'].eq(False) & master_df['chloroacetamide'].eq(False)
# Acrylamide: selected on the acrylamide flag alone.
is_acrylamide = master_df['acrylamide'].eq(True)

noncov = master_df.loc[is_noncovalent, kept_columns].rename(columns={'f_avg_IC50': 'IC50'})
acry = master_df.loc[is_acrylamide, kept_columns].rename(columns={'f_avg_IC50': 'IC50'})

In [None]:
# Write the two extracted tables to CSV, without the index column.
# NOTE(review): 'known_noncovalent_activity.csv' is also the output path of
# the older noncovalent section earlier in this file, so this write replaces
# that file - confirm which version is wanted.
for frame, out_path in (
    (acry, 'known_acrylamide_activity.csv'),
    (noncov, 'known_noncovalent_activity.csv'),
):
    frame.to_csv(out_path, index=False)