In [16]:
import pandas as pd 
from rdkit import Chem
import numpy as np

# smiles + IC50 데이터 불러오기
# 타겟 물질 : hERG, HepG2, CYP2C9, CYP2D6, CYP3A4

cols = ['Smiles','Standard Type', 'Standard Relation', 'Standard Value',
        'Standard Units','Data Validity Comment','Molecular Weight']

herg = pd.read_csv('hERG_IC50.csv',sep=';')[cols]
hepg2 = pd.read_csv('HepG2_IC50.csv',sep=';')[cols]
cyp2c9 = pd.read_csv('CYP2C9_IC50.csv',sep=';')[cols]
cyp2d6 = pd.read_csv('CYP2D6_IC50.csv',sep=';')[cols]
cyp3a4 = pd.read_csv('CYP3A4_IC50.csv',sep=';')[cols]

In [17]:
def preprocessing_data(df, target):
	df = df[~df['Smiles'].isnull()] # smiles가 없는 행은 전부 제거
	df['canon_smiles'] = [Chem.CanonSmiles(smiles) for smiles in df['Smiles']]
	df = df[df['Data Validity Comment'].isna()]

	# 단위가 다른 것을 통일 (ug.mL-1을 nM 단위로 바꾸기)
	for idx in df[df['Standard Units']=='ug.mL-1'].index:
		df.loc[idx,'Standard Value'] = 10**(6)*df.loc[idx]['Standard Value'] / df.loc[idx]['Molecular Weight'].astype('float')
		df.loc[idx,'Standard Units'] = 'nM'

	df = df[df['Standard Units'] == 'nM']
	df[f'{target}_pIC50'] = -np.log10(df['Standard Value'])

	dup_diff_mean = []
	dup_df = []

	dup_smiles = [smiles for smiles, cnt in df['canon_smiles'].value_counts().items() if cnt > 1]

	for smiles in dup_smiles:
		duplicate_smiles_df = df[df['canon_smiles'] == smiles]
		pIC50_mean = duplicate_smiles_df[f'{target}_pIC50'].mean()
		pIC50_max = duplicate_smiles_df[f'{target}_pIC50'].max()
		pIC50_min = duplicate_smiles_df[f'{target}_pIC50'].min()
		dup_diff_mean.append(max(abs(pIC50_max - pIC50_mean), abs(pIC50_min - pIC50_mean)))
		
		compare_value = round(sum(dup_diff_mean) / len(dup_diff_mean), 1)

		if max(abs(pIC50_max - pIC50_mean), abs(pIC50_min - pIC50_mean)) <= compare_value:
			new_data = list(duplicate_smiles_df.iloc[0])
			new_data[-1] = pIC50_mean
			dup_df.append(new_data)

	dup_df = pd.DataFrame(dup_df, columns=df.columns)

	df = df[~df['canon_smiles'].isin(dup_smiles)]
	df = pd.concat([df, dup_df])
	
	return df[['canon_smiles',f'{target}_pIC50']]

In [18]:
herg = preprocessing_data(herg, 'herg')
hepg2 = preprocessing_data(hepg2, 'hepg2')
cyp2c9 = preprocessing_data(cyp2c9, 'cyp2c9')
cyp2d6 = preprocessing_data(cyp2d6, 'cyp2d6')
cyp3a4 = preprocessing_data(cyp3a4, 'cyp3a4')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['canon_smiles'] = [Chem.CanonSmiles(smiles) for smiles in df['Smiles']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['canon_smiles'] = [Chem.CanonSmiles(smiles) for smiles in df['Smiles']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['canon_smiles'] = [Chem.CanonSmiles(smiles) for smil

In [20]:
from functools import reduce

# 예시: df1, df2, df3, df4, df5
dfs = [herg, hepg2, cyp2c9, cyp2d6, cyp3a4]
key = "canon_smiles"

# 차례로 merge (outer join 사용)
merged_df = reduce(lambda left, right: pd.merge(left, right, on=key, how='outer'), dfs)
merged_df.head()

Unnamed: 0,canon_smiles,herg_pIC50,hepg2_pIC50,cyp2c9_pIC50,cyp2d6_pIC50,cyp3a4_pIC50
0,Br.CCCCOc1ccc2c3ccnc(C)c3n(CC(C)C)c2c1,,-4.081393,,,
1,Br.CCCCOc1ccc2c3ccnc(C)c3n(CCCC)c2c1,,-4.190166,,,
2,Br.CCCCn1c2cc(O)ccc2c2ccnc(C)c21,,-4.910579,,,
3,Br.CCCCn1c2cc(OC(C)C)ccc2c2ccnc(C)c21,,-4.480003,,,
4,Br.CCCCn1c2cc(OCC)ccc2c2ccnc(C)c21,,-4.638244,,,


In [25]:
for i in merged_df.columns:
    print(f'{i}의 결측치 개수는', merged_df[i].isnull().sum())

canon_smiles의 결측치 개수는 0
herg_pIC50의 결측치 개수는 32153
hepg2_pIC50의 결측치 개수는 20549
cyp2c9_pIC50의 결측치 개수는 38736
cyp2d6_pIC50의 결측치 개수는 37985
cyp3a4_pIC50의 결측치 개수는 35366


In [26]:
merged_df.to_csv('need_fill_nullvalue.csv',index=False)