# Cluster Analysis

Import the librairies

In [137]:
import pandas as pd
import numpy as np
import pickle
from IPython.display import HTML

Import Training Data

In [11]:
train = pd.read_csv('Dataframe without picks.csv')
train = train[['ID', 'Outcome']]                     # to have only the columns ID and Outcome in the dataframe train 
train.columns = ['Molecule', 'Blocker']              # to rename the column : Outcome -> Blocker and ID -> Molecule
train.head(5)                                            

Unnamed: 0,Molecule,Blocker
0,0,0
1,1,0
2,3,1
3,7,0
4,8,1


Import the index of the ouput calculation of Tanimoto Pairwise 

In [102]:
with open('lg-42040-outputs-retained_poses.pkl','rb') as f:
    poses = pickle.load(f)

Import the results of the Hierarchical Ward Clustering 

In [106]:
ward90 = list(np.load('lg-42170-outputs-ward_cluster_0.9.npy'))
ward85 = list(np.load('lg-42169-outputs-ward_cluster_0.85.npy'))
ward80 = list(np.load('lg-42072-outputs-ward_cluster_0.8.npy'))
ward75 = list(np.load('lg-42071-outputs-ward_cluster_0.75.npy'))
ward70 = list(np.load('lg-42042-outputs-ward_cluster_0.7.npy'))
ward65 = list(np.load('lg-42044-outputs-ward_cluster_0.65.npy'))
ward60 = list(np.load('lg-42043-outputs-ward_cluster_0.6.npy'))
ward55 = list(np.load('lg-42044-outputs-ward_cluster_0.65.npy'))

Create a Dataframe containing the correlation between the poses and the clusters at different thresholds 

In [107]:
df = pd.DataFrame({'Pose':poses, 'Ward55':ward55, 'Ward60':ward60, 'Ward65':ward65, 'Ward70':ward70, 'Ward75':ward75, 'Ward80':ward80, 'Ward85':ward85, 'Ward90':ward90})

Add to the Dataframe the corresponding molecule to each pose

In [108]:
df['Molecule'] = [int(i.split("_")[2]) for i in df.Pose] # to look for the molecule name in the pose name by splitting after the second "_"
df.head(5)

Unnamed: 0,Pose,Ward55,Ward60,Ward65,Ward70,Ward75,Ward80,Ward85,Ward90,Molecule
0,receptor_noK_1001_0_Run_1.pdb,18117,18846,18117,16303,13389,10372,8042,6311,1001
1,receptor_noK_1001_0_Run_10.pdb,17940,18655,17940,16141,13262,10277,7964,6248,1001
2,receptor_noK_1001_0_Run_11.pdb,17936,18651,17936,16137,13259,10275,7962,6246,1001
3,receptor_noK_1001_0_Run_12.pdb,17936,18651,17936,16137,13259,10275,7962,6246,1001
4,receptor_noK_1001_0_Run_13.pdb,14340,14863,14340,12962,10658,8244,6402,5028,1001


Merge the Training Dataframe with Dataframe containing the poses and clusters

In [109]:
df_annot = df.merge(train, on='Molecule', how='left')  # to merge the 'train' dataframe at the column 'Molecule' to the left dataframe (= in the code line 'df_annot') 
df_annot.head(5)

Unnamed: 0,Pose,Ward55,Ward60,Ward65,Ward70,Ward75,Ward80,Ward85,Ward90,Molecule,Blocker
0,receptor_noK_1001_0_Run_1.pdb,18117,18846,18117,16303,13389,10372,8042,6311,1001,1
1,receptor_noK_1001_0_Run_10.pdb,17940,18655,17940,16141,13262,10277,7964,6248,1001,1
2,receptor_noK_1001_0_Run_11.pdb,17936,18651,17936,16137,13259,10275,7962,6246,1001,1
3,receptor_noK_1001_0_Run_12.pdb,17936,18651,17936,16137,13259,10275,7962,6246,1001,1
4,receptor_noK_1001_0_Run_13.pdb,14340,14863,14340,12962,10658,8244,6402,5028,1001,1


Final Dataframe : Correlation of Molecule / Class / Cluster

In [422]:
Cluster = 'Ward90' # to adapt to another cluster -> 'Wardthresholdnumber' 
sub_df = df_annot[['Molecule', 'Pose', 'Blocker', Cluster]]
sub_df.head(5)

Unnamed: 0,Molecule,Pose,Blocker,Ward90
0,1001,receptor_noK_1001_0_Run_1.pdb,1,6311
1,1001,receptor_noK_1001_0_Run_10.pdb,1,6248
2,1001,receptor_noK_1001_0_Run_11.pdb,1,6246
3,1001,receptor_noK_1001_0_Run_12.pdb,1,6246
4,1001,receptor_noK_1001_0_Run_13.pdb,1,5028


Sort the molecules per cluster 

In [423]:
sorted = sub_df.reset_index()[['Molecule', 'Pose', Cluster]].drop_duplicates().groupby([Cluster, 'Blocker']).count().sort_values('Molecule') # to obtain the number of molecules per cluster
sorted.tail(10) # to print the biger clusters

Unnamed: 0_level_0,Molecule,Pose
Ward90,Unnamed: 1_level_1,Unnamed: 2_level_1
10816,30,30
8141,30,30
5464,32,32
7806,32,32
9371,32,32
8146,34,34
11120,36,36
8123,38,38
5902,38,38
6690,41,41


Obtain the molecules inside a cluster

In [424]:
list_molecules = sub_df.query('Ward90 == 6908') # to obtain the molecules belonging in a cluster
final_list = list_molecules.drop_duplicates()   # to drop the duplicates
final_list

Unnamed: 0,Molecule,Pose,Blocker,Ward90
27226,2917,receptor_noK_2917_0_Run_15.pdb,1,6908
44722,4152,receptor_noK_4152_0_Run_17.pdb,0,6908
45153,4187,receptor_noK_4187_0_Run_8.pdb,1,6908
45406,4201,receptor_noK_4201_0_Run_1.pdb,1,6908
45897,423,receptor_noK_423_0_Run_7.pdb,0,6908
46082,4255,receptor_noK_4255_0_Run_1.pdb,0,6908
47420,4341,receptor_noK_4341_0_Run_5.pdb,0,6908
47428,4342,receptor_noK_4342_0_Run_13.pdb,1,6908
48382,4413,receptor_noK_4413_0_Run_3.pdb,1,6908
51476,4622,receptor_noK_4622_0_Run_13.pdb,1,6908


In [428]:
df_ward_clustering = final_list[['Ward90','Molecule','Pose','Blocker']]
df_ward_clustering['Ward'] = Cluster
final = df_ward_clustering.iloc[:, [4,0,1,2,3]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ward_clustering['Ward'] = Cluster


Display without index
HTML(final.to_html(index=False))