In [1]:
import tensorflow
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import pickle
import os
import random
from collections import defaultdict

In [2]:
np.random.seed(random.randint(0,5000))
PATH = '.'
FINGERPRINTS = 'fingerprints'
fing_path = os.path.join(PATH, FINGERPRINTS)

morg_2048_path = 'morgan_df_2048.p'
morg_1024_path = 'morgan_df_1024.p'
maccs = 'maccs_df.p'

morg_2048_bit = os.path.join(fing_path, morg_2048_path)
morg_1024_bit = os.path.join(fing_path, morg_1024_path)
maccs = os.path.join(fing_path, maccs)

pd.options.display.max_rows = 14
pd.options.display.max_columns = 6

In [3]:
# Lets check a cluster technique!

class ya_boi:

    def __init__(self, x):
        self.data = x
        self.kmeans = None

    def create_clusters(self):
        self.kmeans = KMeans(n_clusters=4, init='random')#verbose=-1)
        self.kmeans.fit(np.array(list(self.data['fingerprints']), dtype='int32'))
        self.data['cluster'] = self.kmeans.labels_



In [4]:
df = pickle.load(open(morg_2048_bit, 'rb'))



In [5]:
x = ya_boi(df[:10000])
del df


In [6]:
# 2048 bit
x.create_clusters()

In [7]:
p = defaultdict(list)

x.data['Solubility'] = x.data['Solubility'].apply(tuple)

clusters =3

# This is to find out how many of each chemical went into each cluster
for item in range(clusters):
    for c, item in enumerate(x.data['cluster']):
        # print(c, item)
        if x.data['Solubility'][c] == (1,0) and x.data['cluster'][c] == item:
            # print(item)
            p[item].append(c)



In [8]:
# Representation of insoluble compounds in each cluster
for item in range(clusters):
    print(item, len(set(p[item])))

0 75
1 199
2 138


In [9]:
# Soluble 
for item in range(clusters):
    print(list(x.data['cluster']).count(item) - len(set(p[item])))

3207
3021
1595


In [10]:
for item in range(clusters):
    print(list(x.data['cluster']).count(item))

3282
3220
1733


In [11]:
# The soluble compounds appear to be overrepresented in group 0 and underrepresented in group 1,
# This is a great sign!

In [12]:
# 1024 bit

df = pickle.load(open(morg_1024_bit, 'rb'))
x = ya_boi(df[:10000])
del df

p = defaultdict(list)
x.data['Solubility'] = x.data['Solubility'].apply(tuple)

x.create_clusters()

clusters =3

# This is to find out how many of each chemical went into each cluster
for item in range(clusters):
    for c, item in enumerate(x.data['cluster']):
        # print(c, item)
        if x.data['Solubility'][c] == (1,0) and x.data['cluster'][c] == item:
            # print(item)
            p[item].append(c)

            

In [13]:
for item in range(clusters):
    print(item, len(set(p[item])))

0 203
1 91
2 68


In [14]:
# Soluble 
for item in range(clusters):
    print(list(x.data['cluster']).count(item) - len(set(p[item])))

3079
1748
3088


In [15]:
for item in range(clusters):
    print(list(x.data['cluster']).count(item))

3282
1839
3156


In [16]:
# This isn't as good at seperating in groups.

In [17]:
# maccs

df = pickle.load(open(maccs, 'rb'))
x = ya_boi(df[:10000])
del df

p = defaultdict(list)
x.data['Solubility'] = x.data['Solubility'].apply(tuple)

x.create_clusters()

clusters =3

# This is to find out how many of each chemical went into each cluster
for item in range(clusters):
    for c, item in enumerate(x.data['cluster']):
        # print(c, item)
        if x.data['Solubility'][c] == (1,0) and x.data['cluster'][c] == item:
            # print(item)
            p[item].append(c)

In [18]:
for item in range(clusters):
    print(item, len(set(p[item])))

0 159
1 219
2 37


In [19]:
# Soluble 
for item in range(clusters):
    print(list(x.data['cluster']).count(item) - len(set(p[item])))

2829
2963
1038


In [20]:
for item in range(clusters):
    print(list(x.data['cluster']).count(item))

2988
3182
1075
