In [1]:
import time
import warnings

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

from sklearn.preprocessing import normalize
from sklearn.preprocessing import minmax_scale

from sklearn.cluster import AffinityPropagation

from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score

# The metric was originally published under the misspelled name
# `calinski_harabaz_score`; later scikit-learn releases renamed it to
# `calinski_harabasz_score` and eventually removed the old spelling.
# Bind both names so the notebook runs on either side of the rename.
try:
    from sklearn.metrics import calinski_harabasz_score
except ImportError:
    from sklearn.metrics import calinski_harabaz_score
    calinski_harabasz_score = calinski_harabaz_score
else:
    calinski_harabaz_score = calinski_harabasz_score

# Render floats in DataFrame output with 20 decimal places.
# The fully qualified option key is required: the bare 'precision' pattern is
# ambiguous on pandas versions that also define 'styler.format.precision',
# where set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 20)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides scikit-learn's convergence and deprecation
# warnings -- consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

In [2]:
# Read every line of the raw text file. The context manager guarantees the
# handle is closed even if reading fails part-way through.
with open('./health.txt', 'r', encoding = "utf8") as file:
    lines = file.readlines()

# Drop the first line (presumably a header row -- TODO confirm against the file).
# `del` is used instead of `pop` so the discarded value is never echoed as
# cell output; like `pop(0)`, it raises IndexError on an empty file.
del lines[0]

In [3]:
# Load the vector table (presumably one word2vec embedding per text line --
# TODO confirm). The CSV has no header row, so pandas assigns integer
# column labels.
dt = pd.read_csv('./word2vec.csv', header=None)

# Preview the first five rows.
dt.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.0135948248207569,-0.068771593272686,-0.0131260938942432,0.0060244412161409,0.0379218868911266,-0.0208724178373813,-0.0103245936334133,0.0080185951665043,0.008063780143857,0.0113811697810888,...,-0.0209649987518787,0.062472891062498,-5.98852820985e-05,-0.0626214668154716,-0.0195863768458366,0.0460067316889762,0.0038109917659312,0.011133030988276,-0.0402083806693553,-0.0419677905738353
1,0.0250091981142759,-0.0255860704928636,0.0122374715283513,0.0205671042203903,0.0145229594781994,-0.0187047235667705,-0.0583946853876113,0.0280539095401763,-0.0323618501424789,0.0205821022391319,...,-0.0349767245352268,0.0304748844355344,0.0388990752398967,0.0169257745146751,-0.0006659875507466,0.0073109208606183,0.001348104327917,0.0084143327549099,-0.0139268636703491,0.0023993947543203
2,0.0315438583493232,-0.0258821956813335,-0.0606167055666446,0.0359916426241397,0.037742294371128,-0.0211320202797651,-0.0410231985151767,0.011076271533966,0.0032471425365656,0.0144266700372099,...,-0.0001459375052945,0.0002925574663095,0.0293335225433111,-0.005734703503549,-0.0256829299032688,0.0286939274519681,0.0286918338388204,0.0490430071949958,-0.0112496055662632,-0.0294231493026018
3,0.0587140209972858,-0.0802512541413307,-0.0261009987443685,0.0604606568813324,0.0167144946753978,-0.0052052671089768,-0.0374407581984996,0.0514717474579811,-0.0635047405958175,0.0032471478916704,...,-0.0218793526291847,0.0171033106744289,0.0164721198379993,-0.0049530612304806,-0.0015206087846308,-0.0188216306269168,0.0202157106250524,0.0257079694420099,-0.0165431164205074,0.0129728140309452
4,0.0233661979436874,-0.0656670257449149,-0.0156006179749965,0.0226019974797964,0.030365414917469,-0.0605902820825576,-0.071055382490158,-0.0086413025856018,-0.0187280010432004,0.0428804829716682,...,0.014219170436263,0.0516133457422256,0.0273156724870204,-0.030006006360054,-0.0282086934894323,0.0642558857798576,-0.019291017204523,0.0037249843589961,-0.0499372296035289,-0.0159748680889606


In [4]:
# Scale each row of the feature matrix to unit L2 norm before clustering.
# Alternatives that were tried and left disabled:
#   minmax_scale(dt.values)  -- per-feature min-max scaling
#   dt.values                -- the raw, unscaled vectors
data = normalize(dt.values, norm='l2', axis=1)

In [5]:
# Fit Affinity Propagation on the normalized vectors and report the elapsed time.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# time is adjusted during a long run like this one.
# NOTE(review): newer scikit-learn releases accept `random_state` here --
# consider setting it so the clustering is reproducible.
start = time.perf_counter()

clustering = AffinityPropagation(max_iter = 300)
clustering.fit(data)

finish = time.perf_counter()

# Keep the fitted results under short names for the cells below.
labels = clustering.labels_                                    # cluster id per sample
cluster_centers = clustering.cluster_centers_                  # exemplar vectors
cluster_centers_indices = clustering.cluster_centers_indices_  # row index of each exemplar

# Elapsed time is reported in minutes.
calculated_time = (finish - start) / 60

print("Process Time in Minutes: " + str(calculated_time))

Process Time in Minutes: 26.2623890042305


In [7]:
# Calinski-Harabasz index of the clustering (higher means better-separated
# clusters), with the time taken to compute it.
# Uses the correctly spelled `calinski_harabasz_score`; the original
# `calinski_harabaz_score` spelling was removed from later scikit-learn releases.
# perf_counter() is the monotonic clock meant for interval timing, and the
# timer stops before printing so only the metric computation is measured.
start = time.perf_counter()

coefficient = calinski_harabasz_score(data, labels)

finish = time.perf_counter()

print("Coeficiente de Calinski " + str(coefficient))

# Elapsed time is reported in minutes.
calculated_time = (finish - start) / 60

print("Time in Minutes: " + str(calculated_time))

Coeficiente de Calinski 6.19436440022664
Time in Minutes: 0.005017860730489095


In [8]:
# Davies-Bouldin index of the clustering (lower means better-separated
# clusters), with the time taken to compute it.
# perf_counter() is the monotonic clock meant for interval timing, and the
# timer stops before printing so only the metric computation is measured.
start = time.perf_counter()

coefficient = davies_bouldin_score(data, labels)

finish = time.perf_counter()

print("Coeficiente de Davies " + str(coefficient))

# Elapsed time is reported in minutes.
calculated_time = (finish - start) / 60

print("Time in Minutes: " + str(calculated_time))

Coeficiente de Davies 2.451856483638026
Time in Minutes: 0.014574122428894044


In [9]:
# Mean silhouette coefficient of the clustering (ranges from -1 to 1, higher
# is better), with the time taken to compute it.
# perf_counter() is the monotonic clock meant for interval timing, and the
# timer stops before printing so only the metric computation is measured.
start = time.perf_counter()

coefficient = silhouette_score(data, labels)

finish = time.perf_counter()

print("Coeficiente de Silhueta " + str(coefficient))

# Elapsed time is reported in minutes.
calculated_time = (finish - start) / 60

print("Time in Minutes: " + str(calculated_time))

Coeficiente de Silhueta 0.02598913275540169
Time in Minutes: 0.09347955385843913


In [17]:
# Dimensions of the exemplar matrix: (number of clusters, vector length).
np.shape(cluster_centers)

(1139, 128)

In [10]:
# Build [label, member_count] pairs for every cluster, ordered by label.
# A single np.unique pass with return_counts=True replaces the per-label
# boolean mask over dt.values, which rescanned all samples once per cluster.
# Counts are converted to plain ints, matching what len() produced before.
total_cluster = [
    [label, int(total)]
    for label, total in zip(*np.unique(labels, return_counts = True))
]

total_cluster

[[0, 12],
 [1, 16],
 [2, 7],
 [3, 4],
 [4, 19],
 [5, 41],
 [6, 47],
 [7, 5],
 [8, 16],
 [9, 4],
 [10, 5],
 [11, 25],
 [12, 6],
 [13, 36],
 [14, 24],
 [15, 29],
 [16, 35],
 [17, 28],
 [18, 38],
 [19, 56],
 [20, 21],
 [21, 20],
 [22, 30],
 [23, 26],
 [24, 3],
 [25, 22],
 [26, 41],
 [27, 8],
 [28, 37],
 [29, 3],
 [30, 1],
 [31, 27],
 [32, 22],
 [33, 6],
 [34, 35],
 [35, 88],
 [36, 29],
 [37, 64],
 [38, 34],
 [39, 21],
 [40, 5],
 [41, 37],
 [42, 6],
 [43, 28],
 [44, 60],
 [45, 8],
 [46, 29],
 [47, 4],
 [48, 16],
 [49, 40],
 [50, 20],
 [51, 9],
 [52, 19],
 [53, 5],
 [54, 22],
 [55, 20],
 [56, 44],
 [57, 34],
 [58, 56],
 [59, 11],
 [60, 31],
 [61, 32],
 [62, 14],
 [63, 12],
 [64, 65],
 [65, 3],
 [66, 30],
 [67, 29],
 [68, 37],
 [69, 5],
 [70, 23],
 [71, 22],
 [72, 8],
 [73, 3],
 [74, 27],
 [75, 34],
 [76, 32],
 [77, 1],
 [78, 25],
 [79, 3],
 [80, 41],
 [81, 3],
 [82, 11],
 [83, 24],
 [84, 69],
 [85, 36],
 [86, 54],
 [87, 11],
 [88, 17],
 [89, 28],
 [90, 22],
 [91, 23],
 [92, 5],
 [93, 30],
 

In [18]:
# Print the text of the first `show` lines assigned to one cluster.
cluster = 1   # label of the cluster to inspect
show = 30     # maximum number of lines to print

# Convert to an array so the lines can be selected with a boolean mask.
# This relies on `lines` and `labels` having the same length and order.
lines = np.array(lines)

current_data = lines[labels == cluster]

for current_datum in current_data[:show]:

    # Each line is '|'-separated and the text is the third field.
    # maxsplit=2 keeps any '|' inside the text itself; a plain split("|")
    # would cut the text at its first '|'.
    datum = current_datum.split("|", 2)[2]

    print(datum)

CDC: Misuse of garments may have led to release of bioterror bacteria at Tulane monkey lab.

A Tulane researcher has been exposed to deadly bioterror bacteria at a monkey lab.

Tobacco plant may hold key to #Ebola experimental drugs

USDA: Recalled beef may have reached 35 states

23andMe raises questions about genetic testing

CDC: 'Nightmare bacteria' spreading

Louisiana germ release likely due to lax use of lab garments 

Inspector testing positive for deadly bacteria didn’t get it at Louisiana lab: CDC  

Stability tools keep bodyweight #fitness craze off balance

#Measles may become ‘endemic’ without vaccination, proper clinical diagnosis, CDC official says

#Legionnaire's disease bacteria found at #Bronx housing complex

Energy drinks plus alcohol may encourage drunk driving

Deadly pig virus jumps to Hawaii, animal feed tested

Discovery of largest genetic fat map

Suicide alert 'may have backfired'

Holy sites 'may offer clues to antibiotic resistance'

