In [None]:
# Leftover from the scikit-learn example this notebook was adapted from.
# NOTE(review): inside a notebook kernel `__doc__` is presumably the
# auto-generated module docstring, not anything about this analysis - confirm
# and consider dropping the call.
print(__doc__)

# Code source adapted from: Jaques Grobler
# License: BSD 3 clause

import random
import matplotlib.pyplot as plt
import numpy as np
import pandas
import os
import csv
import time
# cmath.sqrt returns a complex number; `distance` below takes its `.real` part
from cmath import sqrt
from tqdm import tqdm
from sklearn.datasets import load_digits
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import normalize

import pprint

# Shared pretty-printer used by the cells below to dump the tweets of a cluster
pp = pprint.PrettyPrinter(indent=4)

In [18]:
def graph_add_scatter(x, y, c='black'):
    """Draw ``y`` against ``x`` as a scatter series on the current pyplot figure.

    ``c`` is forwarded to ``plt.scatter`` as the marker ``color``.
    """
    marker_style = {'color': c}
    plt.scatter(x, y, **marker_style)

def graph_add_line(x, y, c='black'):
    """Draw ``y`` against ``x`` as a line on the current pyplot figure.

    ``c`` is forwarded to ``plt.plot`` as the line ``color``; the line width
    is fixed at 3.
    """
    line_style = {'color': c, 'linewidth': 3}
    plt.plot(x, y, **line_style)

def plot(name=""):
    """Render the current pyplot figure, then close it.

    When ``name`` is anything other than the empty string the figure is first
    written to disk with ``plt.savefig(name)``.
    """
    # Called without arguments, exactly as before
    plt.xticks()
    plt.yticks()

    save_requested = name != ""
    if save_requested:
        plt.savefig(name)

    plt.show()
    plt.close()

def distance(v1, v2):
    """Return the Euclidean distance between ``v1`` and ``v2``.

    Only the first ``len(v1)`` positions are compared, so ``v2`` must be at
    least as long as ``v1``. The square root is taken with ``cmath.sqrt`` and
    its real part is returned.
    """
    squared_total = sum(
        (v1[k] - v2[k]) * (v1[k] - v2[k]) for k in range(len(v1))
    )
    return sqrt(squared_total).real

def GetClosest(v, i):
    """Return the index of the vector in ``v`` nearest to ``v[i]``.

    ``v[i]`` itself is skipped. Ties keep the lowest index, because a later
    candidate only wins when it is strictly closer. Returns ``-1`` when ``v``
    has no element other than ``v[i]``.
    """
    closest = -1
    closest_dist = None
    for j in range(len(v)):
        if j == i:
            continue
        candidate_dist = distance(v[j], v[i])
        # Remember the best distance instead of recomputing it on every
        # iteration; this halves the number of `distance` calls.
        if closest == -1 or closest_dist > candidate_dist:
            closest = j
            closest_dist = candidate_dist
    return closest

def GetGroup(labels, group, offset=0):
    """Return the positions in ``labels`` whose value equals ``group``.

    ``offset`` is added to every returned position, which lets the caller map
    positions inside a slice back to positions in the full collection.
    """
    return [
        position + offset
        for position, label in enumerate(labels)
        if label == group
    ]

class DataReader():
    """In-memory index of the text lines stored in a pipe-delimited file.

    Every entry of ``self.lines`` is a two-item list ``[text, is_outlier]``.
    ``text`` is the zero-based file line number, a space, and the last
    ``|``-separated field of that line (trailing newline included).
    ``is_outlier`` starts as ``False`` and is switched on by ``SetOutliers``.
    """

    def __init__(self, file_path="health-dataset/health.txt"):
        """Load every line of ``file_path`` into ``self.lines``."""
        self.lines = []
        # Context manager so the handle is closed even if parsing fails
        with open(file_path) as f:
            for i, line in enumerate(f):
                s = line.split("|")
                self.lines.append([str(i) + " " + s[-1], False])

    def GetLine(self, line):
        """Return the text stored at position ``line``.

        An out-of-range position (negative, or past the end) yields the
        message ``"<line> Is Not a Valid Line"`` instead of raising.
        """
        # `or`, not `and`: a number can never be both negative and past the end
        if line < 0 or line >= len(self.lines):
            return str(line) + " Is Not a Valid Line"
        return self.lines[line][0]

    def GetLineGroup(self, lines):
        """Return the texts at positions ``lines``, skipping flagged outliers."""
        return [self.lines[i][0] for i in lines if not self.lines[i][1]]

    def DeleteLines(self, d, verbose=True):
        """Delete the entries whose positions are listed in ``d``.

        Positions are removed from the highest to the lowest so that earlier
        deletions do not shift the ones still pending. Each entry is printed
        before removal unless ``verbose`` is false.
        """
        for i in sorted(set(d), reverse=True):
            if verbose:
                print(self.lines[i])
            del self.lines[i]

    def DeleteOutliers(self, verbose=True):
        """Delete every entry flagged as an outlier.

        Entries are visited from the end so deletions do not disturb the
        positions still to be checked. Each removed entry is printed unless
        ``verbose`` is false.
        """
        for i in range(len(self.lines)-1,-1,-1):
            if self.lines[i][1]:
                if verbose:
                    print(self.lines[i])
                del self.lines[i]

    def SetOutliers(self, o):
        """Flag the entries at positions ``o`` as outliers."""
        for i in o:
            self.lines[i][1] = True

# Reader shared by the cells below; built with the default path
# "health-dataset/health.txt", so the file is read as soon as this cell runs.
data_reader = DataReader()

# Read Dataset

In [None]:
# Load the word2vec vectors, normalise them with sklearn's `normalize`
# (default arguments), then split them by row position.
# NOTE(review): read_csv treats the first CSV row as a header by default -
# confirm the file has one, otherwise rows shift by one relative to health.txt.
dataset = normalize(pandas.read_csv('health-dataset/word2vec.csv').values)

# Rows 0..9999 are used for training, rows 10000..13226 for validation
data_train = dataset[:10000, :]
data_validation = dataset[10000:13227, :]

# K-Means Algorithm

In [None]:
# Sweep the number of clusters and record, for each value, the training cost
# (inertia) and two quality scores measured on the validation split.
kmeans_costs = []
kmeans_clusters = []
kmeans_silhouette = []
kmeans_davies = []

# Same values as before: 10..90 in steps of 10, then 100..2000 in steps of 100
cluster_counts = list(range(10, 100, 10)) + list(range(100, 2001, 100))

for n in cluster_counts:
    print("Training KMeans for " + str(n) + " clusters")
    kmeans = KMeans(n_clusters=n)
    kmeans.fit(data_train)

    # `kmeans.labels_` has one entry per *training* row, so it cannot be
    # scored against `data_validation`; assign the validation rows instead.
    labels = kmeans.predict(data_validation)
    s = silhouette_score(data_validation,labels)
    d = davies_bouldin_score(data_validation,labels)
    print("\t\t Cost: " + str(kmeans.inertia_))
    print("\t\t Silhouette Score: " + str(s))
    print("\t\t Davies Bouldin Score: " + str(d))

    kmeans_costs.append(kmeans.inertia_)
    kmeans_clusters.append(n)
    kmeans_silhouette.append(s)
    kmeans_davies.append(d)

In [None]:
# One figure per metric recorded during the sweep, each plotted against the
# number of clusters and saved under the given file name.
for metric_values, file_name in (
    (kmeans_costs, "cost_nclusters_10_2000"),
    (kmeans_silhouette, "silhouette_nclusters_10_2000"),
    (kmeans_davies, "davies_nclusters_10_2000"),
):
    graph_add_line(kmeans_clusters, metric_values)
    graph_add_scatter(kmeans_clusters, metric_values, c='blue')
    plot(file_name)

## Getting Clusters

Here we cluster the tweets using K-Means with 100 and with 1000 clusters, so that the resulting groups can be compared.

In [None]:
# Fit a 100-cluster K-Means model on the training split. The fitted estimator
# is the cell's last expression, so the notebook displays its repr.
kmeans_100 = KMeans(n_clusters=100)
kmeans_100.fit(data_train)

In [None]:
# Training assignments: one label per row of `data_train`. They are used below
# to list the training tweets that fell into each cluster.
labels = kmeans_100.labels_

# The quality scores are measured on the validation split, so they need labels
# for the validation rows; `labels_` has the wrong length for that and made
# the scoring calls fail.
validation_labels = kmeans_100.predict(data_validation)
s100 = silhouette_score(data_validation,validation_labels)
print("Cost: " + str(kmeans_100.inertia_))
print("Silhouette Score for 100 Clusters is: " + str(s100))
d100 = davies_bouldin_score(data_validation,validation_labels)
print("Davies Bouldin Score for 100 Clusters is: " + str(d100))


centers_100 = kmeans_100.cluster_centers_
for i in range(100):
    print("\n\nGroup " + str(i))
    print("Closest Group: " + str(GetClosest(centers_100,i)) + "\n")
    pp.pprint(data_reader.GetLineGroup(GetGroup(labels,i)))

In [None]:
# Fit a 1000-cluster K-Means model on the training split. The fitted estimator
# is the cell's last expression, so the notebook displays its repr.
kmeans_1000 = KMeans(n_clusters=1000)
kmeans_1000.fit(data_train)

In [None]:
# Assign every validation row to one of the 1000 fitted clusters and score
# that assignment.
labels = kmeans_1000.predict(data_validation)
s1000 = silhouette_score(data_validation,labels)
print("Silhouette Score for 1000 Clusters is: " + str(s1000))
d1000 = davies_bouldin_score(data_validation,labels)
print("Davies Bouldin Score for 1000 Clusters is: " + str(d1000))

centers_1000 = kmeans_1000.cluster_centers_
print(GetClosest(centers_1000,58))
for i in range(1000):
    print("\n\nGroup " + str(i))
    # The nearest group is looked up among this model's own centroids. The
    # previous code read `db.components_`, but `db` is a DBSCAN model that is
    # only created further down the notebook.
    print("Closest Group: " + str(GetClosest(centers_1000,i)) + "\n")
    # Validation rows start at row 10000 of the full dataset
    pp.pprint(data_reader.GetLineGroup(GetGroup(labels,i,offset=10000)))

## Affinity Propagation

In [None]:
from sklearn.cluster import AffinityPropagation

# Fit Affinity Propagation with its default parameters on the training split.
# The fitted estimator is the cell's last expression, so its repr is displayed.
print("Training Affinity")
affinity = AffinityPropagation()
affinity.fit(data_train)

In [None]:
# Assign the validation rows to the fitted exemplars and score the assignment
labels = affinity.predict(data_validation)

affinity_silhouette = silhouette_score(data_validation, labels)
print("Silhouette Score: " + str(affinity_silhouette))
affinity_davies = davies_bouldin_score(data_validation, labels)
print("Davies Bouldin Score: " + str(affinity_davies))

# One group per exemplar found during fitting
exemplar_indices = affinity.cluster_centers_indices_
print(exemplar_indices)
for i in range(len(exemplar_indices)):
    print("Group " + str(i) + "\n\n")
    # Validation rows start at row 10000 of the full dataset
    group_rows = GetGroup(labels, i, offset=10000)
    pp.pprint(data_reader.GetLineGroup(group_rows))

## PCA Analysis

Here we take the best algorithm and number of clusters found in the previous experiments and run them again with different numbers of features, using PCA to reduce the dimensionality.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# DBSCAN is only imported in a later cell of this notebook; importing it here
# lets this cell run on a fresh kernel (Restart & Run All).
from sklearn.cluster import DBSCAN

n_features = data_train.shape[1]

pca = PCA()

pca.fit(dataset)
variance = pca.explained_variance_ratio_.cumsum()

# Zero-based index of the first component at which the cumulative explained
# variance exceeds 0.95 (stays 0 if that never happens)
fn = 0
for i in range(len(variance)):
    if variance[i] > 0.95:
        fn = i
        break

print("Found feature subset of size " + str(fn+1) + " with variance of " + str(variance[fn]))
# `fn` is an index, so the subset announced above keeps fn + 1 components
pca = PCA(n_components=fn+1)
data_pca = pca.fit_transform(dataset)

# Repeat the clustering for every dimensionality from all features down to 1
for i in range(n_features, 0, -1):
    pca = PCA(n_components=i)
    data_pca = pca.fit_transform(dataset)


    # Cluster the PCA-reduced data with DBSCAN (default parameters)
    print("Training DBSCAN with " + str(i) + " features")
    db_pca = DBSCAN()
    db_pca.fit(data_pca)

    labels = db_pca.labels_
    # The label -1 marks noise and is not counted as a cluster
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print(labels)
    print(n_clusters_)

# NOTE(review): these scores are computed once, after the loop, so they only
# describe the last iteration (a single PCA component) - confirm whether they
# were meant to be reported for every dimensionality.
s = silhouette_score(data_pca,labels)
d = davies_bouldin_score(data_pca,labels)

print("\t\t Data Variance: " + str(pca.explained_variance_ratio_[0]))
print("\t\t Silhouette Score: " + str(s))
print("\t\t Davies Bouldin Score: " + str(d))

In [None]:
# `labels` here comes from the DBSCAN fit of the previous cell, which covers
# every row of `dataset`: label positions already are line numbers, so no
# offset applies, and the number of groups is whatever DBSCAN found (the noise
# label -1 excluded) rather than a fixed 1000.
n_groups = len(set(labels)) - (1 if -1 in labels else 0)
for i in range(n_groups):
    print("\n\nGroup " + str(i) + "\n")
    pp.pprint(data_reader.GetLineGroup(GetGroup(labels,i)))

In [None]:
# Sweep the number of PCA components (1, 11, 21, ...) and, for each value,
# train a 1000-cluster K-Means on the projected training split and score it on
# the projected validation split.
n_features = data_train.shape[1]

kmeans_costs, kmeans_feature_number = [], []
kmeans_silhouette, kmeans_davies = [], []

for fn in range(1, n_features, 10):
    pca = PCA(n_components=fn)
    pca.fit(data_train)

    print("Training KMeans for 1000 clusters with " + str(fn) + " features")
    data_train_pca = pca.transform(data_train)
    kmeans_pca = KMeans(n_clusters=1000)
    kmeans_pca.fit(data_train_pca)

    # The projection fitted on the training split is reused for validation
    data_validation_pca = pca.transform(data_validation)
    labels = kmeans_pca.predict(data_validation_pca)
    s = silhouette_score(data_validation_pca, labels)
    d = davies_bouldin_score(data_validation_pca, labels)

    # Total explained variance retained by the `fn` components
    retained_variance = pca.explained_variance_ratio_.cumsum()[-1]
    for caption, value in (
        ("\t\t Data Variance: ", retained_variance),
        ("\t\t Cost: ", kmeans_pca.inertia_),
        ("\t\t Silhouette Score: ", s),
        ("\t\t Davies Bouldin Score: ", d),
    ):
        print(caption + str(value))

    for history, value in (
        (kmeans_costs, kmeans_pca.inertia_),
        (kmeans_feature_number, fn),
        (kmeans_silhouette, s),
        (kmeans_davies, d),
    ):
        history.append(value)

# One figure per recorded metric, plotted against the number of components
for metric_values, file_name in (
    (kmeans_costs, "cost_features_10_2000"),
    (kmeans_silhouette, "silhouette_features_10_2000"),
    (kmeans_davies, "davies_features_10_2000"),
):
    graph_add_line(kmeans_feature_number, metric_values)
    graph_add_scatter(kmeans_feature_number, metric_values, c='blue')
    plot(file_name)

# DBSCAN - Detecting Outliers

Since the results of the previous experiments were subpar, we try to detect outliers and then cluster again using DBSCAN.

In [None]:
from sklearn.cluster import DBSCAN

Here, we try different values for the "eps" parameter, which is the maximum distance between two samples for them to be considered neighbours.

We then plot graphs to see the results

In [None]:
# For every candidate `eps`, fit DBSCAN on the whole dataset and record the
# silhouette score, the number of outliers, the number of clusters and the
# size of the largest cluster.
n_eps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
outliers = []
clusters = []
sil = []
biggest_cluster = []
for eps in n_eps:
    print("DBSCAN for eps = " + str(eps))
    db = DBSCAN(eps=eps)
    db.fit(dataset)
    labels = db.labels_

    # silhouette_score needs at least two distinct labels and raises
    # otherwise (e.g. when every sample is noise); record NaN so the sweep
    # keeps going and the gap is visible in the plot.
    if len(set(labels)) > 1:
        sil.append(silhouette_score(dataset,labels))
    else:
        sil.append(float('nan'))

    # Samples labelled -1 are the ones DBSCAN treats as noise
    outliers.append(len(data_reader.GetLineGroup(GetGroup(labels,-1))))

    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    clusters.append(n_clusters)

    cluster_sizes = [len(data_reader.GetLineGroup(GetGroup(labels,k))) for k in range(n_clusters)]
    # max() of an empty list raises, so report 0 when no cluster was found
    biggest_cluster.append(max(cluster_sizes) if cluster_sizes else 0)


In [None]:
# One unsaved figure per series recorded during the eps sweep, each preceded
# by a separator line and a caption.
for caption, series in (
    ("eps x number of clusters found", clusters),
    ("eps x silhouette score", sil),
    ("eps x number of outliers", outliers),
    ("eps x biggest cluster", biggest_cluster),
):
    print("------------------------------------------")
    print(caption)
    graph_add_line(n_eps, series)
    graph_add_scatter(n_eps, series, c='blue')
    plot()

With the results above, we can see that between eps = 0.5 and eps = 0.8 the number of outliers diminishes while the size of the biggest cluster increases. We think that all the outliers are being grouped together into that cluster. We check this below.

In [None]:
# Get outliers: the samples DBSCAN labels -1 at eps = 0.5
db = DBSCAN(eps=0.5)
db.fit(dataset)

labels = db.labels_

outliers = data_reader.GetLineGroup(GetGroup(labels,-1))

# Get Biggest cluster at eps = 0.8
db = DBSCAN(eps=0.8)
db.fit(dataset)

labels = db.labels_

# Count the clusters of *this* fit. The value left behind by the eps sweep
# belongs to a different eps and would scan the wrong range of labels.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
group_lengths = [len(data_reader.GetLineGroup(GetGroup(labels,i))) for i in range(n_clusters)]
if group_lengths:
    biggest_c = data_reader.GetLineGroup(GetGroup(labels,group_lengths.index(max(group_lengths))))
else:
    # No cluster was found, so there is nothing to compare against
    biggest_c = []

# Compare if biggest cluster has outliers (set lookup keeps this linear)
outlier_lookup = set(outliers)
common = 0
for d in biggest_c:
    if d in outlier_lookup:
        common+=1

print("number of outliers = " + str(len(outliers)))
print("Number of elements in common = " + str(common))

With this result, we can see that most of the outliers found at eps = 0.5 end up inside a single new group at eps = 0.8. This indicates that there are a lot of outliers in our dataset.

In [19]:
# Refit DBSCAN with eps = 0.5 on the whole dataset; the next cell reads its
# labels to find the outliers. The fitted estimator is the cell's last
# expression, so the notebook displays its repr.
db = DBSCAN(eps=0.5)
db.fit(dataset)

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=5, n_jobs=None, p=None)

Next, we remove all the outliers and fit DBSCAN again for the final results

In [20]:
# Report the samples DBSCAN labelled -1, then drop them from both the feature
# matrix and the text reader so the two stay aligned row for row.
labels = db.labels_

for banner in ("------------------------------------------------------", "OUTLIERS\n"):
    print(banner)
outliers = data_reader.GetLineGroup(GetGroup(labels, -1))
print("There are " + str(len(outliers)) + " outliers in the dataset\n")
pp.pprint(outliers)

# Row positions of the outliers, computed once and shared by both removals
outlier_rows = [row for row, label in enumerate(labels) if label == -1]
data_reader.SetOutliers(outlier_rows)

# Delete Outliers
new_data = np.delete(dataset, outlier_rows, axis=0)
data_reader.DeleteOutliers()

------------------------------------------------------
OUTLIERS

There are 12467 outliers in the dataset

[   '0 An abundance of online info can turn us into e-hypochondriacs. Or, worse, lead us to neglect getting the care we need\n',
    '1 A plant-based diet that incorporates fish may be the key to preventing colorectal cancers:\n',
    "2 It doesn't take much to damage your hearing at a sports bar or nightclub. That's why a billion people are at risk.\n",
    '3 RT @CNN: Forever young? Discover this island\xe2\x80\x99s secrets to longevity on #TheWonderList w/ @BillWeirCNN \n',
    '4 RT @CNN: Is post-traumatic stress disorder in your genes? A simple blood test may one day help tell you\n',
    '5 Maysoon Zayid, a touring standup comic with Cerebral Palsy, has a message to share.\n',
    "6 How women can wipe out Alzheimer's, from @mariashriver. \n",
    '7 RT @CNNOpinion: Women can defeat #Alzheimers, says @mariashriver. #WipeOutAlz challenge will make it happen.\n',
    '8 Is it t

    '1943 Teachers say school is making them sick\n',
    '1944 RT @cindyrodriguez: Hispanics are a high-risk group for depression, substance abuse and anxiety via @cnnhealth\n',
    "1945 Fat-burning supplement pulled from shelves after it's linked to liver failure, acute hepatitis\n",
    '1946 #Latinos struggle to find help for #mental #health issues\n',
    '1947 Health officials are investigating a possible tuberculosis exposure in the NICU at a Nevada hospital.\n',
    "1948 Dental #health crisis could create 'State of Decay'\n",
    "1949 RT @fit2fat2fit: Keep your #exercise routine fun and interesting, if you enjoy your workout you'll be less likely to skip it.  #workout #ti\xe2\x80\xa6\n",
    '1950 Many states are lacking effective strategies to curb prescription drug abuse, a new report finds #health\n',
    "1951 RT @cnnireport: The shocking, agonizing disorder you've never heard of; Learn about trigeminal neuralgia:\n",
    '1952 A candid conversation with her @GMA boss in

    '3394 Hospital workers fired over flu shot, @elizcohencnn reports\n',
    "3395 Doctors: Hillary Clinton's future 'as good as her past'\n",
    "3396 @jesterjules How unAmerican... oh no wait, that's apple pie. Then we suggest checking out our other #getfit tips every day this month!\n",
    '3397 Today\'s #getfit tip: @MichaelPollan says: "If you\'re not hungry enough to eat an apple, you\'re not hungry."\n',
    '3398 Children with food allergies vulnerable to bullies, research finds\n',
    '3399 RT @danmaurer: Just call me Methuselah..."@cnnhealth: New research says being overweight (not obese) may lead to a longer life. \n',
    '3400 RT @ASH_hematology: Clinton\'s #bloodclot "relatively rare," Dr. Mary Cushman, Chair of ASH Subcmte on Quality of Care, tells @CNNHealth: ...\n',
    '3401 New research says being overweight (not obese) may lead to a longer life. Anyone else confused?\n',
    '3402 Five simple things that can keep you healthy in the #NewYear\n',
    "3403 Today's

    '4930 Waiting for Mr. Right may be an evolutionary wrong\n',
    '4931 People with mental disorders risk an early death\n',
    '4932 Georgia boy who had open heart surgery \xe2\x80\x98thriving\xe2\x80\x99 with pacemaker\n',
    '4933 Family walking from #Disneyland to #Disney World to honor sister who died of #cancer\n',
    '4934 Plant-based diet may lower obese children\xe2\x80\x99s risk of heart disease\n',
    '4935 In England, no more #smoking in cars with kids\n',
    '4936 Officials seek students who partied with Oregon man stricken by meningococcal disease\n',
    '4937 Happy words dominate most languages\n',
    "4938 Which doctors should 'own' end-of-life planning?\n",
    '4939 #Alcohol may help elderly women, but not men, live longer\n',
    '4940 Brain stents show big promise for certain stroke patients\n',
    '4941 Teacher #depression may affect child learning\n',
    '4942 Rescued pit bulls fight stigma by guiding people in need\n',
    '4943 Milk #allergy? Beware 

    '6722 10 ways to make sure you rise-and-shine for your morning workout:\n',
    '6723 Can being an alpha female ruin your relationship?\n',
    '6724 The 7 biggest antidepressant problems, solved\n',
    "6725 He's not in the #mood? These 7 surprising #turnoffs may be to blame:\n",
    "6726 6 superfoods you haven't tried:\n",
    '6727 Attention Thin Mint fans! The best and worst Girl Scout cookies for your diet:\n',
    '6728 Every diet out there...and which one is right for you:\n',
    '6729 Women have been told some pretty crazy things regarding their #health over the last century, read them here:\n',
    '6732 The 7 best yoga poses to soothe back pain and feel great all day:\n',
    '6734 On the fence whether you should try P90X? Let us help you decide:\n',
    "6736 Food Is Love! This Valentine's Day join EH &amp; @Eatsmarttbd to learn about foods that love you back! Follow #HealthTalk on 2/14 @ 1 pm ET\n",
    '6738 The best meat-free protein sources:\n',
    '6739 #Vitamin

    '8709 7 ways to spice up your water:\n',
    '8710 Q2: How did you find out how many calories, saturated fat, and sodium were in each dish? #HealthTalk\n',
    '8711 RT @CSPI: A1 Here are some highlights from Xtreme Eating 2013\n',
    "8712 RT @CSPI: A1 And Americans have the right to know exactly what they're eating and just how extreme these dishes can be. #healthtalk\n",
    '8713 RT @ChangeYourScale: @EverydayHealth @CSPI #xtremeeating #HealthTalk Helpful perspective of # of calories in ONE meal out compared with recommended daily intake.\n',
    "8714 RT @AdvocateSH: @EverydayHealth @CSPI Your list really makes me rethink my dinner choices! I can't believe the calorie counts! #XtremeEating #HealthTalk\n",
    "8716 @energybits welcome! so happy you're joining us! #HealthTalk\n",
    '8718 @advocatesh Welcome! Glad you could join us! #HealthTalk\n',
    '8719 Q1: What does @CSPI hope to accomplish with #XtremeEating? #HealthTalk\n',
    "8720 RT @CSPI: I'm Paige Einstein, staf

    '10646 Does wellbeing promote good mental health?\n',
    '10647 Clegg sets mental health wait limits\n',
    '10648 WHO warns of Ebola hospital risks\n',
    '10649 VIDEO: Can the humble radio save lives?\n',
    '10650 Spain monitors Ebola nurse contacts\n',
    '10651 VIDEO: Are post-natal depression signs missed?\n',
    '10652 VIDEO: Ebola: How can you catch it?\n',
    '10653 "Instagram for doctors" takes off\n',
    '10654 Can pregnant women ever drink?\n',
    '10655 How not to catch Ebola\n',
    '10656 No Ebola screening for UK arrivals\n',
    "10657 VIDEO: Ebola: 'Stringent procedures' in UK\n",
    '10658 Nobel winner warns over immigration\n',
    '10659 Spain investigates Ebola infection\n',
    '10660 VIDEO: Womb transplant mother speaks\n',
    "10661 Inside Sierra Leone's Ebola clinics\n",
    '10662 Why people are so worried about NHS\n',
    "10663 Nobel Prize for the brain's GPS\n",
    "10664 VIDEO: 'At risk' urged to get flu jabs\n",
    '10665 Winter flu jab

    "12432 VIDEO: Japan's chubby women\n",
    "12433 Mental health cases 'rise in young'\n",
    '12434 VIDEO: More NHS contracts won by private firms\n',
    '12435 VIDEO: Mental health issues rising - GPs\n',
    "12436 Weightlifting 'cuts diabetes risk'\n",
    '12437 Women 50-plus urged to have smears\n',
    '12438 VIDEO: Bewitching poison: alcohol as medicine\n',
    '12439 A&amp;E locum costs: Your experiences\n',
    '12440 Viewpoints: How to curb obesity\n',
    '12441 Families to foster elderly people\n',
    '12442 Sharp rise in A&amp;E locum doctor spend\n',
    '12443 UK cancer diagnoses top 330,000\n',
    '12444 UK cancer diagnoses top 330,000\n',
    "12445 Child shock guidelines 'are deadly'\n",
    "12446 Green tea 'can impedes beta blocker'\n",
    '12447 VIDEO: India hails polio-free landmark\n',
    '12448 VIDEO: Shock tactics in fight on obesity\n',
    '12449 Ex-psychiatric patient recalls abuse\n',
    "12450 'Quicker' drug for breast cancer\n",
    '12451 AUDI

['12447 VIDEO: India hails polio-free landmark\n', True]
["12446 Green tea 'can impedes beta blocker'\n", True]
["12445 Child shock guidelines 'are deadly'\n", True]
['12444 UK cancer diagnoses top 330,000\n', True]
['12443 UK cancer diagnoses top 330,000\n', True]
['12442 Sharp rise in A&amp;E locum doctor spend\n', True]
['12441 Families to foster elderly people\n', True]
['12440 Viewpoints: How to curb obesity\n', True]
['12439 A&amp;E locum costs: Your experiences\n', True]
['12438 VIDEO: Bewitching poison: alcohol as medicine\n', True]
['12437 Women 50-plus urged to have smears\n', True]
["12436 Weightlifting 'cuts diabetes risk'\n", True]
['12435 VIDEO: Mental health issues rising - GPs\n', True]
['12434 VIDEO: More NHS contracts won by private firms\n', True]
["12433 Mental health cases 'rise in young'\n", True]
["12432 VIDEO: Japan's chubby women\n", True]
["12431 Comedians 'have psychotic traits\xe2\x80\x99\n", True]
["12430 Rates of gout in UK 'soaring'\n", True]
["12429 Gene

['9992 Ebola patient arrives in London\n', True]
['9991 VIDEO: How trials helped patients in 2014\n', True]
['9990 First Ebola case linked to bat play\n', True]
['9989 Plasma treatment for UK Ebola patient\n', True]
['9988 Frontline NHS staff given honours\n', True]
['9987 VIDEO: UK Ebola screening under scrutiny\n', True]
['9986 Ebola screening tests under scrutiny\n', True]
['9985 Experimental drug for UK Ebola nurse\n', True]
["9984 Babies of addicts 'display problems'\n", True]
["9983 VIDEO: 2014's medical milestone as man walks again\n", True]
["9982 VIDEO: Steps to beat 'hidden killer' sepsis\n", True]
["9981 Most cancer types 'just bad luck'\n", True]
["9980 VIDEO: World's 'epic battle' against Ebola\n", True]
["9979 Ebola flights' UK passengers traced\n", True]
["9978 Learning disability checks 'success'\n", True]
['9977 UK Ebola nurse critical - hospital\n', True]
['9976 Labour in new NHS attack on Tories\n', True]
["9975 VIDEO: Cameron: 'NHS is not unaffordable'\n", True]
['9

['7121 Extra pounds, extra lifespan? Why one study says yes:\n', True]
["7120 How bad are the U.S.'s most famous fattening foods? We take a look:\n", True]
['7119 Should I do #cardio or #weightlifitng first at the #gym? @HealthyLiving investigates:\n', True]
['7117 Unlocking the fountain of youth -- with your DNA?\n', True]
['7116 A #newstudy uncovers the repercussions of a younger drinking age:\n', True]
['7115 The 9 worst ways to treat #depression\n', True]
['7113 #Breastfeeding Still Less Common for Black Babies\n', True]
['7112 Keep your bones strong with these 7 exercises:\n', True]
['7111 Why alcohol + diet soda can lead to increased safety risks:\n', True]
['7110 The makers of Pom Wonderful pomegranate juice are not allowed to make any health claims, says the FTC:\n', True]
['7109 Anyone looking for tips on weight loss, working out, healthy eating and everything in between? join our new group here:\n', True]
['7108 When we say someone died of old age, what does that really mean?

['4781 #California college student diagnosed with #meningitis-related infection\n', True]
['4780 Best and worst foods to eat when you\xe2\x80\x99re #sick\n', True]
['4779 5 heart-healthy superfoods\n', True]
['4778 See how doctors #3D print a new trachea\n', True]
['4777 Man gets #bionic eye, sees family for first time in 10 years\n', True]
['4776 4 more #measles cases reported in #California\n', True]
['4775 US government report outlines foods most prone to pathogens\n', True]
['4774 #Ebola drug shows some promise in first tests in West Africa\n', True]
['4773 New screening tests for hard-to-spot breast #cancers\n', True]
['4772 Fist-clinching fury raises heart attack risk\n', True]
['4771 New York attorney general expands herbal #supplements probe\n', True]
['4770 Final frontier for school nutrition: Bake sales\n', True]
['4769 Merck grants free license for pediatric #HIV drug\n', True]
['4768 Endurance athletes may be drinking too much water, studies suggest\n', True]
["4767 World's

["2396 Brain teaser! Who is @CNN's favorite neurosurgeon? Hint: he's joining us for a live chat at 2:30 ET\n", True]
['2395 Is "salmon" the answer to every brain dilemma? Find out in 1 hr during live chat w/ @DrSanjayGupta\n', True]
['2394 Excited for @HopkinsMedicine @MayoClinic @RonPetersenMD and @YonasGeda to join us for our #AskSanjay chat at 2:30 ET!\n', True]
["2393 Hey there! @DrSanjayGupta here. I'm taking over this account until 3p ET to answer your questions about brain health. Let's chat! #AskSanjay\n", True]
['2392 . @lalaland7 there is no hard evidence ginkgo biloba or other supplements help ward off alz disease \xe2\x80\x93SG\n', True]
['2391 . @cooper243 the #brain often find ways to rewire around the damaged area. so your short-term memory may gradually improve. \xe2\x80\x93SG #AskSanjay\n', True]
['2390 i\xe2\x80\x99m seeing several q\xe2\x80\x99s about fish oil helping after a brain injury. yes, it can help heal the brain. -SG\n', True]
['2389 .@Bibiana_Ackbar @drsanj

['237 5 ways @CDCgov got #Ebola wrong:\n', True]
['236 RT @cnnbrk: Officials: Second health care worker at Texas hospital who cared for Thomas Eric Duncan tested positive for Ebola.\n', True]
['235 Man treated for @googleglass #addiction at Navy facility:\n', True]
['234 RT @CNN: A turnip + First Lady Michelle Obama cutting loose = One incredible Vine\n', True]
['233 Stem cells help nearly blind see\n', True]
['232 RT @AlexPappas: Passenger Wears A Hazmat Suit To Dulles Airport [PHOTO]\n', True]
['231 Are you wearing purple for #SpiritDay? They are:\n', True]
['230 Keep up to date on the latest #Ebola developments as soon as they are available:\n', True]
['229 RT @cnnbrk: Dallas #Ebola patient Nina Pham to be transferred to NIH facility in Maryland, sources say.\n', True]
['228 .@drsanjaygupta learns his great-grandfather was able to jump over a water buffalo in one leap\n', True]
['227 "Home is a feeling, for me, for my parents, for all of us." -@drsanjaygupta\n', True]
['226 What the

In [None]:
# Cluster the outlier-free data with DBSCAN (default parameters) and report
# the scores and the content of every cluster.
db = DBSCAN()
db.fit(new_data)
print("------------------------------------------------------")
print("SCORES\n")
labels = db.labels_
s = silhouette_score(new_data,labels)
# The captions no longer mention "100 Clusters": that was copied from the
# K-Means cell, while the number of clusters here is whatever DBSCAN finds.
print("Silhouette Score is: " + str(s))
d = davies_bouldin_score(new_data,labels)
print("Davies Bouldin Score is: " + str(d))

# The label -1 marks noise and is not counted as a cluster
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print("There are " + str(n_clusters_) + " different clusters")

# `db.components_` holds one row per core sample, not one per cluster, so it
# cannot be indexed by cluster id. Use the mean of each cluster's members as
# its centre when looking for the closest group.
centroids = [new_data[labels == k].mean(axis=0) for k in range(n_clusters_)]

print(len(db.components_))
for i in range(n_clusters_):
    print("\n\nGroup " + str(i))
    print("Closest Group: " + str(GetClosest(centroids,i)) + "\n")
    # The reader had its outliers deleted too, so positions line up directly
    pp.pprint(data_reader.GetLineGroup(GetGroup(labels,i,offset=0)))