In [None]:
import numpy as np
import pandas as pd
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import seaborn as sns
%matplotlib inline

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
corpus = {
    'id': [],
    'date': [],
    'doc': []
}
with open('./cnnhealth.txt', mode='r') as f:
    for line in f.readlines():
        data = line.rstrip().split('|')
        corpus['id'].append(data[0])
        corpus['date'].append(data[1])
        corpus['doc'].append(data[2])
    f.close()
        

### Text Preprocessing

In [None]:
def cleanDoc(doc) -> str:
    newDoc = doc.lower().rstrip()
    newDoc = re.sub(r'\s(https|http)[:/\d\w.]*', '', newDoc) # Remove links
    newDoc = re.sub(r'\s(@|#)[\w\d_]+', '', newDoc) # Remove Tags etc
    return newDoc

In [None]:
cleanDocs = [cleanDoc(x) for x in corpus['doc']]

In [60]:
filterWords = ['us']
def getLemmatizedSentences(doc):
    return ' '.join([lemmatizer.lemmatize(x) if x not in filterWords else x for x in word_tokenize(doc) if x not in string.punctuation])
    

In [63]:
lemmatizedSentences = (getLemmatizedSentences(x) for x in cleanDocs)

### TF-IDF - Sparse word matrix

In [73]:
tfidf = TfidfVectorizer(stop_words='english')

In [65]:
X = tfidf.fit_transform(lemmatizedSentences)

In [68]:
K = 2

In [69]:
model = KMeans(n_clusters=K, init='k-means++', max_iter=100, n_init=1)

In [70]:
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=2, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [71]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names()
tfidf.strip_accents

In [72]:
for i in range(K):
    print(f'Cluster {i}:'),
    for ind in order_centroids[i, :10]:
        print(f' {terms[ind]}')

Cluster 0:
 rt
 say
 today
 tip
 new
 health
 kid
 know
 help
 brain
Cluster 1:
 html
 index
 hpt
 loss
 cancer
 he_c1
 weight
 health
 rt
 nation
