In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [2]:
documents = ["google plus",
             "google mail",
             "Google Translate app",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

In [3]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

In [4]:
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=2, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [5]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

Top terms per cluster:
Cluster 0:
 cat
 best
 photo
 ninja
 climbing
 translate
 google
 app
 chrome
 extension
Cluster 1:
 google
 plus
 mail
 translate
 app
 map
 feedback
 impressed
 key
 promoter


In [6]:
print("\n")
print("Prediction")
 
Y = vectorizer.transform(["google to open."])
prediction = model.predict(Y)
print(prediction)
 



Prediction
[1]


In [7]:
Y = vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)

[0]


In [8]:
import nltk
from nltk.corpus import brown

In [9]:
print(brown.words()[0:10])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']


In [10]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [11]:
s = "The quick brown fox jumped over the lazy dog"
print(s)


The quick brown fox jumped over the lazy dog


In [13]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

articleURL = "http://curia.europa.eu/juris/document/document.jsf?text=&docid=139407&pageIndex=0&doclang=EN&mode=lst&dir=&occ=first&part=1&cid=52454"

def getText(url):
    page = urlopen(url).read().decode('utf8', 'ignore')
    soup = BeautifulSoup(page, 'lxml')
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    return text.encode('ascii', errors='replace').decode().replace("?","")

text = getText(articleURL)

FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?

In [2]:
import nltk
# nltk.download('punkt')
# nltk.download()
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import defaultdict
from string import punctuation
from heapq import nlargest

def summarize(text, n):
    sents = sent_tokenize(text)
    
    assert n <= len(sents)
    wordSent = word_tokenize(text.lower())
    stopWords = set(stopwords.words('english')+list(punctuation))
    
    wordSent= [word for word in wordSent if word not in stopWords]
    freq = FreqDist(wordSent)

    ranking = defaultdict(int)
    
    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i] += freq[w]

    sentsIDX = nlargest(n, ranking, key=ranking.get)
    return [sents[j] for j in sorted(sentsIDX)]

summaryArr = summarize(text, 10)
# summaryArr

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words='english')
X = vectorizer.fit_transform(summaryArr)
km = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 100, n_init = 1, verbose = True)
km.fit(X)
np.unique(km.labels_, return_counts=True)

Initialization complete
Iteration  0, inertia 8.520
Iteration  1, inertia 4.633
Iteration  2, inertia 4.228
Converged at iteration 2: center shift 0.000000e+00 within tolerance 7.798248e-07


(array([0, 1, 2]), array([4, 2, 4], dtype=int64))

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words='english')
X = vectorizer.fit_transform(summaryArr)
km = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 100, n_init = 1, verbose = True)
km.fit(X)
np.unique(km.labels_, return_counts=True)

text={}
for i,cluster in enumerate(km.labels_):
    oneDocument = summaryArr[i]
    if cluster not in text.keys():
        text[cluster] = oneDocument
    else:
        text[cluster] += oneDocument

stopWords = set(stopwords.words('english')+list(punctuation))
keywords = {}
counts={}

for cluster in range(3):
    word_sent = word_tokenize(text[cluster].lower())
    word_sent=[word for word in word_sent if word not in stopWords]
    freq = FreqDist(word_sent)
    keywords[cluster] = nlargest(100, freq, key=freq.get)
    counts[cluster]=freq

uniqueKeys={}
for cluster in range(3):   
    other_clusters=list(set(range(3))-set([cluster]))
    keys_other_clusters=set(keywords[other_clusters[0]]).union(set(keywords[other_clusters[1]]))
    unique=set(keywords[cluster])-keys_other_clusters
    uniqueKeys[cluster]=nlargest(10, unique, key=counts[cluster].get)

print(uniqueKeys)

Initialization complete
Iteration  0, inertia 7.176
Iteration  1, inertia 4.194
Converged at iteration 1: center shift 0.000000e+00 within tolerance 7.798248e-07
{0: ['cultural', 'social', 'chamber', 'intended', 'finance', 'gmbh', 'amazon.com', 'institutions', 'excluded', 'received'], 1: ['presumption', 'establishment', 'means', 'clearly', 'rebuttable', 'devices', 'determining', 'digital', 'referred', 'equipment'], 2: ['society', 'reimbursement', 'national', 'applies', 'collecting', 'levies', '1', 'answered', 'remuneration', 'law']}
