In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import FunctionTransformer

from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


from time import time

In [2]:
from sklearn.pipeline import Pipeline

In [3]:
df_test = pd.read_csv('data/ufo.csv')

text = df_test.iloc[:, 5]
text

0         Light seen over mountain's east of Camp McGre...
1         Flying saucer descends, possibly lands in Nor...
2         While letting my dog out, a very bright white...
3         Orange round sphere.  Orange glowing sphere f...
4         Flying corkscrews  Looking to th east at abou...
                               ...                        
11926     reddish orange triangular pattern of lights. ...
11927     6 bright red glowing spheres.  Six red sphere...
11928     In daylight, oval object traveled across sky ...
11929     Triangular craft cary nc  On Sturdivant, saw ...
11930     I've been watching videos on Youtube then, I ...
Name: 4, Length: 11931, dtype: object

 [pipelinize source](https://evisionindia.wordpress.com/2020/03/06/setting-up-text-preprocessing-pipeline-using-scikit-learn-and-spacy-learn-how-to-tokenize-lemmatize-remove-stop-words-and-punctuation-with-sklearn-pipelines/)

In [5]:
from string import *

class PorterTokenizer:
     def __init__(self):
            
            self.ps = PorterStemmer()
            self.lower      = lower
            self.strip      = strip
            self.stopwords  = stopwords or set(sw.words('english'))
            self.punct      = punct or set(string.punctuation)
            self.lemmatizer = WordNetLemmatizer()
     def __call__(self, doc):
         return [self.ps.stem(t) for t in word_tokenize(doc)]

In [4]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk import PorterStemmer

In [8]:
from sklearn.decomposition import NMF

In [6]:
n_components = 10
n_features=10000

In [7]:
vect = TfidfVectorizer(stop_words='english', max_features=n_features
                       #, tokenizer=PorterTokenizer()
                      )
words = vect.fit_transform(text)
V = words.toarray()
V.shape

(11931, 10000)

In [9]:
nmf = NMF(n_components=n_components)
nmf.fit(V)
W = nmf.transform(V)
H = nmf.components_

nmf.reconstruction_err_

102.98245606833696

In [10]:
H.shape

(10, 10000)

In [11]:
feature_names = vect.get_feature_names()


In [12]:
index_val = np.argsort(H)[:, -1:-11:-1]

for i, lat_feat in enumerate(index_val):
    print('%d: %s'%(i+1,', '.join([feature_names[n] for n in lat_feat])))


1: saw, like, just, looked, went, time, house, look, ufo, got
2: information, provides, anonymous, elects, remain, pd, nuforc, note, contact, totally
3: lights, formation, triangle, blinking, white, moving, flashing, line, red, minutes
4: light, bright, white, sky, blue, flash, appeared, disappeared, seconds, missile
5: orange, east, west, disappeared, south, north, orbs, fireball, ball, glowing
6: object, shaped, appeared, sky, approximately, observed, white, shape, high, large
7: craft, flying, shaped, low, triangle, sound, flew, triangular, black, aircraft
8: red, green, blue, flashing, white, sky, hovering, colors, hovered, orb
9: star, moving, sky, like, fast, looked, stars, shooting, bright, moved
10: objects, flying, formation, sky, appeared, approximately, 10, observed, direction, time


In [13]:
print("Performing dimensionality reduction using LSA")
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(words)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()

Performing dimensionality reduction using LSA
done in 0.158139s
Explained variance of the SVD step: 6%



In [14]:
km = KMeans(n_clusters=10, init='k-means++', max_iter=100, n_init=1,
            verbose=False)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=10, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=False)
done in 0.043s



NameError: name 'labels' is not defined

In [None]:
km.

In [None]:
print("Top terms per cluster:")
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
#     else:
#         order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(n_features):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()