In [10]:
import pandas as pd
import numpy as np

import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import FunctionTransformer

from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import NMF


from time import time

In [11]:
from sklearn.pipeline import Pipeline

In [12]:
df = pd.read_csv('data/ufo.csv')
df.head()

Unnamed: 0,Datetime,Location,Shape,Duration,Text,Notes
0,5/6/2017 05:00,"Camp McGregor, NM",Light,10 minute,Light seen over mountain's east of Camp McGre...,Report appears to us to be consistent with t...
1,5/5/2017 11:30,"Austin, TX",Disk,3 second,"Flying saucer descends, possibly lands in Nor...",We would like to communicate with this witne...
2,5/4/2017 21:27,"Phoenix, AZ",Circle,15 second,Orange round sphere. Orange glowing sphere f...,"We have amended the time above, to reflect a..."
3,5/4/2017 18:30,"Phoenix, AZ",Teardrop,5 minute,Flying corkscrews Looking to th east at abou...,Source of the report elects to remain anonymous
4,5/4/2017 04:50,"Taft, CA",Changing,20 second,I'm a truck driver and I've seen the reddish/...,"Witness indicates ""Taft, Indiana"" in origina..."


In [13]:
text_pre = df.iloc[:, 4]
text_pre

0        Light seen over mountain's east of Camp McGre...
1        Flying saucer descends, possibly lands in Nor...
2        Orange round sphere.  Orange glowing sphere f...
3        Flying corkscrews  Looking to th east at abou...
4        I'm a truck driver and I've seen the reddish/...
                              ...                        
2731     Brightly lit craft flew and hovered right in ...
2732     Triangle bright flashing white light, releasi...
2733     Light, orange, red, fast speed.  My gf and I ...
2734     Three circular, flashing UFO's moving in erra...
2735     Fireball came in from east and made sharp tur...
Name: Text, Length: 2736, dtype: object

In [14]:
for idx, doc in enumerate(df.iloc[:, 4]):
    #df.iloc[:, 5].str.replace('\d+', '') # for digits
    df.iloc[idx, 4] = doc.lower()

In [15]:
# Clean up data for NLP algorithm. Could also use df.apply(lambda x)
df.iloc[:, 4] = df.iloc[:, 4].str.replace('\d+', '') # for digits
df.iloc[:, 4] = df.iloc[:, 4].str.replace(r'(\b\w{1,2}\b)', '') # for words
#df.iloc[:, 5] = df.iloc[:, 5].str.replace('[^\w\s]', '') # for punctuation 
df.iloc[:, 4] = df.iloc[:, 4].str.replace(r'[^\w\s]+', '')
df.iloc[:, 4] = df.iloc[:, 4].str.lower()
#df.iloc[:, 5] = df.iloc[:, 5].str.replace(string.punctuation, '') # for punctuation 

In [16]:
text = df.iloc[:, 4]
text

0        light seen over mountain east  camp mcgregor ...
1        flying saucer descends possibly lands  north ...
2        orange round sphere  orange glowing sphere fl...
3        flying corkscrews  looking   east  about   sa...
4          truck driver and  seen the reddishorange ba...
                              ...                        
2731     brightly lit craft flew and hovered right  fr...
2732     triangle bright flashing white light releasin...
2733     light orange red fast speed    and  were look...
2734     three circular flashing ufo moving  erratic p...
2735     fireball came  from east and made sharp turn ...
Name: Text, Length: 2736, dtype: object

 [pipelinize source](https://evisionindia.wordpress.com/2020/03/06/setting-up-text-preprocessing-pipeline-using-scikit-learn-and-spacy-learn-how-to-tokenize-lemmatize-remove-stop-words-and-punctuation-with-sklearn-pipelines/)

In [17]:
# This both tokenizes and lemmatizes 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [18]:
n_components = 5
n_features=10000

In [19]:
vect = TfidfVectorizer(stop_words='english', max_features=n_features,
                       ngram_range=(1,2),
                       tokenizer=LemmaTokenizer(),
                       strip_accents = 'unicode', # works 
                       lowercase = True, # works
                       max_df = 0.5, # works
                       min_df = 10 # works
                       #, tokenizer=PorterTokenizer()
                      )
words = vect.fit_transform(text)
V = words.toarray()
V.shape

  'stop_words.' % sorted(inconsistent))


(2736, 3098)

## Topic Modeling with NMF

In [20]:
nmf = NMF(n_components=n_components)
nmf.fit(V)
W = nmf.transform(V)
H = nmf.components_

nmf.reconstruction_err_

50.17499044248322

In [21]:
H.shape

(5, 3098)

In [22]:
feature_names = vect.get_feature_names()
feature_names

['able',
 'abruptly',
 'absolutely',
 'absolutely sound',
 'accelerated',
 'according',
 'account',
 'accurate',
 'activity',
 'actual',
 'actually',
 'add',
 'addendum',
 'addendum nuforc',
 'addendum witness',
 'additional',
 'address',
 'advertising',
 'advertising light',
 'afb',
 'afraid',
 'afternoon',
 'age',
 'ago',
 'agree',
 'agreed',
 'ahead',
 'air',
 'air craft',
 'air force',
 'air nuforc',
 'air traffic',
 'air wa',
 'aircraft',
 'aircraft flying',
 'aircraft light',
 'aircraft seen',
 'aircraft sky',
 'aircraft wa',
 'airforce',
 'airline',
 'airliner',
 'airplane',
 'airplane helicopter',
 'airplane light',
 'airplane wa',
 'airport',
 'alien',
 'alleged',
 'alternating',
 'altitude',
 'altitude wa',
 'amateur',
 'amazed',
 'amazing',
 'amber',
 'amended',
 'amended date',
 'amended time',
 'amp',
 'angeles',
 'angle',
 'anomaly',
 'anonymous provides',
 'answer',
 'anymore',
 'apart',
 'apartment',
 'app',
 'apparent',
 'apparently',
 'appear',
 'appearance',
 'appear

In [23]:
index_val = np.argsort(H)[:, -1:-11:-1]

for i, lat_feat in enumerate(index_val):
    print('%d: %s'%(i+1,', '.join([feature_names[n] for n in lat_feat])))


1: like, saw, just, craft, date, looked, sighting, time, approximate, indicates
2: provides, information, contact information, provides contact, contact, anonymous provides, witness elect, red, orange, moving
3: launch, missile, missile launch, note navy, navy missile, navy, blue, bright, white, cloud
4: minute, minute later, minute nuforc, later, sky minute, watched, stationary, minute moved, went, bright
5: object, object wa, second, appeared, east, west, flying, moving, video, photo


## Topic Modeling with SVD

In [24]:
print("Performing dimensionality reduction using LSA")
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(words)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()

Performing dimensionality reduction using LSA
done in 0.030928s
Explained variance of the SVD step: 4%



In [25]:
km = KMeans(n_clusters=10, init='k-means++', max_iter=100, n_init=1,
            verbose=False)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
# print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
# print("Adjusted Rand-Index: %.3f"
#       % metrics.adjusted_rand_score(labels, km.labels_))
# print("Silhouette Coefficient: %0.3f"
#       % metrics.silhouette_score(X, km.labels_, sample_size=1000))

# print()

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=10, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=False)
done in 0.029s



In [26]:
print("Top terms per cluster:")
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
#     else:
#         order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vect.get_feature_names()
for i in range(1, n_components +1):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :5]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 1: provides information provides contact contact information contact
Cluster 2: object witness elect provides information contact information
Cluster 3: launch missile missile launch note navy navy missile
Cluster 4: like object saw bright craft
Cluster 5: minute bright object moving like
