In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.decomposition import NMF

documents = ["This little kitty came to play when I was eating at a restaurant.",
             "Merley has the best squooshy kitten belly.",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(documents)
print(vectorizer.get_feature_names())

['100', 'app', 'belly', 'best', 'came', 'cat', 'chrome', 'climbing', 'eating', 'extension', 'face', 'feedback', 'google', 'impressed', 'incredible', 'key', 'kitten', 'kitty', 'little', 'map', 'merley', 'ninja', 'open', 'photo', 'play', 'promoter', 'restaurant', 'smiley', 'squooshy', 'tab', 'taken', 'translate', 've']


In [2]:
n_topics = 2
nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)

In [4]:
W = nmf.fit_transform(tfidf)
W

array([[0.        , 0.        ],
       [0.        , 0.45217213],
       [0.55735742, 0.        ],
       [0.49414046, 0.        ],
       [0.        , 0.74849032],
       [0.        , 0.5964714 ],
       [0.55735742, 0.        ],
       [0.52368298, 0.        ]])

In [6]:
H = nmf.components_
H.shape

(2, 33)

In [9]:
import numpy as np

# tfidf - np.dot(W, H)

In [10]:
n_top_words = 10
feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

Topic #0:
google feedback map app impressed incredible translate key extension chrome
Topic #1:
cat best climbing ninja ve photo taken belly merley kitten


## LDA

In [23]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [44]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# no_features = 100
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 2

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1).fit(tfidf)
# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics).fit(tf)

no_top_words = 10
print('Topic Modelling with NMF:')
display_topics(nmf, tfidf_feature_names, no_top_words)
print(' ---- ')
print('Topic Modelling with LDA')
display_topics(lda, tf_feature_names, no_top_words)

Topic Modelling with NMF:
Topic 0:
google feedback map app impressed incredible translate key extension chrome
Topic 1:
cat best climbing ninja ve photo taken belly merley kitten
 ---- 
Topic Modelling with LDA
Topic 0:
google smiley translate restaurant tab promoter eating face feedback kitty
Topic 1:
cat best taken merley belly kitten squooshy ve ninja climbing




## Keras

In [46]:
from keras.preprocessing.text import Tokenizer
documents = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?']
tok = Tokenizer()
tok.fit_on_texts(documents)
mat_texts = tok.texts_to_matrix(documents, mode='tfidf')
print(mat_texts)
X = tok.texts_to_sequences(documents)
print(X)
print('Word Index:')
print(tok.word_index)
print('Word Counts:')
print(tok.word_counts)
print('Document Count:')
print(tok.document_count)
print('Words in Doc:')
print(tok.word_docs)

[[0.         0.58778666 0.58778666 0.58778666 0.69314718 0.84729786
  0.         0.         0.         0.        ]
 [0.         0.58778666 0.58778666 0.58778666 1.17360019 0.
  1.09861229 0.         0.         0.        ]
 [0.         0.58778666 0.58778666 0.58778666 0.         0.
  0.         1.09861229 1.09861229 1.09861229]
 [0.         0.58778666 0.58778666 0.58778666 0.69314718 0.84729786
  0.         0.         0.         0.        ]]
[[1, 2, 3, 5, 4], [1, 4, 2, 3, 6, 4], [7, 1, 2, 3, 8, 9], [2, 1, 3, 5, 4]]
Word Index:
{'this': 1, 'is': 2, 'the': 3, 'document': 4, 'first': 5, 'second': 6, 'and': 7, 'third': 8, 'one': 9}
Word Counts:
OrderedDict([('this', 4), ('is', 4), ('the', 4), ('first', 2), ('document', 4), ('second', 1), ('and', 1), ('third', 1), ('one', 1)])
Document Count:
4
Words in Doc:
defaultdict(<class 'int'>, {'is': 4, 'first': 2, 'this': 4, 'document': 3, 'the': 4, 'second': 1, 'one': 1, 'and': 1, 'third': 1})


## Spam Classification with Deep Learning

Text classification + Keras:

https://www.kaggle.com/psyhoo/spam-sms-neural-networks-in-keras

num_max: The entire vocabulary we have in corpus -> vocab_size

max_len: The maximum number of words per row (how many words a ham or spam email has) in corpus 

## Word2Vec Glove

In [None]:
def keyword_to_category_GloVe(keyword_list):
    dic = {}
    with codecs.open('glove.840B.300d.txt', 'r') as f:
    # with codecs.open('glove.6B.300d.txt', 'r', 'utf-8') as f:
        for c, r in enumerate(f):
            sr = r.split()
            # if sr[0] in keyword_list + category_list:
            if sr[0] in [i.encode() for i in keyword_list]:
                # print(sr[0])
                dic[sr[0]] = [float(i) for i in sr[1:]]
                # print(c)
                if len(dic) == len(keyword_list):
                    break
    category_list = pickle.load(open("category2.p", "rb"))
    category = {}
    for i in keyword_list:
        distance = []
        for j in category_list:
            distance.append([j, np.linalg.norm(np.array(dic[i])-np.array(category_list[j]))])
        di = [s[0] for s in distance]
        mi = [s[1] for s in distance]
        idx = mi.index(min(mi))
        category[i] = di[idx]
    return category

In [None]:
print(keyword_to_category_GloVe([u'runner', u'pizza', u'physics', u'adidas'])

## LDA: Which sentences is what document?

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle


# the dataset to predict on (first two samples were also in the training set so one can compare)
documents = ["This little kitty came to play when I was eating at a restaurant.",
             "Merley has the best squooshy kitten belly.",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]
# Vectorize the training set using the model features as vocabulary
tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(documents)

lda = LatentDirichletAllocation(n_topics=2).fit(tf)

# transform method returns a matrix with one line per document, columns being topics weight
predict = lda.transform(tf)
print(predict)

[[0.04624001 0.95375999]
 [0.06901926 0.93098074]
 [0.91056403 0.08943597]
 [0.0516827  0.9483173 ]
 [0.08450956 0.91549044]
 [0.85683624 0.14316376]
 [0.91108699 0.08891301]
 [0.92322557 0.07677443]]




In [6]:
m = tf_vectorizer.transform(['cat is kitty'])
lda.transform(m)

array([[0.62680148, 0.37319852]])