## Global constants

In [5]:
from os import getenv
env = getenv('DATASET_DIR')

NEWSGROUP_HOME = env if env is not None else '../datasets/'

## Necessary imports

In [6]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.nmf import Nmf
from gensim.corpora.dictionary import Dictionary

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

from sklearn.datasets import fetch_20newsgroups

## Download 20 newsgroups dataset

In [7]:
train = fetch_20newsgroups(subset='train', data_home=NEWSGROUP_HOME)
test = fetch_20newsgroups(subset='test', data_home=NEWSGROUP_HOME)

In [8]:
print(train['data'][35], sep="\n")

From: dchhabra@stpl.ists.ca (Deepak Chhabra)
Subject: Re: Goalie masks
Nntp-Posting-Host: stpl.ists.ca
Organization: Solar Terresterial Physics Laboratory, ISTS
Lines: 21

In article <120666@netnews.upenn.edu> kkeller@mail.sas.upenn.edu (Keith Keller) writes:
>My vote goes to John Vanbiesbrouck.  His mask has a skyline of New York
>City, and on the sides there are a bunch of bees (Beezer).  It looks
>really sharp.

Funny you should mention this; one time on HNIC Don Cherry pointed out
Vanbiesbrouck's mask.  He _hated_ it.  I think he said something to the effect
of:
"You see?  He was great last year; now he goes out and gets that dopey mask 
and he can't stop a beachball!"

You may or may not take Cherry seriously at all, but I cracked up when I heard
it.

I think Ed Belfour has the current best mask in the NHL btw.  I also like
Moog's, and I'll give Fuhr's new one an honourable mention, although I haven't
seen it closely yet (it looked good from a distance!).  What's also neat is
Chev

## Add additional stopwords

In [9]:
nlp = spacy.load('en_core_web_sm', disable=["tagger", "parser", "ner"])
stop_list = [
    "subject",
    "from",
    "/",
    "(",
    ")",
    ":",
    "re",
    "nntp",
    "posting",
    "host",
    "lines",
    "write",
    "organization",
    "keyword",
    "distribution",
    "news",
    "software",
    "university",
    "like",
    "think",
    "+",
    "$",
    "s",
    ">",
    "<",
    "C",
    "year",
    "|",
    "=",
    "nt",
    "o",
    "article",
    
]
nlp.Defaults.stop_words.update(stop_list)
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

## Filter out emails and other irrelevant stuff from the texts

In [10]:
def remove_stopwords(doc):
    doc = [token.lemma_ for token in doc
           if not (token.is_stop or
                   token.is_punct or
                   token.like_email or
                   token.like_url or
                   token.is_space or
                   token.like_num or
                   token.lemma_.lower() in stop_list)]
    return doc

nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [11]:
# print(*train['data'][:15])

## Process texts, make a dictionary and a corpus

In [12]:
import re

# doc_lst = []

for i, sent in enumerate(train['data']):
    sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
    sent = re.sub('\s+', ' ', sent)  # remove newline chars
    sent = re.sub("\'", "", sent)  # remove single quotes
    # sent = gensim.utils.simple_preprocess(str(sent), deacc=True)
    # doc_lst.append(sent)
    train['data'][i] = sent

In [13]:
doc_lst = list(nlp.pipe(train['data']))  # Limit number of entries for quicker analysis
dictionary = Dictionary(doc_lst)
corpus = [dictionary.doc2bow(doc) for doc in doc_lst]

In [14]:
print(*doc_lst[:3], sep="\n")

['thing', 'car', 'Maryland', 'College', 'Park', 'wonder', 'enlighten', 'car', 'see', 'day', '2-door', 'sport', 'car', 'look', 'late', '60s/', 'early', '70s', 'call', 'Bricklin', 'door', 'small', 'addition', 'bumper', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'engine', 'spec', 'production', 'car', 'history', 'info', 'funky', 'look', 'car', 'e', 'mail', 'Thanks', 'IL', 'bring', 'neighborhood', 'Lerxst']
['Guy', 'Kuo', 'SI', 'Clock', 'Poll', 'Final', 'Summary', 'Final', 'SI', 'clock', 'report', 'Keywords', 'SI', 'acceleration', 'clock', 'upgrade', 'I.D.', 'shelley.1qvfo9INNc3s', 'Washington', 'fair', 'numb', 'brave', 'soul', 'upgrade', 'SI', 'clock', 'oscillator', 'share', 'experience', 'poll', 'send', 'brief', 'message', 'detail', 'experience', 'procedure', 'speed', 'attain', 'CPU', 'rate', 'speed', 'add', 'card', 'adapter', 'heat', 'sink', 'hour', 'usage', 'day', 'floppy', 'disk', 'functionality', 'be', 'floppy', 'especially', 'request', 'summarize', 'day', 'add', 'network'

## Fit NMF model

In [15]:
nmf = Nmf(corpus=corpus, id2word=dictionary,
          num_topics=20, normalize=True,
          random_state=2)

In [16]:
nmf.print_topics()

[(0,
  '0.006*"File" + 0.006*"program" + 0.006*"file" + 0.005*"drive" + 0.005*"work" + 0.005*"use" + 0.004*"image" + 0.004*"have" + 0.004*"time" + 0.003*"launch"'),
 (1,
  '0.015*"Armenian" + 0.014*"Turkish" + 0.009*"Jew" + 0.009*"people" + 0.005*"Turkey" + 0.004*"say" + 0.004*"Turk" + 0.004*"book" + 0.003*"come" + 0.003*"Soviet"'),
 (2,
  '0.011*"say" + 0.008*"know" + 0.007*"go" + 0.006*"people" + 0.006*"come" + 0.006*"available" + 0.005*"tell" + 0.004*"time" + 0.004*"include" + 0.004*"Armenian"'),
 (3,
  '0.011*"`" + 0.008*"DOS" + 0.007*"Armenian" + 0.006*"know" + 0.006*"say" + 0.006*"people" + 0.006*"go" + 0.003*"come" + 0.003*"want" + 0.003*"time"'),
 (4,
  '0.832*"AX" + 0.059*"MAX" + 0.007*"G)R" + 0.004*"G9V" + 0.002*"GIZ" + 0.001*"M" + 0.001*"G" + 0.001*"T" + 0.001*"MR" + 0.000*"`"'),
 (5,
  '0.033*"`" + 0.011*"anonymous" + 0.009*"privacy" + 0.009*"internet" + 0.009*"use" + 0.008*"email" + 0.007*"information" + 0.007*"user" + 0.007*"file" + 0.007*"system"'),
 (6,
  '0.014*"File" 

## Wordcloud visualization

In [4]:
%%pixie_debugger

# Something here constantly kills the kernel…
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop_list,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = nmf.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=40)
    plt.gca().imshow(cloud, interpolation='bilinear')
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
#plt.show()

UsageError: Cell magic `%%pixie_debugger` not found.


## Topic coherence metric value

In [17]:
coherence_nmf = CoherenceModel(model=nmf, texts=doc_lst, dictionary=dictionary, coherence='c_v')
coherence = coherence_nmf.get_coherence()
coherence

0.5314442072480839