## Global constants

In [1]:
from os import getenv
# The dataset directory can be overridden through the DATASET_DIR environment
# variable; when it is not set, a relative default path is used instead.
env = getenv('DATASET_DIR')
NEWSGROUP_HOME = '../datasets/' if env is None else env

## Necessary imports

In [2]:
import numpy as np
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

from sklearn.datasets import fetch_20newsgroups

## Download 20 newsgroups dataset

In [3]:
# Fetch the 'train' and 'test' subsets (in that order), passing
# NEWSGROUP_HOME as the data directory for both calls.
train, test = (
    fetch_20newsgroups(subset=split, data_home=NEWSGROUP_HOME)
    for split in ('train', 'test')
)

In [4]:
# Load the 'en_core_web_sm' spaCy pipeline. The resulting `nlp` object is
# customised further below (extra stop words and a filtering component).
nlp = spacy.load('en_core_web_sm')

## Add additional stopwords

In [5]:
# Corpus-specific tokens to discard in addition to spaCy's stop words:
# header field names, stray symbols and a few very frequent words.
stop_list = [
    "subject", "from", "/", "(", ")", ":",
    "nntp", "posting", "host", "lines",
    "organization", "keywords", "distribution",
    "news", "software", "university",
    "$", "s", ">", "|", "=", "nt", "o",
    "article",
]

# Add the custom words to the pipeline's default stop-word set.
nlp.Defaults.stop_words.update(stop_list)

# Flag every word in STOP_WORDS as a stop word on its vocabulary entry.
# NOTE(review): this only covers the custom words above if STOP_WORDS is the
# same set object as nlp.Defaults.stop_words - confirm for the installed spaCy.
for word in STOP_WORDS:
    nlp.vocab[word].is_stop = True

## Filter out stop words, punctuation, emails, URLs, numbers and whitespace from the texts

In [6]:
def remove_stopwords(doc):
    """Return the lemmas of the tokens in ``doc`` that carry content.

    A token is dropped when it is flagged as a stop word, punctuation,
    an email address, a URL, whitespace or number-like, or when its
    lower-cased lemma appears in the module-level ``stop_list``.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``lemma_`` and the boolean flags checked below.

    Returns
    -------
    list of str
        Lemmas of the surviving tokens, in their original order.
    """
    def is_noise(token):
        # Flag checks come first, in the same order as before; the
        # stop_list lookup is only reached when every flag is falsy.
        if token.is_stop or token.is_punct:
            return True
        if token.like_email or token.like_url:
            return True
        if token.is_space or token.like_num:
            return True
        return token.lemma_.lower() in stop_list

    kept_lemmas = []
    for token in doc:
        if is_noise(token):
            continue
        kept_lemmas.append(token.lemma_)
    return kept_lemmas

# Register the filter under the name "stopwords" as the last pipeline step.
# NOTE(review): passing a bare callable to add_pipe looks like the spaCy v2
# API - confirm against the installed spaCy version.
# NOTE(review): remove_stopwords returns a plain list of lemmas, so calling
# nlp(text) presumably yields that list rather than a Doc - verify.
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [7]:
# print(*train['data'][:15])

## Process texts, make a dictionary and a corpus

In [8]:
import re

# Raw-string patterns: the previous non-raw literals contained the invalid
# escape sequences "\S" and "\s", which newer Python versions warn about.
# Compiling once also avoids re-parsing the pattern for every document.
EMAIL_RE = re.compile(r'\S*@\S*\s?')
WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Strip email-like tokens, collapse whitespace and drop single quotes.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = EMAIL_RE.sub('', text)        # remove emails
    text = WHITESPACE_RE.sub(' ', text)  # collapse newlines / runs of whitespace
    return text.replace("'", "")         # remove single quotes


# Clean the training texts in place so later cells keep using train['data'].
for i, sent in enumerate(train['data']):
    train['data'][i] = clean_text(sent)

In [9]:
# Run every training text through the customised pipeline, then build the
# gensim dictionary and the bag-of-words corpus from the pipeline outputs.
doc_lst = [nlp(text) for text in train['data']]
dictionary = Dictionary(doc_lst)
corpus = list(map(dictionary.doc2bow, doc_lst))

## Fit LDA model

In [11]:
# Training settings, kept apart from the data arguments so they are easy to
# spot and tune.
lda_params = dict(
    num_topics=10,
    random_state=2,        # fixed value so repeated runs use the same seed
    update_every=1,
    passes=10,
    alpha='auto',
    per_word_topics=True,  # checked by format_topics_sentences when reading results
)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_params)

In [12]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=train['data']):
    """Build a per-document table of the dominant LDA topic.

    Parameters
    ----------
    ldamodel : fitted LDA model
        Must support ``ldamodel[corpus]``, ``show_topic`` and expose
        ``per_word_topics``. Required; ``None`` is only a placeholder default.
    corpus : iterable of bag-of-words documents
        Documents to score. Defaults to the module-level ``corpus`` as it
        was when this function was defined.
    texts : sequence of str
        Original texts, positionally aligned with ``corpus``. Defaults to
        ``train['data']`` as it was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic``,
        ``Perc_Contribution`` and ``Topic_Keywords``, followed by an unnamed
        column (label ``0``) holding the text.

    Raises
    ------
    TypeError
        If ``ldamodel`` is ``None``.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a fitted `ldamodel`")

    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> "w1, w2, ..." so show_topic runs once per topic
    records = []

    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled each result is a tuple whose first
        # element is the list of (topic id, proportion) pairs.
        topics = row_list[0] if ldamodel.per_word_topics else row_list

        if not topics:
            # Keep a placeholder row so later rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), None))
            continue

        # Dominant topic = highest proportion; ties resolve to the first pair,
        # exactly as the previous stable descending sort did.
        topic_num, prop_topic = max(topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )

        records.append(
            (topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame in one go: DataFrame.append inside the loop was
    # quadratic and no longer exists in pandas 2.x.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Score every training document with the fitted model.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=corpus,
    texts=train['data'],
)

# Turn the row index into a regular column, then give all five columns
# readable names.
summary_columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = summary_columns
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,6.0,0.9825,"write, like, think, know, be, time, good, go, ...",From: (wheres my thing) Subject: WHAT car is t...
1,1,7.0,0.514,"drive, card, system, use, disk, problem, drive...",From: (Guy Kuo) Subject: SI Clock Poll - Final...
2,2,6.0,0.651,"write, like, think, know, be, time, good, go, ...",From: (Thomas E Willis) Subject: PB questions....
3,3,6.0,0.4982,"write, like, think, know, be, time, good, go, ...",From: (Joe Green) Subject: Re: Weitek P9000 ? ...
4,4,6.0,0.4985,"write, like, think, know, be, time, good, go, ...",From: (Jonathan McDowell) Subject: Re: Shuttle...
5,5,6.0,0.8068,"write, like, think, know, be, time, good, go, ...",From: (Foxvog Douglas) Subject: Re: Rewording ...
6,6,1.0,0.5739,"x, file, X, program, use, window, image, `, ve...",From: (brian manning delaney) Subject: Brain T...
7,7,7.0,0.9276,"drive, card, system, use, disk, problem, drive...",From: (GRUBB) Subject: Re: IDE vs SCSI Organiz...
8,8,1.0,0.4464,"x, file, X, program, use, window, image, `, ve...",From: Subject: WIn 3.0 ICON HELP PLEASE! Organ...
9,9,7.0,0.4253,"drive, card, system, use, disk, problem, drive...",From: (Stan Kerr) Subject: Re: Sigma Designs D...
