Ref: - https://towardsdatascience.com/word2vec-for-phrases-learning-embeddings-for-more-than-one-word-727b6cf723cf
https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html

In [108]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from tqdm import tqdm
import spacy
from gensim import corpora, models
import pandas as pd
import sys
sys.path.append('..')
from utils import preprocess, get_windows
from nltk.tokenize.treebank import TreebankWordDetokenizer
import os


In [109]:
nlp = spacy.load('en')

In [156]:
dataset = pd.read_csv('articles.csv')
docs = []
df_docs = dataset[dataset['text'].notnull()]
"""for row_val in df_docs['body']:
    val = str(row_val).split(',')
    docs.append(''.join(val))
    """
df_docs['text'] = df_docs['text'].str.replace('\n',' ')
docs = df_docs['text']
df_docs.head()


Unnamed: 0,text,Label
0,UK economy facing 'major risks' The UK manufa...,Business
1,Aids and climate top Davos agenda Climate cha...,Business
2,Asian quake hits European shares Shares in Eu...,Business
3,India power shares jump on debut Shares in In...,Business
4,Lacroix label bought by US firm Luxury goods ...,Business


In [157]:
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    #print(docs[idx])
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 2] for doc in docs]
#docs

In [158]:

from nltk.stem.wordnet import WordNetLemmatizer

# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [159]:
# Compute bigrams.

from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=10)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)
            
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

docs = [[token for token in doc if not token in stop_words] for doc in docs]

In [160]:
docs[0]

['economy',
 'facing',
 'major',
 'risk',
 'manufacturing',
 'sector',
 'continue',
 'face',
 'serious',
 'challenge',
 'next',
 'two',
 'year',
 'british',
 'chamber',
 'commerce',
 'bcc',
 'ha',
 'said',
 'group',
 'quarterly',
 'survey',
 'company',
 'found',
 'export',
 'picked',
 'last',
 'three',
 'month',
 'best',
 'level',
 'eight',
 'year',
 'rise',
 'came',
 'despite',
 'exchange',
 'rate',
 'cited',
 'major',
 'concern',
 'however',
 'bcc',
 'found',
 'whole',
 'economy',
 'still',
 'faced',
 'major',
 'risk',
 'warned',
 'growth',
 'set',
 'slow',
 'recently',
 'forecast',
 'economic',
 'growth',
 'slow',
 'little',
 'manufacturer',
 'domestic',
 'sale',
 'growth',
 'fell',
 'back',
 'slightly',
 'quarter',
 'survey',
 'firm',
 'found',
 'employment',
 'manufacturing',
 'also',
 'fell',
 'job',
 'expectation',
 'lowest',
 'level',
 'year',
 'despite',
 'positive',
 'news',
 'export',
 'sector',
 'worrying',
 'sign',
 'manufacturing',
 'bcc',
 'said',
 'result',
 'reinforce'

In [178]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=5, no_above=0.7)

In [179]:
# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [180]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 9032
Number of documents: 2225


In [181]:
# Train LDA model.

from gensim.models import LdaModel

# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 10
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
                       alpha='auto', eta='auto', num_topics=num_topics)

In [182]:
n_topics = 5
for i, topics in model.show_topics(n_topics, formatted=False):
    print('topic', i, ':', ' '.join([t for t, _ in topics]))

topic 0 : service people firm year company could software new system also
topic 1 : music technology people also game new would player year gadget
topic 2 : people year game would also one say new get service
topic 3 : game year nintendo console also sony first new like title
topic 4 : mobile people phone user could technology one new net year


In [185]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=docs):
    # Init output
    sent_topics_df = pd.DataFrame()
    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

df_topic_sents_keywords = format_topics_sentences(ldamodel=model, corpus=corpus, texts=docs)


In [186]:
df_topic_sents_keywords

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,2.0,0.4678,"people, year, game, would, also, one, say, new...","[economy, facing, major, risk, manufacturing, ..."
1,2.0,0.8095,"people, year, game, would, also, one, say, new...","[aid, climate, top, davos, agenda, climate, ch..."
2,0.0,0.7461,"service, people, firm, year, company, could, s...","[asian, quake, hit, european, share, share, eu..."
3,0.0,0.8840,"service, people, firm, year, company, could, s...","[india, power, share, jump, debut, share, indi..."
4,2.0,0.6290,"people, year, game, would, also, one, say, new...","[lacroix, label, bought, firm, luxury, good, g..."
5,0.0,0.7581,"service, people, firm, year, company, could, s...","[insurance, boss, plead, guilty, another, thre..."
6,0.0,0.7082,"service, people, firm, year, company, could, s...","[turkey, iran, mobile, deal, risk, turkey, inv..."
7,0.0,0.9344,"service, people, firm, year, company, could, s...","[parmalat, return, stockmarket, parmalat, ital..."
8,2.0,0.7551,"people, year, game, would, also, one, say, new...","[worldcom, director, admits, lying, former, ch..."
9,2.0,0.7813,"people, year, game, would, also, one, say, new...","[ebbers, denies, worldcom, fraud, former, worl..."
