# TOPIC MODEL

## Imports

In [2]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.utils import simple_preprocess # converts a document into a list of tokens
from gensim.parsing.preprocessing import STOPWORDS # list of stop words
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk 
import pandas as pd

nltk.download('wordnet')

## Clearing and getting data ready

In [4]:
# load the data

data = pd.read_csv('india-news-headlines.csv', error_bad_lines=False)


In [9]:
# Retrieving only the Headlines text column for the data

data_text = data[['headline_text']]

data_text['index'] = data_text.index

data_text.head(10)

documents = data_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [12]:
print(len(documents))
print(documents[:10])

3424067
                                       headline_text  index
0  Status quo will not be disturbed at Ayodhya; s...      0
1                Fissures in Hurriyat over Pak visit      1
2              America's unwanted heading for India?      2
3                 For bigwigs; it is destination Goa      3
4               Extra buses to clear tourist traffic      4
5        Dilute the power of transfers; says Riberio      5
6                  Focus shifts to teaching of Hindi      6
7               IT will become compulsory in schools      7
8      Move to stop freedom fighters' pension flayed      8
9  Gilani claims he applied for passport 2 years ago      9


## Before we continue we need to handle few things:

* Tokenization: Splitting documents into sentences and words.
* Removing stop words like (the, a, an, in)
* Stemming: reducing words to their root form

In [13]:
# setting up the stemmer

stemmer = SnowballStemmer("english")


# returns the root value of the word

def lemmatize_stemming(word):    
    return stemmer.stem(WordNetLemmatizer().lemmatize(word,pos='v'))

# pull out the stopwords from the sentences

def preprocess(text):
    result = []
    for token in simple_preprocess(text):

        if (token not in STOPWORDS) and (len(token) > 3):

            result.append(lemmatize_stemming(token))

    return result


### Sample demonstration of the above functions

In [79]:
doc_sample = documents[documents['index'] == 6].values[0][0]
print("RAW:\n",doc_sample)

print('\n\noriginal document: ')
words = []
for word in doc_sample.split(' '): words.append(word)
print(words)
print("\n\nTokenized and lemmatized doc: ")
print(preprocess(doc_sample))

RAW:
 Focus shifts to teaching of Hindi


original document: 
['Focus', 'shifts', 'to', 'teaching', 'of', 'Hindi']


Tokenized and lemmatized doc: 
['focus', 'shift', 'teach', 'hindi']


# Apply process to the every sentence

In [37]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs

0                   [status, disturb, ayodhya, say, vajpaye]
1                                  [fissur, hurriyat, visit]
2                             [america, unwant, head, india]
3                                           [bigwig, destin]
4                      [extra, bus, clear, tourist, traffic]
5                     [dilut, power, transfer, say, riberio]
6                               [focus, shift, teach, hindi]
7                                       [compulsori, school]
8                    [stop, freedom, fighter, pension, flay]
9                     [gilani, claim, appli, passport, year]
10                                  [parivar, dismiss, warn]
11                             [india, exchang, list, plant]
12                             [qureshi, return, help, govt]
13                              [tacit, messag, tampl, hold]
14                             [text, prime, minist, articl]
15                                   [focus, violenc, women]
16                      

In [38]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k,v)
    count += 1
    if count > 10:
        break

0 ayodhya
1 disturb
2 say
3 status
4 vajpaye
5 fissur
6 hurriyat
7 visit
8 america
9 head
10 india


In [40]:
# filter words that appear under 15 docs and keep the most 10k frequent ones

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=10000)

In [45]:
#now let see what the doc number 6 looks like
processed_docs[6]

['focus', 'shift', 'teach', 'hindi']

In [51]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[10]

[(37, 1), (38, 1), (39, 1)]

In [78]:
bow_doc_6 = bow_corpus[6]
for i in range(len(bow_doc_6)): print("Word {} (\"{}\") appears {} time.".format(bow_doc_6[i][0],dictionary[bow_doc_6[i][0]], bow_doc_6[i][1]))

print(dictionary[bow_doc_6[3][0]])


Word 21 ("focus") appears 1 time.
Word 22 ("hindi") appears 1 time.
Word 23 ("shift") appears 1 time.
Word 24 ("teach") appears 1 time.
teach


[(21, 1), (22, 1), (23, 1), (24, 1)]

# We can run the model now

In [72]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [73]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx,topic))


Topic: 0 
Words: 0.028*"bihar" + 0.021*"lakh" + 0.018*"bengaluru" + 0.013*"hold" + 0.012*"seiz" + 0.011*"celebr" + 0.011*"girl" + 0.011*"special" + 0.011*"plea" + 0.010*"polic"
Topic: 1 
Words: 0.026*"rajasthan" + 0.025*"crore" + 0.023*"plan" + 0.021*"state" + 0.021*"help" + 0.015*"post" + 0.014*"near" + 0.013*"record" + 0.013*"punjab" + 0.011*"bengal"
Topic: 2 
Words: 0.037*"covid" + 0.025*"home" + 0.025*"hospit" + 0.024*"pune" + 0.021*"patient" + 0.021*"student" + 0.015*"onlin" + 0.015*"doctor" + 0.014*"colleg" + 0.014*"univers"
Topic: 3 
Words: 0.027*"india" + 0.022*"kolkata" + 0.022*"day" + 0.020*"hyderabad" + 0.016*"open" + 0.015*"test" + 0.015*"covid" + 0.012*"drive" + 0.012*"give" + 0.012*"come"
Topic: 4 
Words: 0.053*"year" + 0.027*"woman" + 0.027*"kill" + 0.022*"book" + 0.021*"mumbai" + 0.016*"die" + 0.015*"death" + 0.013*"life" + 0.012*"rain" + 0.010*"like"
Topic: 5 
Words: 0.021*"polic" + 0.018*"bodi" + 0.018*"congress" + 0.017*"poll" + 0.015*"meet" + 0.014*"karnataka" + 0.0

In [77]:
for index, score in sorted(lda_model[bow_doc_6], key= lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 5)))


Score: 0.41999998688697815	 
Topic: 0.050*"case" + 0.030*"covid" + 0.028*"cop" + 0.025*"arrest" + 0.021*"hold"

Score: 0.2199999988079071	 
Topic: 0.053*"year" + 0.027*"woman" + 0.027*"kill" + 0.022*"book" + 0.021*"mumbai"

Score: 0.2199999988079071	 
Topic: 0.022*"maharashtra" + 0.022*"death" + 0.018*"worker" + 0.018*"get" + 0.017*"gujarat"

Score: 0.019999999552965164	 
Topic: 0.028*"bihar" + 0.021*"lakh" + 0.018*"bengaluru" + 0.013*"hold" + 0.012*"seiz"

Score: 0.019999999552965164	 
Topic: 0.026*"rajasthan" + 0.025*"crore" + 0.023*"plan" + 0.021*"state" + 0.021*"help"

Score: 0.019999999552965164	 
Topic: 0.037*"covid" + 0.025*"home" + 0.025*"hospit" + 0.024*"pune" + 0.021*"patient"

Score: 0.019999999552965164	 
Topic: 0.027*"india" + 0.022*"kolkata" + 0.022*"day" + 0.020*"hyderabad" + 0.016*"open"

Score: 0.019999999552965164	 
Topic: 0.021*"polic" + 0.018*"bodi" + 0.018*"congress" + 0.017*"poll" + 0.015*"meet"

Score: 0.019999999552965164	 
Topic: 0.038*"say" + 0.020*"tamil" + 

In [152]:
text1 = "There is a breaktrough in developing vaccine."
text2 = "There are changes in policies punishing murderers to prevent violence."
unseen_document = text2
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key= lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 7)))


Score: 0.15714287757873535	 
Topic: 0.021*"polic" + 0.018*"bodi" + 0.018*"congress" + 0.017*"poll" + 0.015*"meet" + 0.014*"karnataka" + 0.013*"elect"

Score: 0.15714286267757416	 
Topic: 0.026*"rajasthan" + 0.025*"crore" + 0.023*"plan" + 0.021*"state" + 0.021*"help" + 0.015*"post" + 0.014*"near"

Score: 0.15714286267757416	 
Topic: 0.038*"say" + 0.020*"tamil" + 0.019*"farmer" + 0.019*"water" + 0.019*"lockdown" + 0.018*"nadu" + 0.018*"high"

Score: 0.15714286267757416	 
Topic: 0.022*"maharashtra" + 0.022*"death" + 0.018*"worker" + 0.018*"get" + 0.017*"gujarat" + 0.016*"school" + 0.016*"famili"

Score: 0.15714284777641296	 
Topic: 0.037*"covid" + 0.025*"home" + 0.025*"hospit" + 0.024*"pune" + 0.021*"patient" + 0.021*"student" + 0.015*"onlin"

Score: 0.1571378856897354	 
Topic: 0.050*"case" + 0.030*"covid" + 0.028*"cop" + 0.025*"arrest" + 0.021*"hold" + 0.018*"posit" + 0.018*"telangana"

Score: 0.014290666207671165	 
Topic: 0.053*"year" + 0.027*"woman" + 0.027*"kill" + 0.022*"book" + 0.0

# Comments

As we have clearly seen the topic choice for unseen document "text2" and "text1" is correctly decided respectively to be the "polic" and "covid", which indicates that our model works fine. Now we can try to determine the topic of an unseen paragraph to further test our model. 

Paragraph:

- "Government! You can't live with it! You can't live without it! It is the "common cold" that everyone dreads. The American Heritage College Dictionary, Third Edition defines government as, "The exercise of authority in a political unit in order to control and administer public policy." Webster's Desk Dictionary of the English Language defines government as, "The political direction and control exercised over a nation, state, community, etc." The common individual might define government as the root of all evil. The thing about government is that no one stops to think about how government came about."

In [159]:
paragraph1 = """
Government! You can't live with it! You can't live without it! It is the "common cold" that everyone dreads. The American Heritage College Dictionary, Third Edition defines government as, "The exercise of authority in a political unit in order to control and administer public policy." Webster's Desk Dictionary of the English Language defines government as, "The political direction and control exercised over a nation, state, community, etc." The common individual might define government as the root of all evil. The thing about government is that no one stops to think about how government came about.
Government falls into two categories; monarchy or a republic. A monarchy is a form of government that is always headed by a…show more content…
King Fahad has complete control over it's citizens in all aspects pertaining to their country. Laws, punishments, and regulations are in the hands of King Fahad. 
If the government of a country does not fall into any of these categories, it is a republic. A republic is defined in Webster's Desk Dictionary of the English Language as any government in which the supreme power rests in the body of citizens entitled to vote and is exercised by representatives chosen directly or indirectly by them. There are three types of republics; dictatorship, oligarchy, or democracy. 
"""
bow_vector_para = dictionary.doc2bow(preprocess(paragraph1))
for index, score in sorted(lda_model[bow_vector_para], key= lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 7)))


Score: 0.21826614439487457	 
Topic: 0.034*"govern" + 0.028*"time" + 0.026*"road" + 0.022*"start" + 0.016*"pradesh" + 0.015*"return" + 0.011*"khan"

Score: 0.14241258800029755	 
Topic: 0.028*"bihar" + 0.021*"lakh" + 0.018*"bengaluru" + 0.013*"hold" + 0.012*"seiz" + 0.011*"celebr" + 0.011*"girl"

Score: 0.1255302131175995	 
Topic: 0.038*"say" + 0.020*"tamil" + 0.019*"farmer" + 0.019*"water" + 0.019*"lockdown" + 0.018*"nadu" + 0.018*"high"

Score: 0.11843184381723404	 
Topic: 0.037*"covid" + 0.025*"home" + 0.025*"hospit" + 0.024*"pune" + 0.021*"patient" + 0.021*"student" + 0.015*"onlin"

Score: 0.11246328800916672	 
Topic: 0.021*"polic" + 0.018*"bodi" + 0.018*"congress" + 0.017*"poll" + 0.015*"meet" + 0.014*"karnataka" + 0.013*"elect"

Score: 0.10346248000860214	 
Topic: 0.053*"year" + 0.027*"woman" + 0.027*"kill" + 0.022*"book" + 0.021*"mumbai" + 0.016*"die" + 0.015*"death"

Score: 0.06489361822605133	 
Topic: 0.050*"case" + 0.030*"covid" + 0.028*"cop" + 0.025*"arrest" + 0.021*"hold" + 

# Results

We can clearly see that even newspaper headline fed model can distinguish a text about government correctly. Better data can help model work more precise.