In [1]:
#Import packages
import pandas as pd
import numpy as np
import glob
import re
import string
import codecs

In [4]:
#Create single parliament_qs dataframe with all question data
li = []

for file in glob.glob('./Parliament_Qs/rajyasabha_questions_and_answers_*.csv'):
    data = pd.read_csv(file)
    li.append(data)
    
parliament_qs = pd.concat(li, axis = 0, ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [2]:
#Code borrowed and adapted from George Chen, Carnegie Mellon University#
#Define function to remove punctuation and whitespace, and lowercase all text
def makeWordList(str_object):
    
    corpus_text = str(str_object)
    
    for c in string.punctuation:
        corpus_text = corpus_text.replace(c, "")  # -- (1)
    
    text = re.sub(r'\S*\d\S*','',corpus_text) # -- (2)
    text = re.sub(r'[^\w\s]','',text)         # -- (3)
    
    text = text.lower().split()           # -- (4)         
    
    li = []
    for token in text:
        li.append(token)

    return " ".join(li)

In [42]:
#Process the questions
processed_questions = []

for str_object in list(parliament_qs["question_description"]):
    processed_questions.append(makeWordList(str_object))

In [5]:
#Process responses
processed_answers = []

for str_object in list(parliament_qs["answer"]):
    processed_answers.append(makeWordList(str_object))

In [44]:
#Use TfidfVectorizer to transform parliamentary questions
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=200, stop_words="english", max_df=0.8)
questions_fit = vectorizer.fit(processed_questions)
X_questions = vectorizer.fit_transform(processed_questions).toarray()

In [None]:
#!Time-consuming!#
#Create topics using LDA
num_topics = 10

from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=0)
lda.fit(X_questions)

In [None]:
#Display top 10 words from each topic
words = list(questions_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda.components_])
num_top_words = 10

print('Displaying the top 10 words per topic and their probabilities within the topic...')
print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for rank in range(num_top_words):
        word_idx = sort_indices[rank]
        print(words[word_idx], ':', topic_word_distributions[topic_idx, word_idx])
    print()

In [6]:
#Use TfidfVectorizer to transform parliamentary answers
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=200, stop_words="english", max_df=0.8)
answers_fit = vectorizer.fit(processed_answers)
X_answers = vectorizer.fit_transform(processed_answers).toarray()

In [7]:
#!Time-consuming!#
#Create topics using LDA
num_topics = 1

from sklearn.decomposition import LatentDirichletAllocation
lda_answers = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=0)
lda_answers.fit(X_answers)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=1, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [8]:
#Display top 10 words from each topic
words = list(answers_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda_answers.components_])
num_top_words = 10

print('Displaying the top 10 words per topic and their probabilities within the topic...')
print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for rank in range(num_top_words):
        word_idx = sort_indices[rank]
        print(words[word_idx], ':', topic_word_distributions[topic_idx, word_idx])
    print()

Displaying the top 10 words per topic and their probabilities within the topic...

[Topic 0]
load : 0.005080913309441476
putting : 0.004528057922306874
combat : 0.00337049316574227
tenders : 0.0032544913669188755
pointed : 0.0030894443827827107
input : 0.0030116696759138707
right : 0.002530522656239383
manohar : 0.002518004393962288
matters : 0.0024367377372698845
cashless : 0.0024249497925367225



In [None]:
lda_answers.components_

In [13]:
#Print religious word occurances
words = list(answers_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda_answers.components_])
res = []
for religious_word in religious_vocab:
    if religious_word in words:
        res.append(words.index(religious_word))
#res = [words.index(religious_word) for religious_word in religious_vocab]

#print('Displaying the top 10 words per topic and their probabilities within the topic...')
#print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for i in res:
        print(words[i], ':', topic_word_distributions[topic_idx, i])
    print()

[Topic 0]
religious : 9.899338423836855e-05
muslim : 0.00013176865515226625
temple : 0.00014931508323882134



# Headline Analysis

In [26]:
li = []

for file in glob.glob('./india_headlines_data/*.csv'):
    data = pd.read_csv(file)
    li.append(data)
    
headlines = pd.concat(li, axis = 0, ignore_index = True)

In [16]:
#Process headlines, delete headlines object, sample 10% of processed headlines
import random
processed_headlines = []
random.seed(42)
headlines = random.sample(list(headlines["headline_text"]), round(len(headlines)/10))
                          
for str_object in headlines:
    processed_headlines.append(makeWordList(str_object))
del headlines

In [17]:
#Use TfidfVectorizer to transform headlines
##Memory intensive##
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=200, stop_words="english", max_df=0.8)
headlines_fit = vectorizer.fit(processed_headlines)
X_headlines = vectorizer.fit_transform(processed_headlines).toarray()

In [18]:
num_topics = 1

from sklearn.decomposition import LatentDirichletAllocation
lda_headlines = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=0)
lda_headlines.fit(X_headlines)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=1, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [19]:
words = list(headlines_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda_headlines.components_])
num_top_words = 10

print('Displaying the top 10 words per topic and their probabilities within the topic...')
print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for rank in range(num_top_words):
        word_idx = sort_indices[rank]
        print(words[word_idx], ':', topic_word_distributions[topic_idx, word_idx])
    print()

Displaying the top 10 words per topic and their probabilities within the topic...

[Topic 0]
jharkhand : 0.0065510867561133865
buildings : 0.006426498052493539
bollywood : 0.005626572128150678
scheme : 0.005334077461047131
nepal : 0.005098348272174378
mission : 0.00466473557659236
courts : 0.004617878634466528
turn : 0.004236775877263603
diesel : 0.004174504895286561
release : 0.0041646727805514806



In [22]:
religious_vocab = ['religion', 'religious', 'hindu', 'hinduism',
                  'islam', 'muslim', 'christianity', 'christian', 'sikh',
                  'sikhism', 'temple', 'mosque', 'church', 'divine', 'god', 'gods',
                  'prayer', 'prayers', 'priest', 'clergy', 'imam', 'monk', 'dharma',
                  'vedas', 'worship', 'worshippers', 'worshipers' 'worshipper', 'worshiper',
                   'purity', 'nationalism', 'cleansing', 'ethnic'
                  ]

In [23]:
#Print religious word occurances
words = list(headlines_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda_headlines.components_])

res = []
for religious_word in religious_vocab:
    if religious_word in words:
        res.append(words.index(religious_word))
#res = [words.index(religious_word) for religious_word in religious_vocab]

#print('Displaying the top 10 words per topic and their probabilities within the topic...')
#print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for i in res:
        print(words[i], ':', topic_word_distributions[topic_idx, i])
    print()

[Topic 0]
hindu : 0.0005553395375425462
muslim : 0.0004621399519305284
sikh : 0.0009201208716710891
temple : 0.00035964578241502596

