In [2]:
#Import packages
import pandas as pd
import numpy as np
import glob
import re
import string
import codecs

In [3]:
#Create single parliament_qs dataframe with all question data
li = []

for file in glob.glob('./Parliament_Qs/rajyasabha_questions_and_answers_*.csv'):
    data = pd.read_csv(file)
    li.append(data)
    
parliament_qs = pd.concat(li, axis = 0, ignore_index = True)

In [9]:
#Code borrowed and adapted from George Chen, Carnegie Mellon University#
#Define function to remove punctuation and whitespace, and lowercase all text
def makeWordList(str_object):
    
    corpus_text = str(str_object)
    
    for c in string.punctuation:
        corpus_text = corpus_text.replace(c, "")  # -- (1)
    
    text = re.sub(r'\S*\d\S*','',corpus_text) # -- (2)
    text = re.sub(r'[^\w\s]','',text)         # -- (3)
    
    text = text.lower().split()           # -- (4)         
    
    li = []
    for token in text:
        li.append(token)

    return " ".join(li)

In [None]:
#Process the questions
processed_questions = []

for str_object in list(parliament_qs["question_description"]):
    processed_questions.append(makeWordList(str_object))

In [None]:
#Process responses
processed_answers = []

for str_object in list(parliament_qs["answer"]):
    processed_answers.append(makeWordList(str_object))

In [None]:
#Use TfidfVectorizer to transform parliamentary questions
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=200, stop_words="english", max_df=0.8)
questions_fit = vectorizer.fit(processed_questions)
X_questions = vectorizer.fit_transform(processed_questions).toarray()

In [None]:
#!Time-consuming!#
#Create topics using LDA
num_topics = 10

from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=0)
lda.fit(X_questions)

In [None]:
#Display top 10 words from each topic
words = list(questions_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda.components_])
num_top_words = 10

print('Displaying the top 10 words per topic and their probabilities within the topic...')
print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for rank in range(num_top_words):
        word_idx = sort_indices[rank]
        print(words[word_idx], ':', topic_word_distributions[topic_idx, word_idx])
    print()

In [None]:
#Use TfidfVectorizer to transform parliamentary answers
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=200, stop_words="english", max_df=0.8)
answers_fit = vectorizer.fit(processed_answers)
X_answers = vectorizer.fit_transform(processed_answers).toarray()

In [None]:
#!Time-consuming!#
#Create topics using LDA
num_topics = 10

from sklearn.decomposition import LatentDirichletAllocation
lda_answers = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=0)
lda_answers.fit(X_answers)

In [None]:
#Display top 10 words from each topic
words = list(answers_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda_answers.components_])
num_top_words = 10

print('Displaying the top 10 words per topic and their probabilities within the topic...')
print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for rank in range(num_top_words):
        word_idx = sort_indices[rank]
        print(words[word_idx], ':', topic_word_distributions[topic_idx, word_idx])
    print()

In [None]:
lda_answers.components_

In [None]:
#Print religious word occurances
words = list(answers_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda_answers.components_])
res = [words.index(religious_word) for religious_word in religious_vocab]

#print('Displaying the top 10 words per topic and their probabilities within the topic...')
#print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for i in res:
        print(words[i], ':', topic_word_distributions[topic_idx, i])
    print()

# Headline Analysis

In [5]:
li = []

for file in glob.glob('./india_headlines_data/*.csv'):
    data = pd.read_csv(file)
    li.append(data)
    
headlines = pd.concat(li, axis = 0, ignore_index = True)

In [6]:
#Process headlines, delete headlines object, sample 10% of processed headlines
import random
processed_headlines = []
random.seed(42)
headlines = random.sample(list(headlines["headline_text"]), round(len(headlines)/10))
                          
for str_object in headlines:
    processed_headlines.append(makeWordList(str_object))
del headlines

In [7]:
#Use TfidfVectorizer to transform headlines
##Memory intensive##
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=200, stop_words="english", max_df=0.8)
headlines_fit = vectorizer.fit(processed_headlines)
X_headlines = vectorizer.fit_transform(processed_headlines).toarray()

In [8]:
num_topics = 10

from sklearn.decomposition import LatentDirichletAllocation
lda_headlines = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=0)
lda_headlines.fit(X_headlines)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [9]:
words = list(headlines_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda_headlines.components_])
num_top_words = 10

print('Displaying the top 10 words per topic and their probabilities within the topic...')
print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for rank in range(num_top_words):
        word_idx = sort_indices[rank]
        print(words[word_idx], ':', topic_word_distributions[topic_idx, word_idx])
    print()

Displaying the top 10 words per topic and their probabilities within the topic...

[Topic 0]
labour : 0.03860330906379886
mission : 0.03374031671009788
course : 0.032462592063320285
cheating : 0.02674135554586314
filed : 0.02570318762380012
education : 0.01826181535798846
cop : 0.017738767759230903
scare : 0.014018263958041628
chhattisgarh : 0.013574345666579408
bar : 0.013280687501206163

[Topic 1]
bollywood : 0.05652672305509811
mc : 0.020877611782112865
chinese : 0.01951978015539641
cell : 0.01862620140313285
fuel : 0.018525507570669772
common : 0.0182746225720331
fears : 0.018235411934387834
caught : 0.018131542869644003
kochi : 0.017012086919733094
trouble : 0.016547194568394756

[Topic 2]
jharkhand : 0.055251359928895634
years : 0.03599075911791327
nepal : 0.033688461281122864
extended : 0.02997444049795326
fun : 0.022083075481717694
improve : 0.021661434481104202
small : 0.02118834296957338
long : 0.02072460553727788
asks : 0.020401371771194214
boycott : 0.0155555559639116

[Top

In [1]:
religious_vocab = ['religion', 'religious', 'hindu', 'hinduism',
                  'islam', 'muslim', 'christianity', 'christian', 'sikh',
                  'sikhism', 'temple', 'mosque', 'church', 'divine', 'god', 'gods',
                  'prayer', 'prayers', 'priest', 'clergy', 'imam', 'monk', 'dharma',
                  'vedas', 'worship', 'worshippers', 'worshipers' 'worshipper', 'worshiper', 'ayodhya',
                   'babri', 'hindutva','lynching','ethnic', 'purity','nationalism', 'nationalist',
                   'RSS', 'Sangh'
                  ]

In [10]:
#Print religious word occurances
words = list(headlines_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda_headlines.components_])

res = []
for religious_word in religious_vocab:
    if religious_word in words:
        res.append(words.index(religious_word))
#res = [words.index(religious_word) for religious_word in religious_vocab]

#print('Displaying the top 10 words per topic and their probabilities within the topic...')
#print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for i in res:
        print(words[i], ':', topic_word_distributions[topic_idx, i])
    print()

[Topic 0]
hindu : 2.2126267961109092e-06
muslim : 2.2126632504544267e-06
sikh : 2.212680231943285e-06
temple : 0.0035887591966975636
ayodhya : 2.2126736874920206e-06

[Topic 1]
hindu : 2.2161415766330643e-06
muslim : 0.004624529335485363
sikh : 2.2161928735832024e-06
temple : 2.216194719511279e-06
ayodhya : 2.216174563435766e-06

[Topic 2]
hindu : 2.2083110791637973e-06
muslim : 2.208398920276732e-06
sikh : 2.208434508756428e-06
temple : 2.20835559935458e-06
ayodhya : 2.2083081329043313e-06

[Topic 3]
hindu : 2.3340599834284005e-06
muslim : 2.334167342718934e-06
sikh : 2.3340928214179743e-06
temple : 2.334053802147635e-06
ayodhya : 2.334051079747267e-06

[Topic 4]
hindu : 0.005796666027080499
muslim : 2.3100642700551904e-06
sikh : 2.3100408635802277e-06
temple : 2.3100521151896305e-06
ayodhya : 0.0045679768233735825

[Topic 5]
hindu : 2.2040054997802598e-06
muslim : 2.2040872018022866e-06
sikh : 2.2040642514366356e-06
temple : 2.204035097352963e-06
ayodhya : 2.204007464315639e-06

[Top

In [5]:
li = []

for file in glob.glob('./india_headlines_data_pre/religious*.csv'):
    data = pd.read_csv(file)
    li.append(data)
    
religious_headlines_pre = pd.concat(li, axis = 0, ignore_index = True)

li = []

for file in glob.glob('./india_headlines_data_post/religious*.csv'):
    data = pd.read_csv(file)
    li.append(data)
    
religious_headlines_post = pd.concat(li, axis = 0, ignore_index = True)

In [10]:
#Process religious headlines
processed_religious_headlines_pre = []

for str_object in list(religious_headlines_pre["headline_text"]):
    processed_religious_headlines_pre.append(makeWordList(str_object))

In [11]:
#Use TfidfVectorizer to transform religious headlines
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=100, stop_words="english", max_df=0.8)
rel_headlines_pre_fit = vectorizer.fit(processed_religious_headlines_pre)
X_rel_headlines_pre = vectorizer.fit_transform(processed_religious_headlines_pre).toarray()

In [12]:
num_topics = 10

from sklearn.decomposition import LatentDirichletAllocation
lda_rel_headlines_pre = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=0)
lda_rel_headlines_pre.fit(X_rel_headlines_pre)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [14]:
words = list(rel_headlines_pre_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda_rel_headlines_pre.components_])
num_top_words = 10

print('Displaying the top 10 words per topic and their probabilities within the topic...')
print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for rank in range(num_top_words):
        word_idx = sort_indices[rank]
        print(words[word_idx], ':', topic_word_distributions[topic_idx, word_idx])
    print()

Displaying the top 10 words per topic and their probabilities within the topic...

[Topic 0]
gets : 0.3325878908343633
vhp : 0.1410741139730405
land : 0.09764653418812733
rain : 0.07304272493430007
women : 0.06716153659901956
polls : 0.06155488657925728
high : 0.06152229777309942
churches : 0.04688881030527306
hindus : 0.04190856687229721
offer : 0.04053579900361241

[Topic 1]
monk : 0.09765610426785298
babri : 0.08782965075859098
gujarat : 0.08167533787612587
centre : 0.07756836684415125
rss : 0.0736208801311144
day : 0.07045636329665927
pakistan : 0.06295241149101867
body : 0.05652299472458706
modi : 0.056406034055761056
today : 0.053136787481863235

[Topic 2]
body : 0.24650077884660038
leader : 0.19598466160003897
demand : 0.07515014473444322
prayer : 0.07069654233678867
court : 0.06340698526049049
set : 0.0614349715701121
christian : 0.05403263248965024
new : 0.044979946185935954
party : 0.038278258138742534
state : 0.03787685008210023

[Topic 3]
asks : 0.31918242596499513
meet : 0