In [None]:
#Import packages
import pandas as pd
import numpy as np
import glob
import re
import string
import codecs
import spacy
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from dateutil.parser import parse
from datetime import datetime
from matplotlib_venn import venn2

In [None]:
#Code borrowed and adapted from George Chen, Carnegie Mellon University#
#Define function to remove punctuation and whitespace, and lowercase all text
def makeWordList(str_object):
    """Normalize a piece of text into a lowercase, space-separated string of words.

    Processing steps, in order:
      1. delete every ASCII punctuation character (``string.punctuation``);
      2. delete every whitespace-delimited token that contains a digit;
      3. delete any remaining character that is neither a word character
         nor whitespace;
      4. lowercase, split on whitespace, and re-join with single spaces.

    Parameters
    ----------
    str_object : object
        The text to clean. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as "no text".

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing or nothing
        survives the cleaning.
    """
    # Missing values would otherwise be stringified into the words
    # "none" / "nan" and end up as tokens. NaN is the only float that is
    # not equal to itself.
    if str_object is None or (isinstance(str_object, float) and str_object != str_object):
        return ""

    corpus_text = str(str_object)

    # (1) Strip ASCII punctuation in a single pass. Characters are deleted,
    # not replaced by a space, so "foo-bar" becomes "foobar".
    corpus_text = corpus_text.translate(str.maketrans("", "", string.punctuation))

    # (2) Drop whole tokens that contain at least one digit.
    text = re.sub(r'\S*\d\S*', '', corpus_text)
    # (3) Drop anything left that is not a word character or whitespace.
    text = re.sub(r'[^\w\s]', '', text)

    # (4) Lowercase and collapse all runs of whitespace to single spaces.
    return " ".join(text.lower().split())

# Data Pre-Processing

In [None]:
#Create single parliament_qs dataframe with all question data
# glob.glob returns paths in a filesystem-dependent order, so sort them:
# the row order of parliament_qs matters wherever the answers are later
# taken by position, and should be the same on every machine and run.
parliament_files = sorted(glob.glob('./Parliament_Qs/rajyasabha_questions_and_answers_*.csv'))
if not parliament_files:
    # Fail with a message that names the pattern rather than pandas'
    # generic "No objects to concatenate".
    raise FileNotFoundError("No files match ./Parliament_Qs/rajyasabha_questions_and_answers_*.csv")

# Read every file and stack the rows under a fresh 0..n-1 index.
parliament_qs = pd.concat([pd.read_csv(path) for path in parliament_files], axis = 0, ignore_index = True)

In [None]:
#Process parliamentary answers
# Clean each answer with makeWordList; the result keeps the row order of parliament_qs.
processed_answers = [makeWordList(answer) for answer in parliament_qs["answer"]]

In [None]:
#Import headlines dataset
# glob.glob returns paths in a filesystem-dependent order, so sort them:
# the seeded random sample drawn from headlines_raw is only reproducible
# if the rows are always concatenated in the same order.
headline_files = sorted(glob.glob('./india_headlines_data/*.csv'))
if not headline_files:
    # Fail with a message that names the pattern rather than pandas'
    # generic "No objects to concatenate".
    raise FileNotFoundError("No files match ./india_headlines_data/*.csv")

# Read every file and stack the rows under a fresh 0..n-1 index.
headlines_raw = pd.concat([pd.read_csv(path) for path in headline_files], axis = 0, ignore_index = True)

In [None]:
#Draw a 10% random sample of the raw headlines (fixed seed), then process the sampled headlines
import random
random.seed(42)

# Sample size is one tenth of the number of rows, rounded to an integer.
all_headline_texts = list(headlines_raw["headline_text"])
sample_size = round(len(headlines_raw)/10)
headlines = random.sample(all_headline_texts, sample_size)

# Clean every sampled headline with makeWordList.
processed_headlines = [makeWordList(headline) for headline in headlines]

In [None]:
#Import religious headline pre- and post-election datasets
# glob.glob returns paths in a filesystem-dependent order; sorting keeps the
# row order of both frames (and everything fitted on them) reproducible.
religious_pre_files = sorted(glob.glob('./india_headlines_data_pre/religious*.csv'))
religious_post_files = sorted(glob.glob('./india_headlines_data_post/religious*.csv'))
if not religious_pre_files:
    raise FileNotFoundError("No files match ./india_headlines_data_pre/religious*.csv")
if not religious_post_files:
    raise FileNotFoundError("No files match ./india_headlines_data_post/religious*.csv")

# Read every file of each period and stack the rows under a fresh 0..n-1 index.
religious_headlines_pre = pd.concat([pd.read_csv(path) for path in religious_pre_files], axis = 0, ignore_index = True)
religious_headlines_post = pd.concat([pd.read_csv(path) for path in religious_post_files], axis = 0, ignore_index = True)

In [None]:
#Process religious headlines
# Clean the headline text of each period with makeWordList, keeping row order.
processed_religious_headlines_pre = [
    makeWordList(headline) for headline in religious_headlines_pre["headline_text"]
]
processed_religious_headlines_post = [
    makeWordList(headline) for headline in religious_headlines_post["headline_text"]
]

# Dataset Description and Exploration

In [None]:
#Process Parliament Q&A dataset, aggregating by month and producing a plot of the date distribution
answer_dates = pd.DataFrame(pd.to_datetime(parliament_qs["answer_date"]))

# Keep only the rows that actually carry a date. The name `datetime` in this
# notebook is the class (from datetime import datetime), so `datetime.date`
# is a method and cannot be used as an isinstance() target; dropping the
# missing values (NaT) expresses the same filter without it.
valid_answer_dates = answer_dates["answer_date"].dropna().reset_index(drop=True)

# Column 0 holds the date, column 1 a constant 1 that is counted per month.
final_answer_dates = pd.DataFrame({0: valid_answer_dates, 1: 1})

plt.figure(figsize=(20,10))
agg = final_answer_dates.resample('M', on=0).count()
plt.plot(agg, c='blue')
# Red vertical line marks the election reference date used throughout.
election = parse("2014-05-01")
plt.axvline(x=election, c='red')
plt.title("Monthly Parliamentary Questions", size=22)
plt.xlabel("Time", size = 18)
plt.ylabel("Quantity", size = 18)

In [None]:
#Process headlines dataset, aggregating by month and producing datetime plot
# publish_date is parsed with the explicit "%Y%m%d" layout, in one vectorised
# call. No loop variable is named `datetime`, so the imported datetime class
# is not overwritten for the rest of the notebook.
headline_dates = pd.DataFrame(
    pd.to_datetime(headlines_raw["publish_date"].astype(str), format="%Y%m%d")
)

# Column 0 holds the date, column 1 a constant 1 that is counted per month,
# the same layout the parliamentary-answers plot uses, so the monthly totals
# do not rely on the date column itself being counted.
final_headline_dates = pd.DataFrame({0: headline_dates["publish_date"].reset_index(drop=True), 1: 1})
#plot
plt.figure(figsize=(12,6))
agg = final_headline_dates.resample('M', on=0).count()
plt.plot(agg, c='blue')
# Red vertical line marks the election reference date used throughout.
election = parse("2014-05-01")
plt.axvline(x=election, c='red')
plt.title("Monthly Headlines Published", size=22)
plt.xlabel("Time", size = 18)
plt.ylabel("Quantity", size = 18)

# Sentiment Analysis on Religion Datasets

# Topic Modeling - Religious Headlines

In [None]:
#Use TfidfVectorizer to transform religious headlines
from sklearn.feature_extraction.text import TfidfVectorizer

# One vectorizer per period, each learning its own vocabulary. A term must
# appear in at least 100 headlines and in at most 80% of them to be kept.
vectorizer_pre = TfidfVectorizer(min_df=100, stop_words="english", max_df=0.8)
# fit_transform both fits the vectorizer and returns the TF-IDF matrix, so no
# separate fit() call is needed. The pre-election matrix is built with
# vectorizer_pre (the previous code referred to an undefined `vectorizer`).
X_rel_headlines_pre = vectorizer_pre.fit_transform(processed_religious_headlines_pre).toarray()
# Alias of the fitted vectorizer, kept because later cells read its vocabulary_.
rel_headlines_pre_fit = vectorizer_pre

vectorizer_post = TfidfVectorizer(min_df=100, stop_words="english", max_df=0.8)
X_rel_headlines_post = vectorizer_post.fit_transform(processed_religious_headlines_post).toarray()
rel_headlines_post_fit = vectorizer_post

In [None]:
#Generate separate topic models for pre and post
num_topics = 10

from sklearn.decomposition import LatentDirichletAllocation

# Both periods use identical model settings; each model is fitted on its own TF-IDF matrix.
lda_settings = dict(n_components=num_topics, learning_method='online', random_state=0)
lda_rel_headlines_pre = LatentDirichletAllocation(**lda_settings)
lda_rel_headlines_post = LatentDirichletAllocation(**lda_settings)

lda_rel_headlines_pre.fit(X_rel_headlines_pre)
lda_rel_headlines_post.fit(X_rel_headlines_post)

In [None]:
#Display top words for topics for pre- and post-election topics
num_top_words = 10

# (header line, fitted vectorizer, fitted LDA model) for each period
period_models = (
    ('Displaying the top 10 words per topic and their probabilities within the topic for pre-election relgious headlines',
     rel_headlines_pre_fit, lda_rel_headlines_pre),
    ('Displaying the top 10 words per topic and their probabilities within the topic for post-election relgious headlines',
     rel_headlines_post_fit, lda_rel_headlines_post),
)

for header, fitted_vectorizer, lda_model in period_models:
    # vocabulary_ maps each term to its column index in the TF-IDF matrix.
    # Iterating the dict yields terms in insertion order, NOT column order,
    # so order the terms by their index before looking them up by column.
    vocabulary = fitted_vectorizer.vocabulary_
    words = sorted(vocabulary, key=vocabulary.get)

    # Normalise every topic row so that it sums to 1.
    components = lda_model.components_
    topic_word_distributions = components / components.sum(axis=1, keepdims=True)

    # Never ask for more words than the vocabulary holds.
    top_n = min(num_top_words, len(words))

    print(header)
    print()

    for topic_idx, topic_distribution in enumerate(topic_word_distributions):
        print('[Topic ', topic_idx, ']', sep='')
        # Column indices from most to least probable within this topic.
        sort_indices = np.argsort(-topic_distribution)
        for word_idx in sort_indices[:top_n]:
            print(words[word_idx], ':', topic_distribution[word_idx])
        print()

# Topic Modeling - Headlines

In [None]:
#Use TfidfVectorizer to transform headlines
##Memory intensive##
from sklearn.feature_extraction.text import TfidfVectorizer

# A term must appear in at least 200 headlines and in at most 80% of them.
headline_vectorizer = TfidfVectorizer(min_df=200, stop_words="english", max_df=0.8)
# fit_transform fits the vectorizer and builds the matrix in a single pass
# over the corpus; a separate fit() beforehand only repeats that work.
X_headlines = headline_vectorizer.fit_transform(processed_headlines).toarray()
# Alias of the fitted vectorizer, kept because later cells read its vocabulary_.
headlines_fit = headline_vectorizer
# NOTE(review): .toarray() densifies the whole matrix and is the memory-heavy
# step; it is kept so X_headlines stays a dense array for later cells.

In [None]:
#Generate 10 topics for headlines
num_topics = 10

from sklearn.decomposition import LatentDirichletAllocation

# Online-learning LDA with a fixed random_state, fitted on the headline TF-IDF matrix.
headline_lda_settings = dict(n_components=num_topics, learning_method='online', random_state=0)
lda_headlines = LatentDirichletAllocation(**headline_lda_settings)
lda_headlines.fit(X_headlines)

In [None]:
#View top 20 words for each topic in order to characterize
# vocabulary_ maps each term to its column index in the TF-IDF matrix.
# Iterating the dict yields terms in insertion order, NOT column order, so
# order the terms by their index before looking them up by column.
headline_vocabulary = headlines_fit.vocabulary_
words = sorted(headline_vocabulary, key=headline_vocabulary.get)

# Normalise every topic row so that it sums to 1.
headline_components = lda_headlines.components_
topic_word_distributions = headline_components / headline_components.sum(axis=1, keepdims=True)
num_top_words = 20
# Never ask for more words than the vocabulary holds.
top_n = min(num_top_words, len(words))

print('Displaying the top 20 words per topic and their probabilities within the topic...')
print()

for topic_idx, topic_distribution in enumerate(topic_word_distributions):
    print('[Topic ', topic_idx, ']', sep='')
    # Column indices from most to least probable within this topic.
    sort_indices = np.argsort(-topic_distribution)
    for word_idx in sort_indices[:top_n]:
        print(words[word_idx], ':', topic_distribution[word_idx])
    print()

# Applying Headlines Topics to Parliament Data

In [None]:
#Remove words in parliament data that don't exist within headlines data
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'tagger'])
nlp.max_length = 10000000
# NOTE(review): the tagger is disabled but token.lemma_ is used below; confirm
# the lemmas are still as expected with the installed spaCy version.
# NOTE(review): the headline vocabulary was built from makeWordList output
# (no lemmatisation), while answers are matched on lemmas here - confirm
# this mismatch is intended.

headline_vocabulary = headlines_fit.vocabulary_
# Tokens made only of ASCII letters qualify (same pattern as before, compiled once).
letters_only = re.compile('[a-zA-Z]+$')

processed_answers_for_headline_topics = []
out_of_topics_mask = []   # per answer: number of alphabetic tokens NOT in the headline vocabulary
in_topics_mask = []       # per answer: number of alphabetic tokens in the headline vocabulary

for raw_answer in np.array(parliament_qs["answer"]):
    # Lowercased lemmas of all purely alphabetic tokens of this answer.
    lemmas = [
        tok.lemma_.lower()
        for tok in nlp(str(raw_answer))
        if letters_only.match(tok.orth_)
    ]
    # Keep only the lemmas the headline vectorizer knows about.
    kept_lemmas = [lemma for lemma in lemmas if lemma in headline_vocabulary]

    processed_answers_for_headline_topics.append(" ".join(kept_lemmas))
    out_of_topics_mask.append(len(lemmas) - len(kept_lemmas))
    in_topics_mask.append(len(kept_lemmas))

In [None]:
#Transform the parliamentary answers using the vectorizer used for the headlines
# Only the first 10000 processed answers are vectorised, then converted to a dense array.
answers_subset = processed_answers_for_headline_topics[:10000]
X_answers_for_headline_topics = headline_vectorizer.transform(answers_subset).toarray()

In [None]:
#Apply the headline topic model to the vectorised parliamentary answers
# One row per answer, one column per headline topic.
answers_distribution_of_headline_topics = lda_headlines.transform(
    X=X_answers_for_headline_topics
)

In [None]:
#Sum amount explained per topic, normalize, and plot
# Column sums of the answer-by-topic matrix give each topic's total weight;
# dividing by the grand total (computed once) turns them into shares that
# sum to 1.
topic_totals = np.asarray(answers_distribution_of_headline_topics).sum(axis=0)
sum_explained_per_topic = topic_totals.tolist()
per_explained_per_topic = (topic_totals / topic_totals.sum()).tolist()
#plot
# Bar positions follow the number of topics actually present rather than a
# hard-coded 10.
plt.bar(range(len(per_explained_per_topic)), per_explained_per_topic, facecolor='blue', alpha=0.5)
plt.xlabel('Topic')
plt.ylabel('Percent Explained')
# Hand-assigned names for the headline topics, in topic-index order
# (the entry at index 7 is intentionally left blank in the original).
labs = ('Growth', 'Crisis', "Local Gov't", 'Investment', 'Crime', 'Development', 'Financial', '', 'Infrastructure', 'Politics')
plt.xticks(np.arange(len(labs)), labs, color='orange', rotation=60, fontweight='bold', fontsize='17', horizontalalignment='right')
plt.title('Amount of Parliamentary Answers Explained by Each Headline Topic')
plt.show()

In [None]:
#Visualize the amount of words unique to each dataset and shared among the datasets
from sklearn.feature_extraction.text import TfidfVectorizer

# answers_fit is not created by any earlier cell, so build it here from the
# cleaned parliamentary answers.
# NOTE(review): the vectorizer settings below copy the headline vectorizer's
# (min_df=200, stop_words="english", max_df=0.8) so both vocabularies are
# filtered the same way - confirm these are the intended settings.
answers_fit = TfidfVectorizer(min_df=200, stop_words="english", max_df=0.8).fit(processed_answers)

# Compare the two vocabularies as sets of terms.
answer_terms = set(answers_fit.vocabulary_)
headline_terms = set(headlines_fit.vocabulary_)

in_both = len(answer_terms & headline_terms)
in_answers_only = len(answer_terms - headline_terms)
in_headlines_only = len(headline_terms - answer_terms)
print(in_answers_only, in_both, in_headlines_only)

# venn2 expects the sizes as (left only, right only, intersection).
venn2(subsets = (in_answers_only, in_headlines_only, in_both), set_labels = ("Unique Words in Answers", "Unique Words in Headlines"),
      set_colors=('purple', 'skyblue'), alpha = 0.7);

In [None]:
#Plot topic distribution for the headlines dataset itself to establish baseline
# Draw up to 10000 distinct headline rows with a dedicated, seeded generator
# so the baseline is the same on every run. The sample size is capped at the
# number of rows, since sampling without replacement cannot exceed it.
baseline_rng = np.random.RandomState(42)
n_headline_rows = X_headlines.shape[0]
sample_rows = baseline_rng.choice(n_headline_rows, min(10000, n_headline_rows), replace=False)
X_headlines_sample = X_headlines[sample_rows]
headline_topic_distribution = lda_headlines.transform(X_headlines_sample)

# Column sums give each topic's total weight; dividing by the grand total
# (computed once) turns them into shares that sum to 1.
topic_totals = np.asarray(headline_topic_distribution).sum(axis=0)
sum_explained_per_topic = topic_totals.tolist()
per_explained_per_topic = (topic_totals / topic_totals.sum()).tolist()

# Bar positions follow the number of topics actually present rather than a
# hard-coded 10.
plt.bar(range(len(per_explained_per_topic)), per_explained_per_topic, facecolor='blue', alpha=0.5)
plt.xlabel('Topic')
plt.ylabel('Percent Explained')
plt.title('Headline Topics')
plt.show()