In [23]:
import pandas as pd
import numpy as np
import re
import glob
import spacy
from datetime import datetime
from dateutil.parser import parse

In [9]:
#Load the full headlines CSV from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer="india-news-headlines.csv")

In [4]:
#Split dataset for upload to GitHub
# The cut points are derived from the actual row count instead of being hard-coded,
# so a headlines file with more rows than the original is still split completely
# (hard-coded bounds would silently drop every row past the last bound).
n_rows = len(data)
n_parts = 5
# Integer round-half-up of i * n_rows / n_parts. For a 2,969,922-row frame this yields
# 0, 593984, 1187969, 1781953, 2375938, 2969922 -- exactly the original cut points.
split_bounds = [(2 * i * n_rows + n_parts) // (2 * n_parts) for i in range(n_parts + 1)]
india_news_headlines_1 = data.iloc[split_bounds[0]:split_bounds[1], :]
india_news_headlines_2 = data.iloc[split_bounds[1]:split_bounds[2], :]
india_news_headlines_3 = data.iloc[split_bounds[2]:split_bounds[3], :]
india_news_headlines_4 = data.iloc[split_bounds[3]:split_bounds[4], :]
india_news_headlines_5 = data.iloc[split_bounds[4]:split_bounds[5], :]

In [5]:
#Write each split chunk to its own csv file (without the index column) for upload
split_outputs = {
    "india-news-headlines-1.csv": india_news_headlines_1,
    "india-news-headlines-2.csv": india_news_headlines_2,
    "india-news-headlines-3.csv": india_news_headlines_3,
    "india-news-headlines-4.csv": india_news_headlines_4,
    "india-news-headlines-5.csv": india_news_headlines_5,
}
for csv_name, chunk in split_outputs.items():
    chunk.to_csv(csv_name, index=False)

In [6]:
#define religious vocabulary for creating religion-related datasets
# Lower-case terms; a text is treated as religion-related when one of its
# lower-cased token lemmas appears in this list.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python (previously 'worshipers' 'worshipper'
# became the single, never-matching term 'worshipersworshipper').
religious_vocab = [
    'religion', 'religious', 'hindu', 'hinduism',
    'islam', 'muslim', 'christianity', 'christian', 'sikh',
    'sikhism', 'temple', 'mosque', 'church', 'divine', 'god', 'gods',
    'prayer', 'prayers', 'priest', 'clergy', 'imam', 'monk', 'dharma',
    'vedas', 'worship', 'worshippers', 'worshipers', 'worshipper', 'worshiper', 'ayodhya',
    'babri', 'hindutva', 'lynching', 'ethnic', 'purity', 'nationalism', 'nationalist',
    'rss', 'sangh',
]

In [32]:
#Partition the headlines around the 2014 election cutoff
# publish_date is compared numerically against 20140501
# (presumably a YYYYMMDD integer -- confirm against the csv).
# Rows at or before the cutoff go to data_pre, strictly later rows to data_post.
publish_dates = data['publish_date']
data_pre = data[publish_dates.le(20140501)]
data_post = data[publish_dates.gt(20140501)]

In [55]:
#Generate religious flags for pre- and post-election datasets
def flag_religious_texts(texts, nlp, vocab):
    """Return one boolean per text: True when the text mentions a vocabulary term.

    A text is flagged when at least one purely alphabetic token has a
    lower-cased lemma that appears in ``vocab``.

    Parameters
    ----------
    texts : iterable
        Raw texts. Each item is passed through ``str()`` before tokenizing, so
        a non-string cell (e.g. a NaN from a missing value) yields a flag
        instead of an exception.
    nlp : callable
        spaCy pipeline; called once per text.
    vocab : iterable of str
        Lower-case terms to look for.

    Returns
    -------
    list of bool
        Flags in the same order as ``texts``.
    """
    vocab_terms = frozenset(vocab)            # O(1) membership instead of a list scan
    alphabetic = re.compile('[a-zA-Z]+$')     # same pattern as before, compiled once
    flags = []
    for text in texts:
        parsed = nlp(str(text))
        # any() stops at the first matching token; the resulting flag is unchanged.
        flags.append(any(
            alphabetic.match(token.orth_) and token.lemma_.lower() in vocab_terms
            for token in parsed
        ))
    return flags


# NOTE(review): the tagger is disabled, so lemma quality depends on how the installed
# spaCy version lemmatizes without POS tags -- confirm inflected forms still match.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'tagger'])
nlp.max_length = 10000000
pre_mask = flag_religious_texts(np.array(data_pre["headline_text"]), nlp, religious_vocab)
post_mask = flag_religious_texts(np.array(data_post["headline_text"]), nlp, religious_vocab)

In [58]:
#Keep only the flagged headlines in each period
# The masks are applied positionally (as plain arrays), so the row order of
# pre_mask / post_mask must match data_pre / data_post.
pre = pd.Series(pre_mask)
post = pd.Series(post_mask)
religious_pre = data_pre[pre.to_numpy()]
religious_post = data_post[post.to_numpy()]

In [59]:
#Save the religion-related headline subsets to csv files (index column omitted)
religious_outputs = {
    "religious_headlines_pre.csv": religious_pre,
    "religious_headlines_post.csv": religious_post,
}
for csv_name, subset in religious_outputs.items():
    subset.to_csv(csv_name, index=False)

In [4]:
#Import parliamentary Q&A dataset
# glob returns paths in arbitrary order; sorting makes the row order of the
# concatenated frame reproducible across machines and runs.
parliament_files = sorted(
    glob.glob('./india_analysis_UDA/Parliament_Qs/rajyasabha_questions_and_answers_*.csv')
)
if not parliament_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        "No files matched ./india_analysis_UDA/Parliament_Qs/"
        "rajyasabha_questions_and_answers_*.csv"
    )

# Each file is read into its own frame without reusing the name `data`, so the
# headlines DataFrame loaded earlier is not overwritten by this cell.
li = [pd.read_csv(csv_path) for csv_path in parliament_files]

# Stack all files into one frame with a fresh 0..n-1 index.
parliament_qs = pd.concat(li, axis = 0, ignore_index = True)

In [16]:
#Flag every parliamentary answer that mentions a term from the religious vocabulary
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'tagger'])
nlp.max_length = 10000000
answer_mask = []
for raw_answer in np.array(parliament_qs["answer"]):
    # str() coerces non-string cells to text before tokenizing
    doc = nlp(str(raw_answer))
    # lower-cased lemmas of the purely alphabetic tokens only
    alphabetic_lemmas = (
        tok.lemma_.lower() for tok in doc if re.match('[a-zA-Z]+$', tok.orth_)
    )
    # one flag per answer: True when any lemma is a vocabulary term
    answer_mask.append(any(lemma in religious_vocab for lemma in alphabetic_lemmas))

In [18]:
#Keep only the parliamentary rows whose answer was flagged as religion-related
# The mask is applied positionally, so it must be in the same row order as parliament_qs.
answer_mask = pd.Series(answer_mask)
religious_answers = parliament_qs[answer_mask.to_numpy()]

In [45]:
#Split religious parliamentary dataset based on when the new government came into office
# The cutoff is parsed once up front rather than once per row.
government_start = parse('2014.05.26')

# True for answers dated strictly before the cutoff. dtype=bool keeps the mask
# boolean even when religious_answers is empty, so the `~` negation below still works.
answers_pre_mask = pd.Series(
    [parse(date) < government_start for date in religious_answers['answer_date'].to_numpy()],
    dtype=bool,
)

# Masks are applied positionally (plain arrays), matching religious_answers' row order.
religious_answers_pre = religious_answers[answers_pre_mask.values]
religious_answers_post = religious_answers[~answers_pre_mask.values]

religious_answers_pre.to_csv("religious_answers_pre.csv", index=False)
religious_answers_post.to_csv("religious_answers_post.csv", index=False)