In [2]:
#Import packages
import pandas as pd
import numpy as np
import glob
import re
import string
import codecs

In [3]:
#Create single parliament_qs dataframe with all question data
li = []

for file in glob.glob('./Parliament_Qs/rajyasabha_questions_and_answers_*.csv'):
    data = pd.read_csv(file)
    li.append(data)
    
parliament_qs = pd.concat(li, axis = 0, ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [4]:
#Code borrowed and adapted from George Chen, Carnegie Mellon University#
#Define function to remove punctuation and whitespace, and lowercase all text
def makeWordList(str_object):
    
    corpus_text = str(str_object)
    
    for c in string.punctuation:
        corpus_text = corpus_text.replace(c, "")  # -- (1)
    
    text = re.sub(r'\S*\d\S*','',corpus_text) # -- (2)
    text = re.sub(r'[^\w\s]','',text)         # -- (3)
    
    text = text.lower().split()           # -- (4)         
    
    li = []
    for token in text:
        li.append(token)

    return " ".join(li)

In [5]:
#Process the questions
processed_questions = []

for str_object in list(parliament_qs["question_description"]):
    processed_questions.append(makeWordList(str_object))

In [6]:
#Process responses
processed_answers = []

for str_object in list(parliament_qs["answer"]):
    processed_answers.append(makeWordList(str_object))

In [8]:
#Use TfidfVectorizer to transform parliamentary questions
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=200, stop_words="english", max_df=0.8)
questions_fit = vectorizer.fit(processed_questions)
X_questions = vectorizer.fit_transform(processed_questions).toarray()

In [9]:
#!Time-consuming!#
#Create topics using LDA
num_topics = 10

from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=0)
lda.fit(X_questions)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [10]:
#Display top 10 words from each topic
words = list(questions_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda.components_])
num_top_words = 10

print('Displaying the top 10 words per topic and their probabilities within the topic...')
print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for rank in range(num_top_words):
        word_idx = sort_indices[rank]
        print(words[word_idx], ':', topic_word_distributions[topic_idx, word_idx])
    print()

Displaying the top 10 words per topic and their probabilities within the topic...

[Topic 0]
gap : 0.01579022441204997
infrastructure : 0.014897226390985315
prevention : 0.014573223734675381
bureau : 0.014133207481119538
proportion : 0.012630412616072841
industrial : 0.012042821713099805
hilly : 0.012024388809306048
directed : 0.010113664162308359
partnership : 0.00971378763978077
energy : 0.00963057648174496

[Topic 1]
schools : 0.028377798470822658
value : 0.022452517968475666
mission : 0.015641742441613003
vegetables : 0.014005590929451263
cil : 0.012541512057791986
infrastructure : 0.012243526742358556
attention : 0.011411070447612725
aviation : 0.011187747979636595
completely : 0.011059364577094612
directed : 0.010075847135726523

[Topic 2]
laying : 0.017437642339725355
gap : 0.016036999598692318
interests : 0.015574737920312092
suffering : 0.015011090032775643
amend : 0.014575143537023516
granted : 0.014128134505889766
infrastructure : 0.014042519723800414
speed : 0.0127480814858

In [11]:
#Use TfidfVectorizer to transform parliamentary answers
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=200, stop_words="english", max_df=0.8)
answers_fit = vectorizer.fit(processed_answers)
X_answers = vectorizer.fit_transform(processed_answers).toarray()

In [12]:
#!Time-consuming!#
#Create topics using LDA
num_topics = 10

from sklearn.decomposition import LatentDirichletAllocation
lda_answers = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=0)
lda_answers.fit(X_answers)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [13]:
#Display top 10 words from each topic
words = list(answers_fit.vocabulary_)
topic_word_distributions = np.array([row / row.sum() for row in lda_answers.components_])
num_top_words = 10

print('Displaying the top 10 words per topic and their probabilities within the topic...')
print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    sort_indices = np.argsort(-topic_word_distributions[topic_idx])
    for rank in range(num_top_words):
        word_idx = sort_indices[rank]
        print(words[word_idx], ':', topic_word_distributions[topic_idx, word_idx])
    print()

Displaying the top 10 words per topic and their probabilities within the topic...

[Topic 0]
factories : 0.02988721628112838
obcs : 0.02096461145550225
turn : 0.01278346755295074
telecommunication : 0.012244657374213581
tushar : 0.011854834372685976
eia : 0.010469027885389176
brgf : 0.01040769803138051
nadu : 0.010387606802206852
age : 0.010066275250836669
yesso : 0.00893199274924593

[Topic 1]
modes : 0.054054773288766136
adhere : 0.053096678905447885
delays : 0.05243857271473336
output : 0.050170072321687506
pala : 0.027718365872471375
thereunder : 0.024624474760197763
bulk : 0.021081139431236313
load : 0.020522358079513296
currency : 0.020150267589209212
deficiencies : 0.019329537512068926

[Topic 2]
audit : 0.04533189827779601
exercise : 0.02308047410385203
points : 0.022290914541281563
plan : 0.019666558375040173
measure : 0.01692254929341677
doctors : 0.015094389664007071
gurjara : 0.01447741111652817
monitoring : 0.013979563620813908
rules : 0.01294867238909764
widening : 0.0128