In [22]:
import pandas as pd
import string

# **Reading Data**

In [39]:
df1 = pd.read_csv(r'S08_question_answer_pairs.txt',encoding='latin-1',sep='\t')
df2 = pd.read_csv(r'S09_question_answer_pairs.txt',encoding='latin-1',sep='\t')
df3 = pd.read_csv(r'S10_question_answer_pairs.txt',encoding='latin-1',sep='\t')
df = pd.concat([df1,df2,df3],ignore_index=True)
df.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile,ï»¿ArticleTitle
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4,
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4,
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4,
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4,
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4,


In [40]:
print("Number of records: ",len(df))

Number of records:  3998


# **Data Preprocessing- Cleaning**

In [41]:
df.info()
df = df[['Question','Answer','ArticleFile','DifficultyFromQuestioner','DifficultyFromAnswerer']]
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3998 entries, 0 to 3997
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ArticleTitle              3173 non-null   object
 1   Question                  3961 non-null   object
 2   Answer                    3422 non-null   object
 3   DifficultyFromQuestioner  3043 non-null   object
 4   DifficultyFromAnswerer    3418 non-null   object
 5   ArticleFile               3996 non-null   object
 6   ï»¿ArticleTitle           825 non-null    object
dtypes: object(7)
memory usage: 218.8+ KB


Unnamed: 0,Question,Answer,ArticleFile,DifficultyFromQuestioner,DifficultyFromAnswerer
0,Was Abraham Lincoln the sixteenth President of...,yes,S08_set3_a4,easy,easy
1,Was Abraham Lincoln the sixteenth President of...,Yes.,S08_set3_a4,easy,easy
2,Did Lincoln sign the National Banking Act of 1...,yes,S08_set3_a4,easy,medium
3,Did Lincoln sign the National Banking Act of 1...,Yes.,S08_set3_a4,easy,easy
4,Did his mother die of pneumonia?,no,S08_set3_a4,easy,medium


In [42]:
print(f"Number of duplicate questions: {len(df) - df['Question'].nunique()}")

df.drop_duplicates(subset=['Question'],inplace=True)
print("\nNumber of records after removing duplicates: ",len(df))

Number of duplicate questions: 1542

Number of records after removing duplicates:  2457


In [43]:
# Checking null values and removing them
df.isnull().sum()

Question                      1
Answer                      272
ArticleFile                   2
DifficultyFromQuestioner    892
DifficultyFromAnswerer      277
dtype: int64

In [44]:
df.dropna(subset=['Question'],inplace=True)
df.dropna(subset=['Answer'],inplace=True)
df.dropna(subset=['ArticleFile'],inplace=True)
df.dropna(subset=['DifficultyFromQuestioner'],inplace=True)
df.dropna(subset=['DifficultyFromAnswerer'],inplace=True)
df.reset_index(drop=True,inplace=True)
print("Number of records after removing blanks: ",len(df))

Number of records after removing blanks:  1507


In [45]:
def format_column(text):
    text = text.lower()
    text = text.translate(str.maketrans('','',string.punctuation)) # remove punctuations
    return text

df['Answer'] = df.loc[:,'Answer'].apply(lambda x:format_column(x))
df.head()

Unnamed: 0,Question,Answer,ArticleFile,DifficultyFromQuestioner,DifficultyFromAnswerer
0,Was Abraham Lincoln the sixteenth President of...,yes,S08_set3_a4,easy,easy
1,Did Lincoln sign the National Banking Act of 1...,yes,S08_set3_a4,easy,medium
2,Did his mother die of pneumonia?,no,S08_set3_a4,easy,medium
3,How many long was Lincoln's formal education?,18 months,S08_set3_a4,medium,easy
4,When did Lincoln begin his political career?,1832,S08_set3_a4,medium,easy


In [46]:
dfc=df
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1507 entries, 0 to 1506
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Question                  1507 non-null   object
 1   Answer                    1507 non-null   object
 2   ArticleFile               1507 non-null   object
 3   DifficultyFromQuestioner  1507 non-null   object
 4   DifficultyFromAnswerer    1507 non-null   object
dtypes: object(5)
memory usage: 59.0+ KB


# **Topic Modelling**

In [102]:
import spacy
spacy.load('en_core_web_sm')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

  and should_run_async(code)


In [103]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

  and should_run_async(code)
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [51]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [52]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4] #include the token only if it's length is greater than 4
    tokens = [token for token in tokens if token not in en_stop] #include if token is not a stopword
    tokens = [get_lemma(token) for token in tokens] #lematizing each word in the token list
    return tokens
#preparing the text for topic modelling

**Seperating the Questions form the database**

In [56]:
selected_column=df['Question']
output_csv_file='QuestionForTopicDetection.csv'
selected_column.to_csv(output_csv_file, index=False, header=True)

In [109]:
#Run this only to get the lemma for all the Questions
import random
text_data = []
# count=0
with open('QuestionForTopicDetection.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        print(tokens)
        text_data.append(tokens)
        # count+=1
# print(count)

  and should_run_async(code)


['question']
['abraham', 'lincoln', 'sixteenth', 'president', 'unite', 'state']
['lincoln', 'national', 'banking']
['mother', 'pneumonia']
['lincoln', 'formal', 'education']
['lincoln', 'begin', 'political', 'career']
['legal', 'tender', 'establish']
['suggest', 'lincoln', 'beard']
['gettysburg', 'address', 'argue', 'america']
['lincoln', 'breckinridge', 'election']
['abraham', 'lincoln', 'first', 'president', 'unite', 'state']
['lincoln', 'start', 'political', 'career']
['lincoln', 'represent', 'alton', 'sangamon', 'railroad']
['county', 'lincoln']
['lincoln', 'first', 'serve', 'president']
['assassinate', 'lincoln']
['lincoln', 'election']
['general', 'charge', 'battle', 'antietam']
['lincoln', 'issue', 'emancipation', 'proclamation']
['beetle', 'insect']
['beetle', 'found', 'polar', 'region']
['beetle', 'antenna', 'function', 'primarily', 'organs', 'smell']
['three', 'section', 'beetle']
['defense', 'mechanism', 'colour', 'shape', 'deceive', 'potential', 'enemy']
['beetle', 'potato'

In [105]:
#here we are showcasing only for some random questions
import random
text_data = []
with open('QuestionForTopicDetection.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)


['adult', 'duck', 'flier']
['award', 'call', 'congressman', 'congressman']
['early', 'naturalist', 'distinguish', 'leopard', 'panther']
['millard', 'fillmore']
['challenge', 'establish', 'population', 'eurasian', 'otter']
['nickname', 'theodore', 'roosevelt', 'sister']
['grant', 'nothing', 'batte']
['netwon', 'investigate', 'refraction', 'light']
['hassan', 'massoudy', 'master', 'genre']
['butterfly', 'migratory']
['majority', 'loanword', 'korean']
['raise', 'extend', 'sound', 'chest', 'classical']
['english', 'language', 'often', 'speak', 'montreal']


  and should_run_async(code)


In [116]:
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
# Create Dictionary
dictionary = corpora.Dictionary(text_data)
# Create Corpus
texts = text_data
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1][0][:30])

[(0, 1)]


  and should_run_async(code)


In [118]:
from pprint import pprint
# number of topics
num_topics = 10
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

  and should_run_async(code)


[(0,
  '0.012*"giant" + 0.012*"panda" + 0.011*"population" + 0.011*"flute" + '
  '0.010*"state" + 0.009*"unite" + 0.009*"president" + 0.007*"james" + '
  '0.007*"first" + 0.006*"speak"'),
 (1,
  '0.016*"james" + 0.013*"monroe" + 0.012*"turtle" + 0.009*"language" + '
  '0.008*"population" + 0.008*"volta" + 0.008*"jakarta" + 0.007*"piano" + '
  '0.007*"penguin" + 0.007*"predator"'),
 (2,
  '0.020*"language" + 0.017*"call" + 0.014*"cymbal" + 0.012*"official" + '
  '0.009*"turtle" + 0.008*"roosevelt" + 0.007*"common" + 0.007*"species" + '
  '0.007*"large" + 0.007*"cello"'),
 (3,
  '0.011*"duck" + 0.008*"bury" + 0.008*"tesla" + 0.008*"volta" + '
  '0.008*"faraday" + 0.007*"french" + 0.007*"vietnamese" + 0.007*"lincoln" + '
  '0.007*"giraffe" + 0.006*"kangaroo"'),
 (4,
  '0.022*"otter" + 0.012*"turtle" + 0.010*"faraday" + 0.010*"found" + '
  '0.009*"tesla" + 0.009*"beetle" + 0.007*"group" + 0.007*"ghana" + '
  '0.007*"octopus" + 0.006*"people"'),
 (5,
  '0.034*"language" + 0.011*"arabic" + 0

In [121]:
import pyLDAvis.gensim
import pickle
import pyLDAvis
import os
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_'+str(num_topics))

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

  and should_run_async(code)


BrokenProcessPool: ignored

In [123]:
import pickle

def is_picklable(obj):
    try:
        pickle.dumps(obj)
        return True
    except pickle.PicklingError:
        return False

print(is_picklable(dictionary))  # True
print(is_picklable(corpus))  # True
print(is_picklable(lda_model))  # True

True
True
True


  and should_run_async(code)
