In [2]:
import pandas as pd
from lib.data_utils import fetch_and_prepare_data

sample_size = 10000
try:
    data_df = pd.read_csv('data_samples/paragraph-questions-10000.csv')
    print('Dataset already exists.')
except FileNotFoundError:
    data_df = await fetch_and_prepare_data(sample_size)
    data_df.to_csv('data_samples/paragraph-questions-10000.csv', index=False)
data_df

Dataset already exists.


Unnamed: 0,_sa_instance_state_question,paragraph_id,scope,text_question,timestamp,downvote,context,id_question,author_id,upvote,...,setting,id_answer,text,_sa_instance_state_rating,text_rating,value,author_id_rating,answer_id,id_rating,timestamp_rating
0,<sqlalchemy.orm.state.InstanceState object at ...,93041,single-paragraph,What did Walter de Coutances legislate against...,2024-05-15 08:01:43,0,"In an article about 'Walter de Coutances', sec...",839068,8,0,...,ic,1737878,"According to the text, Walter de Coutances leg...",<sqlalchemy.orm.state.InstanceState object at ...,This answer is fully accurate and detailed. It...,5,7,1737878,1737685,2024-05-15 08:01:57
1,<sqlalchemy.orm.state.InstanceState object at ...,374349,single-paragraph,What literary movement did Isidore Isou's expe...,2024-05-15 13:16:58,0,"In an article about 'Tristan Tzara', section '...",846903,8,0,...,zs,1755704,Isidore Isou's experiments with sounds and poe...,<sqlalchemy.orm.state.InstanceState object at ...,The provided answer is significantly incorrect...,1,7,1755704,1755512,2024-05-15 13:17:04
2,<sqlalchemy.orm.state.InstanceState object at ...,48488,single-paragraph,What do visitors often request to do at the Al...,2024-05-26 19:24:27,0,"In an article about 'Algonquin Hotel', section...",1078981,8,0,...,zs,2342344,Visitors to the Algonquin Hotel often request ...,<sqlalchemy.orm.state.InstanceState object at ...,"The answer is mostly correct, as it mentions t...",4,7,2342344,2342137,2024-05-26 19:24:35
3,<sqlalchemy.orm.state.InstanceState object at ...,752934,single-paragraph,What distinctive feature can be found on the r...,2024-05-05 02:33:30,0,"In an article about 'Holden Commodore ( VE )',...",315353,8,0,...,ic,629682,The distinctive feature found on the rear of a...,<sqlalchemy.orm.state.InstanceState object at ...,The answer accurately identifies the distincti...,5,7,629682,629522,2024-05-05 02:33:49
4,<sqlalchemy.orm.state.InstanceState object at ...,389712,single-paragraph,What operation was taking place when Lwow was ...,2024-05-09 10:36:44,0,"In an article about 'Hugo Steinhaus', section ...",583055,8,0,...,ic,1181672,Operation Barbarossa.,<sqlalchemy.orm.state.InstanceState object at ...,"The answer ""Operation Barbarossa"" is fully acc...",5,7,1181672,1181491,2024-05-09 10:37:18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,<sqlalchemy.orm.state.InstanceState object at ...,830641,single-paragraph,How many limbs must have severe disabilities t...,2024-05-02 10:47:32,0,In an article about 'Para @-@ snowboarding cla...,149789,8,0,...,ic,298653,"According to the article, an athlete must have...",<sqlalchemy.orm.state.InstanceState object at ...,The answer accurately quotes the original text...,5,7,298653,298508,2024-05-02 10:48:19
9995,<sqlalchemy.orm.state.InstanceState object at ...,28611,single-paragraph,"Who split the atom, inspiring Henry DeWolf Smy...",2024-05-03 11:30:23,0,"In an article about 'Henry DeWolf Smyth', sect...",202607,8,0,...,ic,404216,John Cockcroft and Ernest Walton.,<sqlalchemy.orm.state.InstanceState object at ...,"The answer ""John Cockcroft and Ernest Walton"" ...",5,7,404216,404056,2024-05-03 11:31:00
9996,<sqlalchemy.orm.state.InstanceState object at ...,146321,single-paragraph,What happens when Link swings an enhanced swor...,2024-05-16 06:36:03,0,In an article about 'The Legend of Zelda : Ora...,871764,8,0,...,zs,1812867,"When Link swings an enhanced sword (i.e., a sw...",<sqlalchemy.orm.state.InstanceState object at ...,"The provided answer is mostly correct, includi...",4,7,1812867,1812674,2024-05-16 06:36:12
9997,<sqlalchemy.orm.state.InstanceState object at ...,348059,single-paragraph,What was James McAvoy initially rejected for i...,2024-05-03 23:14:45,0,"In an article about 'Wanted ( 2008 film )', se...",237182,8,0,...,ic,473326,James McAvoy was initially rejected in 2006 be...,<sqlalchemy.orm.state.InstanceState object at ...,This answer is fully accurate and detailed. It...,5,7,473326,473167,2024-05-03 23:14:56


In [3]:
# Average context length per question
data_df['context_length'] = data_df['context'].apply(lambda x: len(x.split())) + data_df['text_paragraph'].apply(lambda x: len(x.split()))
average_context_length = data_df['context_length'].mean()
print(f"Average context length per question: {average_context_length}")

# Total context length for retrieval systems
total_context_length = data_df['context_length'].sum()
print(f"Total context length: {total_context_length}")

Average context length per question: 125.1030103010301
Total context length: 1250905


In [5]:
data_df['question_length'] = data_df['text_question'].apply(lambda x: len(x.split()))
average_question_length = data_df['question_length'].mean()
std_question_length = data_df['question_length'].std()
print(f"Average question length: {average_question_length}")
print(f"Standard deviation of question length: {std_question_length}")

Average question length: 16.826582658265828
Standard deviation of question length: 4.901074733122443


In [7]:
from nltk.corpus import wordnet as wn

# Function to get synsets
def get_synsets(text):
    words = text.split()
    synsets = [wn.synsets(word) for word in words]
    return synsets

data_df['question_synsets'] = data_df['text_question'].apply(get_synsets)

# Example analysis of synsets
sample_synsets = data_df['question_synsets'].head()
print(sample_synsets)

0    [[], [Synset('make.v.01'), Synset('perform.v.0...
1    [[], [Synset('literary.a.01'), Synset('literar...
2    [[], [Synset('bash.n.02'), Synset('do.n.02'), ...
3    [[], [Synset('distinctive.s.01'), Synset('clas...
4    [[], [Synset('operation.n.01'), Synset('operat...
Name: question_synsets, dtype: object


In [8]:
import spacy
nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    doc = nlp(text)
    lemmatized = [token.lemma_ for token in doc]
    return lemmatized

data_df['lemmatized_question'] = data_df['text_question'].apply(lemmatize_text)

# Dictionary coverage counting
all_words = [word for lemmatized in data_df['lemmatized_question'] for word in lemmatized]
unique_words = set(all_words)
word_count = len(unique_words)
print(f"Total unique words: {word_count}")

Total unique words: 21668


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(data_df['text_question'])

# LDA for topic modeling
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(tfidf_matrix)

# Display topics
for index, topic in enumerate(lda.components_):
    print(f"Topic #{index}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

Topic #0:
['do', 'is', 'how', 'and', 'in', 'to', 'type', 'what', 'the', 'of']
Topic #1:
['new', 'on', 'was', 'and', 'in', 'did', 'what', 'of', 'to', 'the']
Topic #2:
['by', 'title', 'was', 'album', 'released', 'in', 'what', 'of', 'is', 'the']
Topic #3:
['where', 'which', 'and', 'are', 'is', 'to', 'in', 'what', 'of', 'the']
Topic #4:
['date', 'which', 'year', 'did', 'was', 'on', 'of', 'what', 'in', 'the']
Topic #5:
['wrote', 'to', 'what', 'about', 'how', 'did', 'who', 'of', 'in', 'the']
Topic #6:
['was', 'during', 'and', 'who', 'what', 'to', 'in', 'did', 'of', 'the']
Topic #7:
['role', 'film', 'was', 'and', 'which', 'what', 'who', 'of', 'in', 'the']
Topic #8:
['did', 'was', 'according', 'and', 'in', 'is', 'what', 'of', 'to', 'the']
Topic #9:
['and', 'to', 'what', 'which', 'did', 'for', 'of', 'was', 'in', 'the']


In [12]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# 4-bag-of-words
vectorizer_4 = CountVectorizer(ngram_range=(4, 4))
bow_4 = vectorizer_4.fit_transform(data_df['text_question'])
bow_4_freq = dict(zip(vectorizer_4.get_feature_names_out(), bow_4.sum(axis=0).tolist()[0]))

# 8-bag-of-words
vectorizer_8 = CountVectorizer(ngram_range=(8, 8))
bow_8 = vectorizer_8.fit_transform(data_df['text_question'])
bow_8_freq = dict(zip(vectorizer_8.get_feature_names_out(), bow_8.sum(axis=0).tolist()[0]))

# Find highest frequency top 8
top_8_bow_4 = Counter(bow_4_freq).most_common(32)
top_8_bow_8 = Counter(bow_8_freq).most_common(32)
print("Top 8 4-bag-of-words:", top_8_bow_4)
print("Top 8 8-bag-of-words:", top_8_bow_8)

Top 8 4-bag-of-words: [('in what year did', 197), ('the name of the', 142), ('in what year was', 141), ('the title of the', 133), ('what is the name', 120), ('is the name of', 114), ('what is the title', 112), ('is the title of', 111), ('on what date did', 84), ('what year was the', 73), ('on what date was', 70), ('what year did the', 63), ('in the united states', 59), ('what was the title', 58), ('was the title of', 56), ('what is the primary', 51), ('in relation to the', 49), ('what was the original', 46), ('what was the name', 44), ('in the context of', 43), ('was the name of', 41), ('what date did the', 40), ('and what was the', 39), ('played the role of', 38), ('what is the typical', 37), ('what was the outcome', 37), ('what date was the', 34), ('what was the primary', 33), ('was the outcome of', 31), ('during the battle of', 30), ('in terms of its', 28), ('what happens to the', 27)]
Top 8 8-bag-of-words: [('what is the term used to describe the', 10), ('is the name of the charact

In [13]:
for i, (word, freq) in enumerate(top_8_bow_4):
    print(f"4-bag-of-words #{i}: {word} ({freq})")

4-bag-of-words #0: in what year did (197)
4-bag-of-words #1: the name of the (142)
4-bag-of-words #2: in what year was (141)
4-bag-of-words #3: the title of the (133)
4-bag-of-words #4: what is the name (120)
4-bag-of-words #5: is the name of (114)
4-bag-of-words #6: what is the title (112)
4-bag-of-words #7: is the title of (111)
4-bag-of-words #8: on what date did (84)
4-bag-of-words #9: what year was the (73)
4-bag-of-words #10: on what date was (70)
4-bag-of-words #11: what year did the (63)
4-bag-of-words #12: in the united states (59)
4-bag-of-words #13: what was the title (58)
4-bag-of-words #14: was the title of (56)
4-bag-of-words #15: what is the primary (51)
4-bag-of-words #16: in relation to the (49)
4-bag-of-words #17: what was the original (46)
4-bag-of-words #18: what was the name (44)
4-bag-of-words #19: in the context of (43)
4-bag-of-words #20: was the name of (41)
4-bag-of-words #21: what date did the (40)
4-bag-of-words #22: and what was the (39)
4-bag-of-words #23: