In [1]:
from pathlib import Path

import pandas as pd

from lib.data_utils import fetch_and_prepare_data_all

try:
    data_df = pd.read_csv('data_samples/askme-full.csv')
    print('Dataset already exists.')
except FileNotFoundError:
    data_df = await fetch_and_prepare_data_all()
    data_df.to_csv('data_samples/askme-full.csv', index=False)
data_df

Unnamed: 0,_sa_instance_state_question,text_question,paragraph_id,scope,timestamp,downvote,filtered,is_answerable_ic,processed_question,id_question,...,question_id,author_id_answer,processed,_sa_instance_state_rating,text_rating,value,author_id_rating,id_rating,answer_id,timestamp_rating
0,<sqlalchemy.orm.state.InstanceState object at ...,How many passengers can the New Flyer DE60LFA ...,1,single-paragraph,2024-06-26 00:46:58,0,True,True,True,1,...,1,2,True,<sqlalchemy.orm.state.InstanceState object at ...,The answer accurately states the total passeng...,5,3,2,2,2024-06-26 03:56:37
1,<sqlalchemy.orm.state.InstanceState object at ...,What feature does the front door of the Swift ...,1,single-paragraph,2024-06-26 00:46:58,0,True,True,True,2,...,2,2,True,<sqlalchemy.orm.state.InstanceState object at ...,The answer accurately states that the front do...,5,3,4,4,2024-06-26 03:57:23
2,<sqlalchemy.orm.state.InstanceState object at ...,Where are the interior bike racks located on t...,1,single-paragraph,2024-06-26 00:46:58,0,True,True,True,3,...,3,2,True,<sqlalchemy.orm.state.InstanceState object at ...,This answer is fully accurate and detailed. It...,5,3,6,6,2024-06-26 03:57:30
3,<sqlalchemy.orm.state.InstanceState object at ...,Where do the Swift Bus Rapid Transit coaches g...,1,single-paragraph,2024-06-26 00:46:58,0,True,True,True,4,...,4,2,True,<sqlalchemy.orm.state.InstanceState object at ...,The answer provided is fully accurate and deta...,5,3,8,8,2024-06-26 03:57:18
4,<sqlalchemy.orm.state.InstanceState object at ...,Which university's sinking was featured in a l...,3,single-paragraph,2024-06-26 00:46:56,0,True,True,True,5,...,5,2,True,<sqlalchemy.orm.state.InstanceState object at ...,"The answer ""Cambridge University's"" is fully a...",5,3,10,10,2024-06-26 03:57:34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39825,<sqlalchemy.orm.state.InstanceState object at ...,Why was concrete chosen as the material for th...,9999,single-paragraph,2024-06-26 01:03:58,0,True,True,True,39826,...,39826,2,True,<sqlalchemy.orm.state.InstanceState object at ...,The answer provided accurately states that con...,5,3,79652,79652,2024-06-26 07:08:06
39826,<sqlalchemy.orm.state.InstanceState object at ...,Where did Don Dunstan move to in 1972 after se...,10000,single-paragraph,2024-06-26 01:04:23,0,True,True,True,39827,...,39827,2,True,<sqlalchemy.orm.state.InstanceState object at ...,This answer is fully accurate and detailed. It...,5,3,79654,79654,2024-06-26 07:08:05
39827,<sqlalchemy.orm.state.InstanceState object at ...,What happened to the family home after Don Dun...,10000,single-paragraph,2024-06-26 01:04:23,0,True,True,True,39828,...,39828,2,True,<sqlalchemy.orm.state.InstanceState object at ...,"The answer ""The family home was sold"" is mostl...",4,3,79656,79656,2024-06-26 07:08:07
39828,<sqlalchemy.orm.state.InstanceState object at ...,In what year did Don Dunstan's divorce from hi...,10000,single-paragraph,2024-06-26 01:04:23,0,True,True,True,39829,...,39829,2,True,<sqlalchemy.orm.state.InstanceState object at ...,This answer is fully accurate and detailed. It...,5,3,79658,79658,2024-06-26 07:08:09


In [7]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# --- Context statistics ---
# A row's context length is the word count of `context` plus the word count
# of `text_paragraph`.
context_word_counts = data_df['context'].apply(_count_words)
paragraph_word_counts = data_df['text_paragraph'].apply(_count_words)
data_df['context_length'] = context_word_counts + paragraph_word_counts
context_lengths = data_df['context_length']

average_context_length = context_lengths.mean()
print(f"Average context length per question: {average_context_length}")

std_dev_context_length = context_lengths.std()
print(f"Standard deviation of context length: {std_dev_context_length}")

# Sum over all rows (the total a retrieval system would have to cover).
total_context_length = context_lengths.sum()
print(f"Total context length: {total_context_length}")

# Distinct values of the `context` column.
num_unique_contexts = data_df['context'].nunique()
print(f"Number of unique contexts: {num_unique_contexts}")

# --- Question statistics ---
data_df['question_length'] = data_df['text_question'].apply(_count_words)
question_lengths = data_df['question_length']
average_question_length = question_lengths.mean()
std_question_length = question_lengths.std()
print(f"Average question length: {average_question_length}")
print(f"Standard deviation of question length: {std_question_length}")

# --- Answer statistics ---
# Values in `text` that are not strings are stringified before being counted.
data_df['answer_length'] = data_df['text'].apply(
    lambda value: _count_words(value if isinstance(value, str) else str(value))
)
answer_lengths = data_df['answer_length']
average_answer_length = answer_lengths.mean()
std_answer_length = answer_lengths.std()
print(f"Average answer length: {average_answer_length}")
print(f"Standard deviation of answer length: {std_answer_length}")

Average context length per question: 125.3003109740674
Standard deviation of context length: 74.37142788610177
Total context length: 143160492
Number of unique contexts: 209538
Average question length: 17.203674447874427
Standard deviation of question length: 5.166980776741944
Average answer length: 23.969771710199826
Standard deviation of answer length: 22.620717456073777


In [None]:
import plotly.express as px

# Group questions by their first three whitespace-separated words and show the
# group sizes as a sunburst chart.
LEADING_WORD_COLUMNS = ['first_word', 'second_word', 'third_word']

# Tokenize each question once. A value that is not a string yields no tokens,
# so it falls into the all-empty group instead of raising.
question_tokens = data_df['text_question'].apply(
    lambda question: question.split() if isinstance(question, str) else []
)

# Every position is padded with '' when the question is too short; this now
# includes the first word, so an empty question no longer raises IndexError.
for position, column in enumerate(LEADING_WORD_COLUMNS):
    data_df[column] = question_tokens.apply(
        lambda tokens, i=position: tokens[i] if len(tokens) > i else ''
    )

# One row per distinct (first, second, third) word triple with its row count.
naive_clusters = (
    data_df.groupby(LEADING_WORD_COLUMNS).size().reset_index(name='frequency')
)

fig = px.sunburst(
    naive_clusters,
    path=LEADING_WORD_COLUMNS,
    values='frequency',
    title='Naive Clustering of Questions by First Three Words',
)

fig.show()

In [None]:
import nltk
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

# nltk.word_tokenize needs the Punkt tokenizer data. Older NLTK releases load
# it from the 'punkt' resource, while recent releases look for 'punkt_tab'
# instead, so request both and let the installed version use the one it needs.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

def get_top_ngrams(questions, n, top_k=32):
    """Return the ``top_k`` most frequent n-grams across ``questions``.

    Each question is lower-cased and tokenized with ``nltk.word_tokenize``;
    the n-grams of all questions are tallied in a single counter.

    Args:
        questions: Iterable of question strings.
        n: Size of the n-grams to count.
        top_k: Number of most common n-grams to return.

    Returns:
        A list of ``(ngram_tuple, count)`` pairs as produced by
        ``Counter.most_common``.
    """
    tally = Counter(
        gram
        for text in questions
        for gram in ngrams(nltk.word_tokenize(text.lower()), n)
    )
    return tally.most_common(top_k)

# Build a table of the most frequent n-grams (n = 1..NGRAM_MAX_N) over all
# questions and save it to 'top_ngrams.csv'.
NGRAM_TOP_K = 32  # ranks kept per n-gram size; also the number of table columns
NGRAM_MAX_N = 8   # largest n-gram size; also the number of table rows

# Read the questions from the notebook's dataset frame (`data_df`, column
# `text_question`); the previous `df['Question']` is not defined in the cells
# shown. Missing values are dropped because get_top_ngrams calls .lower() on
# every entry.
questions = data_df['text_question'].dropna().astype(str).tolist()

df_ngram = pd.DataFrame(
    columns=[f'rank-{rank}' for rank in range(1, NGRAM_TOP_K + 1)],
    index=[f'{size}-gram' for size in range(1, NGRAM_MAX_N + 1)],
)

for n in tqdm(range(1, NGRAM_MAX_N + 1), desc="Processing n-grams"):
    top_ngrams = get_top_ngrams(questions, n, top_k=NGRAM_TOP_K)
    for rank, (ngram, freq) in enumerate(top_ngrams, start=1):
        # Each cell stores the pair (space-joined n-gram, frequency).
        df_ngram.at[f'{n}-gram', f'rank-{rank}'] = (' '.join(ngram), freq)
    print(f"Top {n}-grams: {top_ngrams[:5]}")
    # The file is rewritten after every n, so rows finished so far are on disk
    # even if a later pass is interrupted.
    df_ngram.to_csv('top_ngrams.csv', index=True)