In [26]:
import pandas as pd

dataset1 = pd.read_csv('main.csv').rename(columns={'ground_truth':'answer'})[['question', 'answer']].dropna()
dataset2 = pd.read_csv('dev.csv').rename(columns={'ground_truth':'answer'})[['question', 'answer']].dropna()

# Determine the smaller dataset and its length
if len(dataset1) < len(dataset2):
    dataset2 = dataset2.head(len(dataset1))
else:
    dataset2 = dataset2.head(len(dataset2))


In [27]:
import string
from collections import Counter
from scipy.stats import ks_2samp, pearsonr

def preprocess_text(text):
    """
    Preprocess the text data.
    """
    # Remove punctuation and convert to lowercase
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator).lower()
    return text

def statistical_similarity(data):
    """
    Calculate statistical properties of the dataset.
    """
    question_lengths = [len(preprocess_text(q)) for q in data['question']]
    answer_lengths = [len(preprocess_text(a)) for a in data['answer']]
    
    q_mean, q_std = np.mean(question_lengths), np.std(question_lengths)
    a_mean, a_std = np.mean(answer_lengths), np.std(answer_lengths)
    
    return (q_mean, q_std), (a_mean, a_std)

def ks_test(data1, data2):
    """
    Perform the Kolmogorov-Smirnov test to compare the distributions.
    """
    q_lengths1 = [len(preprocess_text(q)) for q in data1['question']]
    q_lengths2 = [len(preprocess_text(q)) for q in data2['question']]
    a_lengths1 = [len(preprocess_text(a)) for a in data1['answer']]
    a_lengths2 = [len(preprocess_text(a)) for a in data2['answer']]
    
    q_statistic, q_pvalue = ks_2samp(q_lengths1, q_lengths2)
    a_statistic, a_pvalue = ks_2samp(a_lengths1, a_lengths2)
    
    return (q_statistic, q_pvalue), (a_statistic, a_pvalue)

def correlation_coefficient(data1, data2):
    """
    Calculate Pearson's correlation coefficient.
    """
    q_lengths1 = [len(preprocess_text(q)) for q in data1['question']]
    q_lengths2 = [len(preprocess_text(q)) for q in data2['question']]
    a_lengths1 = [len(preprocess_text(a)) for a in data1['answer']]
    a_lengths2 = [len(preprocess_text(a)) for a in data2['answer']]
    
    q_corr_coef, q_pvalue = pearsonr(q_lengths1, q_lengths2)
    a_corr_coef, a_pvalue = pearsonr(a_lengths1, a_lengths2)
    
    return (q_corr_coef, q_pvalue), (a_corr_coef, a_pvalue)

In [35]:
# Calculate metrics
stat_sim1, stat_sim2 = statistical_similarity(dataset1), statistical_similarity(dataset2)
ks_stat1, ks_stat2 = ks_test(dataset1, dataset2)
corr_coef1, corr_coef2 = correlation_coefficient(dataset1, dataset2)

# Print results
print("Dataset 1 Metrics:")
print(f"Statistical Similarity (Question, Answer): {stat_sim1}")
print(f"KS Test Statistic (Question, Answer): {ks_stat1}")
print(f"Correlation Coefficient (Question, Answer): {corr_coef1}")

print("\nDataset 2 Metrics:")
print(f"Statistical Similarity (Question, Answer): {stat_sim2}")
print(f"KS Test Statistic (Question, Answer): {ks_stat2}")
print(f"Correlation Coefficient (Question, Answer): {corr_coef2}")

Dataset 1 Metrics:
Statistical Similarity (Question, Answer): ((147.66666666666666, 60.266814159111554), (234.66666666666666, 118.67022466576114))
KS Test Statistic (Question, Answer): (0.4, 0.18441617684449832)
Correlation Coefficient (Question, Answer): (0.14762837910715754, 0.5995474589012386)

Dataset 2 Metrics:
Statistical Similarity (Question, Answer): ((113.33333333333333, 47.616057048950296), (244.26666666666668, 133.77865632786452))
KS Test Statistic (Question, Answer): (0.2, 0.9383310279844598)
Correlation Coefficient (Question, Answer): (-0.17883972483133398, 0.5236509924002763)
