In [3]:
import pandas as pd
import os
from typing import List

class QADatasetAnalyzer:
    """Load question/answer CSV datasets and summarise question lengths.

    Each CSV is read into a pandas DataFrame and stored in ``self.datasets``
    under a language label derived from the file name (for example
    ``trivia_qa_polish.csv`` is stored as ``'Polish'``). Two files that
    resolve to the same label overwrite each other; the last one wins.

    Attributes are documented inline in ``__init__``.
    """

    # Column layout ([question column, answer column]) assumed for any
    # language that has no explicit entry in ``self.column_names``.
    _DEFAULT_COLUMNS = ['Question', 'Answer']

    def __init__(self, file_paths: List[str]):
        """Read every CSV in *file_paths* and report its row count.

        Args:
            file_paths: Paths to CSV files. The language label is taken from
                the third underscore-separated part of the file name, or from
                the last part when the name has fewer than three parts.

        Raises:
            Whatever ``pandas.read_csv`` raises for an unreadable path.
        """
        # Language label -> DataFrame loaded from the corresponding CSV.
        self.datasets = {}
        # Language label -> [question column, answer column].
        self.column_names = {
            'Bulgarian': ['Question', 'Answer'],
            'Polish': ['Question', 'Answer'],
            'Chosen': ['Question', 'Answer']
        }

        for file_path in file_paths:
            language = self._language_from_path(file_path)
            self.datasets[language] = pd.read_csv(file_path)
            print(f"Loaded {language} dataset: {len(self.datasets[language])} rows")

    @staticmethod
    def _language_from_path(file_path: str) -> str:
        """Return the capitalised language label encoded in a file name."""
        base_name = os.path.basename(file_path)
        parts = base_name.split('_')
        # Names such as "polish.csv" have no third part; fall back to the
        # last part instead of failing with IndexError.
        token = parts[2] if len(parts) > 2 else parts[-1]
        return token.split('.')[0].capitalize()

    def language_summary(self) -> pd.DataFrame:
        """Return per-language question length statistics, measured in words.

        Words are whitespace-separated tokens of the question text. Rows whose
        question is missing (NaN) are excluded, so they are not counted as the
        one-word string ``"nan"``.

        Returns:
            A DataFrame indexed by language label with the columns
            'Avg Question Length (Words)', 'Max Question Length (Words)' and
            'Min Question Length (Words)'. A dataset without any question
            yields NaN in all three columns.

        Raises:
            KeyError: If a dataset does not contain its question column.
        """
        summary = {}
        for lang, df in self.datasets.items():
            # Languages outside the hard-coded table use the default layout
            # rather than failing on the dictionary lookup.
            question_column = self.column_names.get(lang, self._DEFAULT_COLUMNS)[0]
            if question_column not in df.columns:
                raise KeyError(
                    f"Dataset '{lang}' has no '{question_column}' column"
                )

            # An explicit integer dtype keeps the statistics well-defined
            # (NaN) even when no question is left after dropping NaN rows.
            question_lengths = pd.Series(
                [len(str(text).split()) for text in df[question_column].dropna()],
                dtype='int64',
            )

            summary[lang] = {
                'Avg Question Length (Words)': question_lengths.mean(),
                'Max Question Length (Words)': question_lengths.max(),
                'Min Question Length (Words)': question_lengths.min()
            }

        return pd.DataFrame.from_dict(summary, orient='index')

if __name__ == '__main__':
    # Script entry point: load the three trivia QA CSVs and print their
    # question length statistics as a LaTeX table.
    csv_paths = [
        '../data/input/trivia_qa_bulgarian.csv',
        '../data/input/trivia_qa_polish.csv',
        '../data/input/trivia_qa_chosen.csv'
    ]

    # Constructing the analyzer reads every CSV and prints one
    # "Loaded ..." line per file.
    qa_analyzer = QADatasetAnalyzer(csv_paths)

    print("\nQuestion Length Statistics per Language:")
    # The heading is printed before the statistics are computed, so it still
    # appears if the summary step fails.
    stats_table = qa_analyzer.language_summary()
    print(stats_table.to_latex())


Loaded Bulgarian dataset: 100 rows
Loaded Polish dataset: 100 rows
Loaded Chosen dataset: 100 rows

Question Length Statistics per Language:
\begin{tabular}{lrrr}
\toprule
 & Avg Question Length (Words) & Max Question Length (Words) & Min Question Length (Words) \\
\midrule
Bulgarian & 10.930000 & 26 & 4 \\
Polish & 9.600000 & 19 & 4 \\
Chosen & 11.740000 & 27 & 5 \\
\bottomrule
\end{tabular}

