In [1]:
from tqdm import tqdm
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the 'punkt' package through NLTK's downloader; when the package is
# already installed this only reports that it is up to date.
# NOTE(review): presumably this is the model data that sent_tokenize /
# word_tokenize (imported above) load at call time — confirm the resource
# name against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lucaf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Function to analyze text
def analyze_text(text, language='english'):
    """Count the tokens and sentences in ``text``.

    Args:
        text: The string to analyze.
        language: Name of the sentence-splitting model forwarded to NLTK's
            ``sent_tokenize`` and ``word_tokenize``. The default,
            ``'english'``, is the value NLTK itself uses when no language is
            given, so existing one-argument calls behave exactly as before.

    Returns:
        A tuple ``(num_words, num_sentences, avg_sentence_length)``:

        * ``num_words`` -- number of tokens returned by ``word_tokenize``.
          NOTE(review): NLTK's word tokenizer normally emits punctuation marks
          as separate tokens, so they are included in this count.
        * ``num_sentences`` -- number of sentences returned by
          ``sent_tokenize``.
        * ``avg_sentence_length`` -- ``num_words / num_sentences``, or ``0``
          when no sentence was found (e.g. empty input).
    """
    sentences = sent_tokenize(text, language=language)
    words = word_tokenize(text, language=language)
    num_sentences = len(sentences)
    num_words = len(words)
    # Guard the division: empty or whitespace-only input yields zero sentences.
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
    return num_words, num_sentences, avg_sentence_length


In [3]:
# Directory paths
dev_dir = 'flores200_dataset/dev'
devtest_dir = 'flores200_dataset/devtest'

# Running totals for the entire dataset
total_words = total_sentences = total_texts = 0

# Running totals for Maori (files whose name contains 'mri_Latn')
maori_words = maori_sentences = maori_texts = 0

# Running totals for English (files whose name contains 'eng_Latn')
english_words = english_sentences = english_texts = 0

# Walk every file in the dev and devtest directories; each non-blank line of a
# file is treated as one text.
for directory in [dev_dir, devtest_dir]:
    for filename in tqdm(os.listdir(directory), desc=f"Processing files in {directory}"):
        file_path = os.path.join(directory, filename)
        # os.listdir also returns sub-directories; opening one would raise.
        if not os.path.isfile(file_path):
            continue

        # Read the whole file up front so the handle is closed before the
        # (slow) tokenization starts. Blank lines are dropped: they contain no
        # text and would otherwise deflate the per-text averages.
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = [line for line in f if line.strip()]

        # Accumulate per file first, then fold into the global counters once.
        file_words = file_sentences = 0
        for line in tqdm(lines, desc=f"Processing lines in {filename}", leave=False):
            num_words, num_sentences, _ = analyze_text(line)
            file_words += num_words
            file_sentences += num_sentences
        file_texts = len(lines)

        total_words += file_words
        total_sentences += file_sentences
        total_texts += file_texts

        # The language tag is a property of the file name, so it is checked
        # once per file rather than once per line.
        if 'mri_Latn' in filename:
            maori_words += file_words
            maori_sentences += file_sentences
            maori_texts += file_texts

        if 'eng_Latn' in filename:
            english_words += file_words
            english_sentences += file_sentences
            english_texts += file_texts


Processing files in flores200_dataset/dev: 100%|██████████| 204/204 [00:20<00:00, 10.02it/s]
Processing files in flores200_dataset/devtest: 100%|██████████| 204/204 [00:21<00:00,  9.40it/s]


In [4]:

def _per_text(count, texts):
    """Return ``count / texts``, or ``0`` when ``texts`` is not positive."""
    if texts > 0:
        return count / texts
    return 0


# Whole-dataset averages
avg_words_per_text = _per_text(total_words, total_texts)
avg_sentences_per_text = _per_text(total_sentences, total_texts)

# Maori averages
avg_maori_words_per_text = _per_text(maori_words, maori_texts)
avg_maori_sentences_per_text = _per_text(maori_sentences, maori_texts)

# English averages
avg_english_words_per_text = _per_text(english_words, english_texts)
avg_english_sentences_per_text = _per_text(english_sentences, english_texts)

# Report every figure rounded to two decimal places.
print(f"Average words per text (entire dataset): {avg_words_per_text:.2f}")
print(f"Average sentences per text (entire dataset): {avg_sentences_per_text:.2f}")
print(f"Average words per text (Maori): {avg_maori_words_per_text:.2f}")
print(f"Average sentences per text (Maori): {avg_maori_sentences_per_text:.2f}")
print(f"Average words per text (English): {avg_english_words_per_text:.2f}")
print(f"Average sentences per text (English): {avg_english_sentences_per_text:.2f}")

Average words per text (entire dataset): 23.18
Average sentences per text (entire dataset): 1.11
Average words per text (Maori): 32.50
Average sentences per text (Maori): 1.12
Average words per text (English): 24.10
Average sentences per text (English): 1.12
