In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
def _load_sentences(path):
    """Read a tab-separated sentence file into a one-column DataFrame.

    Every line is stripped and split at tab characters; only the second
    field is kept and stored in the ``sentence`` column.

    Lines that have no second field (for example blank lines) are skipped
    instead of aborting the whole load with an IndexError.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded sentence file.

    Returns
    -------
    pandas.DataFrame
        Frame with a single ``sentence`` column, one row per kept line.
    """
    sentences = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate over the file object directly so the raw lines are never
        # all held in memory at once (unlike f.readlines()).
        for line in f:
            fields = line.strip().split("\t")
            if len(fields) > 1:
                sentences.append(fields[1])
    return pd.DataFrame(sentences, columns=["sentence"])


slk_df = _load_sentences("../data/slk_newscrawl_2016_1M/slk_newscrawl_2016_1M-sentences.txt")
tur_df = _load_sentences("../data/tur_news_2024_1M/tur_news_2024_1M-sentences.txt")

display(slk_df.head())
display(tur_df.head())

In [None]:
# define filtering functions
def punctuation_number_ratio(sentence):
    """Return the fraction of characters that are punctuation or digits.

    A character counts as punctuation when it is neither alphanumeric nor
    whitespace, and as a number when ``str.isdigit`` is true for it. The sum
    of both counts is divided by the total number of characters.

    Returns 0 for an empty sentence.
    """
    length = len(sentence)
    if length == 0:
        return 0

    noisy = 0
    for ch in sentence:
        # The two checks are kept independent, mirroring two separate counts.
        if not (ch.isalnum() or ch.isspace()):
            noisy += 1
        if ch.isdigit():
            noisy += 1
    return noisy / length

def contains_url(sentence):
    """Return True if the sentence contains "http://", "https://" or "www."."""
    url_markers = ("http://", "https://", "www.")
    return any(marker in sentence for marker in url_markers)

def word_count(sentence):
    """Return the number of whitespace-separated words in the sentence."""
    words = sentence.split()
    return len(words)

In [None]:
# remove noisy sentences, i.e. drop every sentence that
# - has a punctuation and number ratio > 0.3
# - contains a URL
# - has fewer than 5 words

def filter_sentences(df, max_punct_num_ratio=0.3, min_words=5):
    """Drop noisy rows from a sentence DataFrame.

    A row is kept only when its ``sentence`` value satisfies all of:

    * ``punctuation_number_ratio(sentence) <= max_punct_num_ratio``
    * ``contains_url(sentence)`` is false
    * ``word_count(sentence) >= min_words``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sentence`` column.
    max_punct_num_ratio : float, default 0.3
        Largest punctuation-and-number ratio a sentence may have.
    min_words : int, default 5
        Smallest number of words a sentence may have.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.
    """
    sentences = df["sentence"]
    keep = (
        (sentences.apply(punctuation_number_ratio) <= max_punct_num_ratio)
        & (~sentences.apply(contains_url))
        & (sentences.apply(word_count) >= min_words)
    )
    # reset_index(drop=True) discards the original row labels so the result
    # is numbered consecutively.
    return df[keep].reset_index(drop=True)

In [None]:
# Run both corpora through the noise filter (Slovak first, then Turkish).
filtered_slk_df, filtered_tur_df = map(filter_sentences, (slk_df, tur_df))

# Report how many sentences survived the filtering.
print(f"Filtered size Slovak: {filtered_slk_df.shape[0]}")
print(f"Filtered size Turkish: {filtered_tur_df.shape[0]}")

# Preview the first rows of each filtered corpus.
display(filtered_slk_df.head(), filtered_tur_df.head())

In [None]:
# Write the filtered sentences to disk, one sentence per row, without header or index.

# Create the output directory first so the writes do not fail when it is missing.
Path("../data/preprocessed").mkdir(parents=True, exist_ok=True)

# Select the "sentence" column explicitly: the tokenization cell further down
# adds a "tokens" column to these same frames in place, so re-running this cell
# afterwards would otherwise write both columns into the sentence files.
# NOTE(review): to_csv uses minimal quoting by default, so a sentence containing
# a double quote is written wrapped in quotes with the inner quotes doubled --
# confirm that whatever reads these .txt files expects CSV-style quoting.
filtered_slk_df[["sentence"]].to_csv("../data/preprocessed/slk_filtered.txt", index=False, header=False, sep="\t", encoding="utf-8")
filtered_tur_df[["sentence"]].to_csv("../data/preprocessed/tur_filtered.txt", index=False, header=False, sep="\t", encoding="utf-8")

In [None]:
# Tokenize the filtered sentences with the pretrained BPE tokenizers and
# store the token lists next to the sentences.
from tokenizers import Tokenizer

# --- Slovak ---
tokenizer = Tokenizer.from_file("../tokenizers/slk_bpe_tokenizer.json")
# Adds the "tokens" column to filtered_slk_df in place.
filtered_slk_df["tokens"] = filtered_slk_df["sentence"].map(
    lambda sentence: tokenizer.encode(sentence).tokens
)
display(filtered_slk_df.head())
filtered_slk_df.to_csv(
    "../data/preprocessed/slk_tokenized.csv", index=False, encoding="utf-8"
)

# --- Turkish ---
# `tokenizer` is rebound here; from this point on it refers to the Turkish tokenizer.
tokenizer = Tokenizer.from_file("../tokenizers/tur_bpe_tokenizer.json")
# Adds the "tokens" column to filtered_tur_df in place.
filtered_tur_df["tokens"] = filtered_tur_df["sentence"].map(
    lambda sentence: tokenizer.encode(sentence).tokens
)
display(filtered_tur_df.head())
filtered_tur_df.to_csv(
    "../data/preprocessed/tur_tokenized.csv", index=False, encoding="utf-8"
)