In [1]:
import string
import re
import pandas as pd
import nltk
from nltk.util import ngrams
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

In [3]:
# Read the wiki-tr parquet file and keep a reproducible (seeded) random
# sample of 25,000 rows for the rest of the notebook.
df = pd.read_parquet("../data/wiki-tr.parquet").sample(n=25000, random_state=1)

In [4]:
def clean_wikipedia_text(text):
    """Strip wiki/HTML markup, URLs, digits and punctuation from raw article text.

    Steps, in order:
      1. drop ``<ref>`` citations (self-closing and paired) including their content,
      2. drop any remaining HTML/XML tags,
      3. drop http(s) URLs, ``[12]``-style footnote markers and ``{{...}}`` templates,
      4. reduce ``[[target|label]]`` links to ``label`` and ``[[target]]`` to ``target``,
      5. drop digits and every character that is not a word character,
         whitespace or a period,
      6. collapse whitespace runs to a single space and trim the ends.

    Args:
        text: Raw article text as a ``str``.

    Returns:
        The cleaned text as a single-spaced, stripped ``str``.
    """
    # Citations must go BEFORE the generic tag strip: otherwise the tags are
    # removed first, the <ref> pattern never matches, and the citation text
    # leaks into the output.
    # Self-closing refs are removed separately so that a later </ref> cannot
    # pair up with them and swallow the text in between.
    text = re.sub(r'<ref\b[^>]*/>', '', text)
    text = re.sub(r'<ref\b[^>]*>.*?</ref>', '', text, flags=re.DOTALL)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'https?://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,6}(?:/[^\s]*)?', '', text)
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\{\{.*?\}\}', '', text, flags=re.DOTALL)
    # The link target may not contain brackets or a pipe, so the match cannot
    # start at one plain link and run into the pipe of a later link.
    text = re.sub(r'\[\[[^\[\]|]*\|', '', text)
    text = re.sub(r'\[\[|\]\]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s.]', '', text)
    # Normalise whitespace last so that gaps left by the removals above
    # do not survive as double spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [7]:
# Build a new frame whose "text" column holds the markup-free version of each
# row (the source frame `df` is left untouched), then persist it as parquet.
df_cleaned_text = df.assign(text=df["text"].apply(clean_wikipedia_text))
df_cleaned_text.to_parquet("../data/cleaned_data_25000.parquet")

In [11]:
# Rebind `df` to the frame stored in the cleaned-data parquet file; every cell
# below this point reads `df` as that cleaned frame.
df = pd.read_parquet("../data/cleaned_data_25000.parquet")

In [15]:
def clean_str(text):
    """Return `text` with everything except ASCII letters, the Turkish letters
    çğıöşü/ÇĞİÖŞÜ and whitespace removed."""
    disallowed = re.compile(r'[^a-zA-ZçğıöşüÇĞİÖŞÜ\s]')
    return disallowed.sub('', text)

def tokenizer(text, language="turkish"):
    """Split `text` into word tokens with NLTK's `word_tokenize`.

    Args:
        text: The string to tokenise.
        language: Name of the Punkt sentence model used for the sentence
            split that precedes word tokenisation. Defaults to "turkish"
            because this notebook processes Turkish text; NLTK's own default
            is "english".

    Returns:
        A list of token strings.
    """
    return word_tokenize(text, language=language)

# Turkish casing pairs that the locale-unaware str.lower() gets wrong:
# "I" must become dotless "ı" (not "i"), and "İ" must become plain "i"
# (str.lower() yields "i" followed by the combining dot U+0307).
_TURKISH_LOWER_MAP = str.maketrans({"I": "ı", "İ": "i"})


def convert_lower_case(tokens):
    """Lower-case every token using Turkish casing rules.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with each token lower-cased. "I"/"İ" are mapped to
        "ı"/"i" first, so the result contains no stray combining dots and
        matches lower-case Turkish word lists.
    """
    return [token.translate(_TURKISH_LOWER_MAP).lower() for token in tokens]

def clear_stop_words(tokens, stop_words_path="../lib/turkce-stop-words.txt"):
    """Remove stop words from a token sequence.

    Args:
        tokens: Iterable of token strings.
        stop_words_path: UTF-8 text file with one stop word per line.
            Defaults to the stop-word list used by this notebook.

    Returns:
        A list with the tokens that are not in the stop-word file, in their
        original order.
    """
    with open(stop_words_path, "r", encoding="utf-8") as f:
        # A set gives O(1) membership tests; with a list every token would be
        # compared against the whole stop-word file.
        stopwords = {line.strip() for line in f.read().splitlines()}
    # Blank lines in the file are not stop words.
    stopwords.discard("")
    return [token for token in tokens if token not in stopwords]

def create_ngrams_file(tokens, out_path, n=2):
    """Count the n-grams of `tokens` and write them to a tab-separated file.

    Each output line has the form ``<token_1> ... <token_n>\\t<count>``. Lines
    appear in the order in which each distinct n-gram was first seen.

    Args:
        tokens: Sequence of token strings.
        out_path: Destination file path; the file is overwritten (UTF-8).
        n: Size of the n-grams (1 = unigrams, 2 = bigrams, ...).
    """
    # Counter consumes the n-gram generator directly, so the complete n-gram
    # list is never held in memory next to the counts.
    ngram_freq = Counter(ngrams(tokens, n))
    with open(out_path, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{' '.join(ngram)}\t{freq}\n" for ngram, freq in ngram_freq.items()
        )

def write_cleaned(text):
    """Run the full token-cleaning pipeline on one string.

    The stages are chained, each consuming the previous stage's output:
    `clean_str` -> `tokenizer` -> `convert_lower_case` -> `clear_stop_words`.

    Args:
        text: Raw input string.

    Returns:
        The list of lower-cased, stop-word-free tokens. Despite the function
        name, nothing is written to disk here.
    """
    cleaned = clean_str(text)
    tokens = tokenizer(cleaned)
    tokens = convert_lower_case(tokens)
    return clear_stop_words(tokens)

In [16]:
# Join the documents with a space: with an empty separator the last word of
# each document is glued to the first word of the next one, which creates
# bogus tokens and bogus n-grams.
all_text = ' '.join(df['text'])
tokens = tokenizer(all_text)
tokens = convert_lower_case(tokens)
tokens = clear_stop_words(tokens)

In [17]:
# Single-token (n=1) frequencies.
create_ngrams_file(tokens, out_path="../lib/unigrams.txt", n=1)

In [18]:
# Two-token (n=2) frequencies.
create_ngrams_file(tokens, out_path="../lib/bigrams.txt", n=2)

In [19]:
# Three-token (n=3) frequencies.
create_ngrams_file(tokens, out_path="../lib/trigrams.txt", n=3)