In [1]:
from collections import Counter
import os
import nltk
import pandas as pd

# --- Run configuration ---------------------------------------------------
# Size hint for each chunk pulled from an input file while streaming it.
BUF_SIZE = 100_000
# Upper bound on the number of words kept in the flat word lists; the value
# is also embedded in the output file names.
word_limit = 1_000_000

# Where the input text files are read from.
input_dir = '../data/processed'
# Where the frequency lists are written; created up front when missing.
output_dir = '../data/processed/word_freqs'
os.makedirs(output_dir, exist_ok=True)

# Token counts accumulated across every processed file.
all_word_counter = Counter()

def save_freqs_flat(counter, output_path, limit=None):
    """Write the most frequent words of *counter* to a text file, one per line.

    Words are listed from most to least frequent; the counts themselves are
    not written. No trailing newline is emitted after the last word.

    Args:
        counter: A ``collections.Counter`` mapping word -> occurrence count.
        output_path: Path of the text file to create or overwrite.
        limit: Maximum number of words to write. When ``None`` (the
            default), the module-level ``word_limit`` is used.
    """
    if limit is None:
        limit = word_limit

    # most_common() already yields entries by descending count (ties keep
    # first-seen order), so no additional sort is needed.
    top_words = [word for word, _ in counter.most_common(limit)]

    # Explicit UTF-8 so non-ASCII words are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(top_words))

def save_with_freq(counter, output_path, freq_limit=0):
    """Write words and their counts to a CSV file, most frequent first.

    The CSV has the columns ``word`` and ``freq`` and no index column. It is
    written next to *output_path*: a trailing ``.txt`` extension is swapped
    for ``.csv``; any other path is used unchanged.

    Args:
        counter: A ``collections.Counter`` mapping word -> occurrence count.
        output_path: Target path, normally ending in ``.txt``.
        freq_limit: Only words whose count is strictly greater than this
            value are written.
    """
    rows = [
        {'word': word, 'freq': freq}
        for word, freq in counter.items()
        if freq > freq_limit
    ]

    # Naming the columns explicitly keeps the frame sortable, and the CSV
    # header intact, when no word passes the filter.
    df = pd.DataFrame(rows, columns=['word', 'freq'])
    df = df.sort_values('freq', ascending=False)

    # Swap only a trailing '.txt'; str.replace would also rewrite '.txt'
    # occurring earlier in the path (e.g. in a directory name).
    if output_path.endswith('.txt'):
        csv_path = output_path[:-len('.txt')] + '.csv'
    else:
        csv_path = output_path

    df.to_csv(csv_path, index=False)


# Tokenize each corpus file, tally its words separately and into the shared
# counter, and save both a flat word list and a word/count CSV per file.
for input_file_name in [
    'oshhamaho.txt',
    'apkbr_ru.txt',
    'elgkbr_ru.txt',
]:
    input_file_path = os.path.join(input_dir, input_file_name)
    file_word_counter = Counter()
    # NOTE(review): assumes the corpus files are UTF-8 encoded — confirm.
    with open(input_file_path, encoding='utf-8') as f:
        # readlines(hint) returns whole lines only, so a chunk never ends in
        # the middle of a word. Fixed-size read() calls could cut a word in
        # two and count each half as a separate token.
        for lines in iter(lambda: f.readlines(BUF_SIZE), []):
            words = nltk.word_tokenize(''.join(lines))

            all_word_counter.update(words)
            file_word_counter.update(words)

    freq_out_path = os.path.join(output_dir, f'freq_{word_limit}_{input_file_name}')
    save_freqs_flat(file_word_counter, freq_out_path)
    save_with_freq(file_word_counter, freq_out_path)

# Save the same two outputs for the counts combined over all files.
all_freq_out_path = os.path.join(output_dir, f'freq_{word_limit}_all.txt')
save_freqs_flat(all_word_counter, all_freq_out_path)
save_with_freq(all_word_counter, all_freq_out_path)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/panagoa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
