In [1]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
import os
from gensim import corpora

In [2]:
# Load the language codes to process, one per line of languages.txt.
# Surrounding whitespace is stripped and blank lines are skipped so a stray
# empty line cannot turn into an empty language code.
with open('languages.txt', 'r', encoding='utf-8') as file:
    languages = [line.strip() for line in file if line.strip()]

## CORPORA

#### READ COMMON ARTICLES IN ALL LANGUAGES

In [3]:
def read_text(datatype, folder, filename, encoding='utf-8'):
    """Return the full contents of ``datatype/folder/filename`` as a string.

    Parameters
    ----------
    datatype, folder, filename : str
        Path components joined with ``os.path.join`` to locate the file.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so that
        accented characters are decoded the same way on every machine,
        instead of depending on the platform's locale encoding.

    Returns
    -------
    str
        The decoded file contents.
    """
    with open(os.path.join(datatype, folder, filename), 'r', encoding=encoding) as file:
        return file.read()

In [4]:
# File names available for each language under page_text/<language>.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the document order reproducible from run to run.
files = {lan: sorted(os.listdir(os.path.join('page_text', lan))) for lan in languages}

In [5]:
# plain texts: {language: {article name: lower-cased article text}}
# The article name is the file name without its extension; splitext is used
# instead of slicing off a fixed four characters so the key stays correct
# whatever the extension length is.
texts = {lan: {os.path.splitext(f)[0]: read_text('page_text', lan, f).lower()
               for f in files[lan]}
         for lan in languages}

#### CREATE TOKENS

In [6]:
# Split every article into word tokens. The \w+ pattern keeps only runs of
# word characters, so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')
texts_split = {
    lan: {name: tokenizer.tokenize(article) for name, article in texts[lan].items()}
    for lan in languages
}

#### JOIN TEXT BY LANGUAGE

In [7]:
# Concatenate the token lists of all articles into one flat list per language.
# A nested comprehension is linear in the number of tokens, whereas
# sum(lists, []) re-copies the growing result for every article.
text_bylang = {lan: [token for tokens in texts_split[lan].values() for token in tokens]
               for lan in languages}

In [8]:
def get_wordcount_by_lang(text_dict, unique=False):
    """Count tokens per language.

    Parameters
    ----------
    text_dict : dict
        Maps a language code to its list of tokens.
    unique : bool, optional
        If True, count distinct tokens (vocabulary size) instead of all
        tokens. Defaults to False.

    Returns
    -------
    dict
        Maps each language code in ``text_dict`` to its token count.
    """
    if unique:
        # The number of distinct tokens is simply the size of the token set;
        # building a gensim Dictionary just to take its length is unnecessary.
        return {lan: len(set(tokens)) for lan, tokens in text_dict.items()}
    return {lan: len(tokens) for lan, tokens in text_dict.items()}

In [9]:
# Total number of tokens per language in the tokenized corpus (no further preprocessing).
get_wordcount_by_lang(text_bylang)

{'en': 2436938, 'de': 1349958, 'hu': 673618, 'ro': 452312}

In [10]:
# Number of distinct tokens per language in the tokenized corpus (no further preprocessing).
get_wordcount_by_lang(text_bylang, True)

{'en': 80349, 'de': 118807, 'hu': 97963, 'ro': 50623}

# TABLE 2

Token and vocabulary counts with no preprocessing, stopword removal, stemming, and both.

## STOPWORD REMOVAL

In [11]:
# Full language names keyed by the two-letter codes used in this notebook;
# they select the NLTK stopword lists and Snowball stemmers below.
languages_long = {
    'en': 'english',
    'de': 'german',
    'hu': 'hungarian',
    'ro': 'romanian',
}

In [12]:
# One stopword set per language, taken from NLTK's stopword corpus.
# Sets are used so the membership tests during filtering are fast.
stopwords_bylang = {
    code: set(stopwords.words(languages_long[code]))
    for code in languages
}

In [13]:
# Per-language token lists with that language's stopwords filtered out.
text_bylang_stop = {
    code: [token for token in text_bylang[code] if token not in stopwords_bylang[code]]
    for code in languages
}

In [14]:
# Total number of tokens per language after stopword removal.
get_wordcount_by_lang(text_bylang_stop)

{'en': 1496282, 'de': 878311, 'hu': 500795, 'ro': 298643}

In [15]:
# Number of distinct tokens per language after stopword removal.
get_wordcount_by_lang(text_bylang_stop, True)

{'en': 80198, 'de': 118584, 'hu': 97775, 'ro': 50344}

## STEMMING

In [16]:
# One Snowball stemmer per language, selected by the language's full name.
stemmers = {
    code: SnowballStemmer(languages_long[code])
    for code in languages
}

In [17]:
# Stem every token of the full per-language corpus (stopwords still included).
text_bylang_stemmed = {
    code: [stemmers[code].stem(token) for token in text_bylang[code]]
    for code in languages
}

In [18]:
# Stem every token of the stopword-filtered per-language corpus.
text_bylang_stop_stemmed = {
    code: [stemmers[code].stem(token) for token in text_bylang_stop[code]]
    for code in languages
}

In [19]:
# Number of distinct stems per language when stemming without stopword removal.
get_wordcount_by_lang(text_bylang_stemmed, True)

{'en': 61948, 'de': 90061, 'hu': 60152, 'ro': 32815}

In [20]:
# Number of distinct stems per language after stopword removal followed by stemming.
get_wordcount_by_lang(text_bylang_stop_stemmed, True)

{'en': 61848, 'de': 89993, 'hu': 60116, 'ro': 32647}

# TABLE 3

Sparse term removal (1, 5, 10, 20%)

## CREATE STEMMED, STOPWORD REMOVED CORPUS BY DOC

In [None]:
# Per-article token lists with stopwords removed and the remaining words
# stemmed: {language: {article key: [stem, ...]}}.
texts_bylang_byhuman = {
    lan: {
        name: [stemmers[lan].stem(token)
               for token in tokens
               if token not in stopwords_bylang[lan]]
        for name, tokens in texts_split[lan].items()
    }
    for lan in languages
}

In [None]:
# One gensim Dictionary per language, built from that language's per-article
# stemmed, stopword-free token lists.
dictionary_bylang_byhuman = {
    lan: corpora.Dictionary(texts_bylang_byhuman[lan].values())
    for lan in languages
}