In [32]:
import pandas as pd
from src.corpus.text_preprocessing import cz_stemming, en_stemming
from src.utils import series_to_arr

In [33]:
def preprocessing_pipeline(word, lang):
    """Stem a single word and tag the stem with its language code.

    Args:
        word: The word to stem.
        lang: Language code. ``'cz'`` selects ``cz_stemming``; every other
            value falls through to ``en_stemming``.

    Returns:
        The string ``'<lang>_<stem>'``.
    """
    # Pick the stemmer first; only the selected one is looked up and called.
    stemmer = cz_stemming if lang == 'cz' else en_stemming
    # The stemmers are called with a one-element list; the first item of the
    # result is the stem for `word`.
    stemmed_word = stemmer([word])[0]
    return f'{lang}_{stemmed_word}'

In [34]:
def get_relevant_words(lang, count_threshold=5, data_dir='data'):
    """Return the tokens of one language that occur often enough to be kept.

    Reads ``{data_dir}/{lang}_tokens.csv``, converts the frame with
    ``series_to_arr``, flattens the result with ``explode`` and counts how
    often each token occurs.

    Args:
        lang: Language code used to build the file name (e.g. ``'en'``, ``'cz'``).
        count_threshold: A token is kept only if it occurs strictly more than
            this many times. Defaults to 5.
        data_dir: Directory that contains the ``*_tokens.csv`` files.
            Defaults to ``'data'``.

    Returns:
        A list of the kept tokens, in ``value_counts`` order (most frequent first).
    """
    tokens = pd.read_csv(f'{data_dir}/{lang}_tokens.csv')
    # assumes series_to_arr yields one list-like of tokens per row, which
    # explode() then flattens to one token per row — TODO confirm
    counts = series_to_arr(tokens).explode().value_counts()
    return counts[counts > count_threshold].index.tolist()

In [35]:
# Spot-check the pipeline on a single Czech dictionary word.
preprocessing_pipeline('dodané', 'cz')

'cz_dodan'

In [36]:
# Load the en-cz dictionary file: tab-separated, no header row, columns named
# 'en' and 'cz' on read.
path = './MUSE/data/crosslingual/dictionaries/en-cz.0-5000.txt'
df = pd.read_csv(path, sep='\t', header=None, names=['en', 'cz'])
# Display the raw word pairs before any preprocessing.
df

Unnamed: 0,en,cz
0,was,bylo
1,was,byl
2,was,byla
3,for,pro
4,for,za
...,...,...
11103,supplied,dodané
11104,supplied,dodáván
11105,optional,nepovinné
11106,optional,volitelné


In [37]:
def handle_translations(path, out_path=None):
    """Preprocess a translation dictionary and write the filtered pairs to disk.

    The tab-separated, header-less file at ``path`` is read into ``en`` / ``cz``
    columns, every word is passed through ``preprocessing_pipeline`` for its
    language, and only the pairs whose English and Czech entries both appear
    in ``get_relevant_words`` are written out.

    Args:
        path: Dictionary file to read.
        out_path: Where to write the filtered pairs. ``None`` (the default)
            writes back to ``path``, replacing the input file.

    Returns:
        The full preprocessed DataFrame (all pairs, not only the filtered ones
        that were written to disk).
    """
    # Read every cell as a plain string: with the default NA handling pandas
    # turns dictionary words such as "null", "nan" or "NA" into float NaN.
    df = pd.read_csv(
        path, sep='\t', header=None, names=['en', 'cz'],
        dtype=str, keep_default_na=False,
    )
    relevant_en_words = get_relevant_words('en')
    relevant_cz_words = get_relevant_words('cz')

    # Because the default output overwrites the input, a second run would see
    # rows that are already stemmed and prefixed. Re-processing them yields
    # 'en_en_...' / 'cz_cz_...' entries that match nothing and the file would
    # be rewritten empty, so such a file is passed through unchanged.
    already_processed = (
        not df.empty
        and df['en'].str.startswith('en_').all()
        and df['cz'].str.startswith('cz_').all()
    )
    if not already_processed:
        # Series.map works on one column directly and is well-defined for an
        # empty column, unlike a row-wise DataFrame.apply.
        df['en'] = df['en'].map(lambda word: preprocessing_pipeline(word, 'en'))
        df['cz'] = df['cz'].map(lambda word: preprocessing_pipeline(word, 'cz'))

    relevant = df[df['en'].isin(relevant_en_words) & df['cz'].isin(relevant_cz_words)]
    destination = path if out_path is None else out_path
    # index=False writes just the two columns, tab-separated, with no header.
    relevant.to_csv(destination, index=False, header=False, sep='\t')
    return df

In [38]:
# Run the preprocessing + filtering over both dictionary files.
# handle_translations writes its result back to the file it was given, so this
# cell replaces both files on disk.
paths = ['./MUSE/data/crosslingual/dictionaries/en-cz.0-5000.txt', './MUSE/data/crosslingual/dictionaries/en-cz.5000-6500.txt']
for path in paths:
    handle_translations(path)
# After the loop, `path` stays bound to the last entry of `paths`.

In [8]:
# Scratch: compute the frequency-filtered word lists at top level (the same two
# calls handle_translations makes internally) so they can be inspected.
relevant_en_words = get_relevant_words('en')
relevant_cz_words = get_relevant_words('cz')

In [9]:
# Scratch: repeat the read + preprocessing steps of handle_translations at top level.
# NOTE(review): `path` is not assigned in this cell; it relies on a value left
# behind by an earlier cell — confirm which file is being read.
# NOTE(review): the displayed result shows doubled prefixes ('en_en_...',
# 'cz_cz_...'), which suggests the file had already been preprocessed and
# rewritten when this cell ran — verify.
df = pd.read_csv(path, sep='\t', header=None, names=['en', 'cz'])
df['en'] = df.apply(lambda x: preprocessing_pipeline(x['en'], 'en'), axis=1)
df['cz'] = df.apply(lambda x: preprocessing_pipeline(x['cz'], 'cz'), axis=1)

In [10]:
# Display the frame after the preprocessing step.
df

Unnamed: 0,en,cz
0,en_en_sprint,cz_cz_sprint
1,en_en_exil,cz_cz_exil
2,en_en_surpri,cz_cz_překvapen
3,en_en_surpri,cz_cz_překvapen
4,en_en_achiev,cz_cz_úspěch
...,...,...
2075,en_en_climb,cz_cz_lezen
2076,en_en_barrel,cz_cz_barel
2077,en_en_barrel,cz_cz_sud
2078,en_en_barrel,cz_cz_hlaveň


In [98]:
# Keep only the pairs whose English AND Czech entries both appear in the
# relevant-word lists.
relevant = df[df['en'].isin(relevant_en_words) & df['cz'].isin(relevant_cz_words)]

In [31]:
# Inspect the English relevant-word list; the returned entries already carry
# the 'en_' prefix (see the output).
get_relevant_words('en')

['en_de',
 'en_la',
 'en_commiss',
 'en_le',
 'en_european',
 'en_di',
 'en_en',
 'en_eu',
 'en_state',
 'en_member',
 'en_articl',
 'en_regul',
 'en_e',
 'en_die',
 'en_que',
 'en_inform',
 'en_subject',
 'en_der',
 'en_union',
 'en_decis',
 'en_del',
 'en_council',
 'en_provid',
 'en_à',
 'en_un',
 'en_et',
 'en_market',
 'en_public',
 'en_measur',
 'en_product',
 'en_van',
 'en_includ',
 'en_use',
 'en_el',
 'en_il',
 'en_direct',
 'en_may',
 'en_lo',
 'en_author',
 'en_also',
 'en_concern',
 'en_follow',
 'en_applic',
 'en_se',
 'en_per',
 'en_nation',
 'en_case',
 'en_servic',
 'en_propos',
 'en_countri',
 'en_da',
 'en_programm',
 'en_within',
 'en_support',
 'en_parti',
 'en_regard',
 'en_implement',
 'en_accord',
 'en_develop',
 'en_che',
 'en_aid',
 'en_fund',
 'en_activ',
 'en_would',
 'en_requir',
 'en_right',
 'en_al',
 'en_new',
 'en_du',
 'en_und',
 'en_ec',
 'en_area',
 'en_intern',
 'en_agreement',
 'en_oper',
 'en_law',
 'en_eur',
 'en_financi',
 'en_system',
 'en_rela