# OCR dictionaries for federal sources

In [1]:
# Imports

import collections
import glob
import os
import re

In [2]:
# Constants and Initialization

# Number of most frequent corpus tokens kept per language before filtering.
WORD_COUNT = 50000

# Root directories that are searched recursively for the *.txt corpus files.
DE_PATH, FR_PATH, IT_PATH = 'AS-1948-1998/', 'RO-1948-1998/', 'RU-1948-1998/'


def _read_stripped_lines(path, encoding='utf-8'):
    """Return every line of the text file at `path`, stripped of surrounding whitespace.

    The file is opened in a `with` block so the handle is closed right away,
    and with an explicit encoding so the result does not depend on the
    platform's locale.
    NOTE(review): the word lists are assumed to be UTF-8 encoded - confirm.
    """
    with open(path, 'r', encoding=encoding) as handle:
        return [line.strip() for line in handle]


def _load_stop_words(path):
    """Load a stop word file (one entry per line) into a set."""
    return set(_read_stripped_lines(path))


def _load_full_dictionary(path):
    """Load a general dictionary file into a set of entries.

    Every ' <digits>' run is removed from each line before it is stored
    (presumably the files hold 'word count' pairs - verify against the data).
    """
    return set(re.sub(r' [0-9]+', '', line) for line in _read_stripped_lines(path))


# Stop words per language; tokens are compared against these in lower case.
DE_STOP_WORDS = _load_stop_words('dictionaries/stopwords-de.txt')
FR_STOP_WORDS = _load_stop_words('dictionaries/stopwords-fr.txt')
IT_STOP_WORDS = _load_stop_words('dictionaries/stopwords-it.txt')

# General-purpose dictionaries per language.
DE_FULL = _load_full_dictionary('dictionaries/de_full.txt')
FR_FULL = _load_full_dictionary('dictionaries/fr_full.txt')
IT_FULL = _load_full_dictionary('dictionaries/it_full.txt')

In [3]:
# List all txt file paths

def get_all_files_in_path(path):
    """Return the paths of all ``*.txt`` files below `path`, searched recursively.

    Args:
        path: Directory to search. A trailing path separator is optional.

    Returns:
        A sorted list of matching file paths. `glob` yields results in
        arbitrary order, so sorting makes repeated runs process the files in
        the same order.
    """
    # Joining the components keeps '**' a path component of its own even when
    # `path` has no trailing separator, which the recursive match requires.
    pattern = os.path.join(path, '**', '*.txt')
    return sorted(glob.glob(pattern, recursive=True))

# Collect the text file paths of each corpus, one assignment per language.
de_files = get_all_files_in_path(DE_PATH)
fr_files = get_all_files_in_path(FR_PATH)
it_files = get_all_files_in_path(IT_PATH)

In [4]:
# Concatenate all contents

def get_file_content(file_path, encoding='utf-8'):
    """Read a text file and return its whole content as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default locale.
            NOTE(review): the corpus files are assumed to be UTF-8 - confirm.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in `encoding`.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()
    
# Join the files with a newline rather than an empty string: when a file does
# not end in a non-word character, its last word would otherwise fuse with the
# first word of the next file and be counted as one bogus token.
de_txt = '\n'.join(get_file_content(file) for file in de_files)
fr_txt = '\n'.join(get_file_content(file) for file in fr_files)
it_txt = '\n'.join(get_file_content(file) for file in it_files)

In [5]:
# Tokenize contents

def _tokenize(text, stop_words, min_length=4):
    """Split `text` into the word tokens that are kept for counting.

    The text is split on runs of non-word characters. A token is kept when it
    is purely alphabetic, has at least `min_length` characters and its
    lower-cased form is not in `stop_words`. The original casing of kept
    tokens is preserved.

    Args:
        text: The text to tokenize.
        stop_words: Container of lower-case words to discard.
        min_length: Minimum number of characters a token must have.

    Returns:
        A list of the kept tokens in order of appearance, duplicates included.
    """
    return [
        word
        for word in re.split(r'\W+', text)
        if word.isalpha() and len(word) >= min_length and word.lower() not in stop_words
    ]


de_tokens = _tokenize(de_txt, DE_STOP_WORDS)
fr_tokens = _tokenize(fr_txt, FR_STOP_WORDS)
it_tokens = _tokenize(it_txt, IT_STOP_WORDS)

In [6]:
# Extract most common words

# Keep only the words (not their counts) of the WORD_COUNT most frequent tokens.
de_most_common = {
    word for word, _count in collections.Counter(de_tokens).most_common(WORD_COUNT)
}
fr_most_common = {
    word for word, _count in collections.Counter(fr_tokens).most_common(WORD_COUNT)
}
it_most_common = {
    word for word, _count in collections.Counter(it_tokens).most_common(WORD_COUNT)
}

In [7]:
# Subtract generally common words

# Keep the frequent corpus words whose lower-cased form is missing from the
# general dictionary. The result is sorted because the iteration order of a
# set of strings changes between interpreter runs (hash randomization), which
# would otherwise make the lists differ on every run.
de_diff = sorted(word for word in de_most_common if word.lower() not in DE_FULL)
fr_diff = sorted(word for word in fr_most_common if word.lower() not in FR_FULL)
it_diff = sorted(word for word in it_most_common if word.lower() not in IT_FULL)

In [8]:
# Save dict

def _write_word_list(path, words, encoding='utf-8'):
    """Write `words` to `path`, one word per line, without a trailing newline.

    The encoding is passed explicitly so that words containing non-ASCII
    characters are written the same way on every platform instead of relying
    on the locale's default encoding.
    """
    with open(path, 'w', encoding=encoding) as f:
        f.write('\n'.join(words))


_write_word_list('de_federal_dic.txt', de_diff)
_write_word_list('fr_federal_dic.txt', fr_diff)
_write_word_list('it_federal_dic.txt', it_diff)