File Preprocessing

In [1]:
import os
import codecs
import spacy
import numpy
import gensim
from spacy import displacy
import pandas as pd
import itertools as it

In [2]:
# Pick the language of the corpus you wish to examine.
# The model-loading cell that follows only handles 'fr' and 'de'.
language = 'de'

In [3]:
%%time
if language == 'fr':
    print('loading french model')
    nlp = spacy.load("fr_core_news_md")
elif language == 'de':
    print('loading german model')
    nlp = spacy.load("de_core_news_md")   


loading german model
CPU times: user 1min 2s, sys: 3.36 s, total: 1min 5s
Wall time: 1min 5s


We open the preprocessed corpus file for the selected language (`de` in this run).

In [167]:
# The corpus file is named after the language code and lives in `directory`.
directory = "../corpus_samples/OCRd/ALL_DE/processed/"
filename = '{}{}.txt'.format(directory, language)

In [168]:
# Read the whole corpus file once.
with codecs.open(filename, encoding='utf_8') as f:
    lines = f.readlines()

# Every entry of `lines` still carries its own line terminator, so join on
# the empty string; joining on '\n' would double every line break.
full_text = ''.join(lines)

# Show only the beginning of the corpus: printing the complete text floods
# the notebook output and bloats the saved file.
preview_chars = 1000
print(full_text[:preview_chars])

Protokoll Verhandlung permanent Commission Europäische Gradmessung September WIEN Manuscript drucken Protokoll Verhandlung permanent Commission Europäische Gradmessung September WIEN Manuscript drucken Sitzung permanent Commission Verhandelt Wien September Anfang Sitzung Anwesend Mitglied permanent Commission Herr Fligely Bruhns Forsch Ibanez;. Commissare Herr Barozzi Ganahl Perrier Plantamour Saget Thöt Tinter Vecchi Präsident Herr Fligely Präsident eröffnen Sitzung sprechen Bedauern permanent Commission Mitglied anwesend wodurch beschlussfähig verlesen Schreiben Herrn Baeyer Heidelberg erkranken Schreiben Herrn Bauernfeind Bad ge- brauchen Schreiben Herrn Hirsch ebenfalls Krankheit reisen ausfahren Schreiben Herrn Bauernfeind Einladung Name Königl Bayerische Minister Dr. Lutz nächst Generaleonferenz München abhalten Präsident begrüsst anwesend Herr sprechen Freude Name französisch Regierung Herr Oberst Saget Capitain Perrier Yvon Villarceau gegenwärtig Versammlung herkommen theilt fe

In [169]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [170]:
# Directory that receives the intermediate files written by this notebook
# (sentence files, phrase models, dictionary, corpus, LDA model).
intermediate_directory = '../corpus_samples/lemmatized/' + language + '/'

# Create it up front: later cells open files inside it for writing, which
# fails on a fresh checkout where the directory does not exist yet.
os.makedirs(intermediate_directory, exist_ok=True)

def line_page(filename):
    """
    Lazily read the UTF-8 file `filename` one line at a time, treating
    each line as a page. Every escaped carriage return (a backslash
    followed by the letter r) is turned back into a real carriage-return
    character before the page is yielded.
    """
    escaped_cr, real_cr = '\\r', '\r'

    with codecs.open(filename, encoding='utf_8') as page_file:
        yield from (raw_page.replace(escaped_cr, real_cr)
                    for raw_page in page_file)

def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse pages,
    lemmatize the text, and yield sentences

    Pages come from line_page(filename) and are parsed in batches by the
    module-level `nlp` pipeline. Each yielded value is one sentence as a
    single string: the lemmas of its tokens joined by single spaces.
    """

    # `n_threads` is deliberately not passed to nlp.pipe: spaCy deprecated
    # the argument in v2.1 (where it has no effect) and v3 rejects it.
    for parsed_page in nlp.pipe(line_page(filename), batch_size=10000):
        for sent in parsed_page.sents:
            yield u' '.join(token.lemma_ for token in sent)


# Destination of the lemmatized corpus, one sentence per line.
unigram_sentences_filepath = os.path.join(
    intermediate_directory, 'unigram_sentences_all.txt')

# Stream the sentences to disk as the generator produces them.
with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as sentence_file:
    for lemmatized_sentence in lemmatized_sentence_corpus(filename):
        sentence_file.write(u'{}\n'.format(lemmatized_sentence))

  "__main__", mod_spec)


In [171]:
# Wrap the lemmatized sentence file in a gensim LineSentence iterable.
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [172]:
# Preview the first 240 lemmatized sentences, each followed by a blank line.
for tokens in it.islice(unigram_sentences, 240):
    print(u' '.join(tokens), end=u'\n\n')

Protokoll

Verhandlung permanent Commission

europäisch Gradmessung September

WIEN

Manuscript drucken Protokoll

Verhandlung permanent Commission

europäisch Gradmessung September

WIEN

Manuscript drucken Sitzung permanent

Commission

Verhandelt Wien

September Anfang Sitzung

Anwesend Mitglied permanent Commission Herr Fligely Bruhns Forsch

Ibanez;. Commissare

Herr Barozzi

Ganahl Perrier

Plantamour Saget Thöt Tinter Vecchi Präsident

Herr Fligely

Präsident eröffnen Sitzung sprechen Bedauern permanent Commission

Mitglied anwesend wodurch beschlussfähig verlesen Schreiben Herrn Baeyer Heidelberg erkranken Schreiben Herrn Bauernfeind Bad ge-

brauchen Schreiben Herrn Hirsch ebenfalls Krankheit reisen ausfahren Schreiben

Herrn Bauernfeind Einladung Name Königl

Bayerische Minister Dr. Lutz nächst Generaleonferenz

München abhalten

Präsident begrüsst anwesend

Herr sprechen Freude

Name französisch

Regierung Herr Oberst Saget Capitain Perrier Yvon Villarceau gegenwärtig Versam

In [173]:
# Path inside intermediate_directory for the saved bigram Phrases model.
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model')

In [174]:
# Fit a Phrases model on the unigram sentences and persist it right away.
Phrases(unigram_sentences).save(bigram_model_filepath)

# From here on, work with the copy read back from disk.
bigram_model = Phrases.load(bigram_model_filepath)

In [175]:
bigram_sentences_filepath = os.path.join(
    intermediate_directory, 'bigram_sentences_all.txt')

# Run every unigram sentence through the bigram model and write the
# result to disk, one sentence per line.
with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as bigram_file:
    for tokens in unigram_sentences:
        phrased_tokens = bigram_model[tokens]
        bigram_file.write(u' '.join(phrased_tokens) + u'\n')

In [176]:
# Wrap the bigram sentence file in a gensim LineSentence iterable.
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [177]:
# Preview the first 240 bigram-merged sentences, each followed by a blank line.
for tokens in it.islice(bigram_sentences, 240):
    print(u' '.join(tokens), end=u'\n\n')

Protokoll

Verhandlung_permanent Commission

europäisch_Gradmessung September

WIEN

Manuscript drucken Protokoll

Verhandlung_permanent Commission

europäisch_Gradmessung September

WIEN

Manuscript drucken Sitzung_permanent

Commission

Verhandelt_Wien

September_Anfang Sitzung

Anwesend Mitglied_permanent Commission Herr_Fligely Bruhns_Forsch

Ibanez;. Commissare

Herr Barozzi

Ganahl Perrier

Plantamour Saget Thöt Tinter Vecchi Präsident

Herr_Fligely

Präsident_eröffnen Sitzung sprechen Bedauern permanent_Commission

Mitglied anwesend wodurch beschlussfähig verlesen Schreiben_Herrn Baeyer Heidelberg erkranken Schreiben_Herrn Bauernfeind Bad ge-

brauchen Schreiben_Herrn Hirsch ebenfalls Krankheit reisen ausfahren Schreiben

Herrn Bauernfeind Einladung Name Königl

Bayerische Minister Dr. Lutz nächst Generaleonferenz

München abhalten

Präsident begrüsst anwesend

Herr sprechen Freude

Name französisch

Regierung Herr Oberst Saget Capitain Perrier Yvon Villarceau gegenwärtig Versam

In [178]:
trigram_model_filepath = os.path.join(
    intermediate_directory, 'trigram_model_all')

# Fit a second Phrases model, this time on the bigram-merged sentences,
# and persist it right away.
Phrases(bigram_sentences).save(trigram_model_filepath)

# From here on, work with the copy read back from disk.
trigram_model = Phrases.load(trigram_model_filepath)

trigram_sentences_filepath = os.path.join(
    intermediate_directory, 'trigram_sentences_all.txt')

# Run every bigram sentence through the trigram model and write the
# result to disk, one sentence per line.
with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as trigram_file:
    for tokens in bigram_sentences:
        phrased_tokens = trigram_model[tokens]
        trigram_file.write(u' '.join(phrased_tokens) + u'\n')

In [179]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

In [180]:
# Path inside intermediate_directory for the saved trigram Dictionary.
trigram_dictionary_filepath = os.path.join(intermediate_directory,
                                           'trigram_dict_all.dict')

In [192]:
# Hyperparameters handed to Dictionary.filter_extremes below.
no_below = 8
no_above = 0.4

trigram_pages = LineSentence(trigram_sentences_filepath)

# Build the dictionary from one pass over the trigram sentence file.
trigram_dictionary = Dictionary(trigram_pages)

# Drop tokens that are very rare or too common (filter_extremes),
# then reassign integer ids (compactify).
trigram_dictionary.filter_extremes(no_below=no_below, no_above=no_above)
trigram_dictionary.compactify()

# Persist the dictionary and continue with the copy read back from disk.
trigram_dictionary.save(trigram_dictionary_filepath)
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [193]:
# Path inside intermediate_directory for the serialized bag-of-words corpus.
trigram_bow_filepath = os.path.join(intermediate_directory,
                                    'trigram_bow_corpus_all.mm')

In [194]:
def trigram_bow_generator(filepath):
    """
    Lazily iterate over the token lists that LineSentence reads from
    `filepath` and yield, for each one, the bag-of-words produced by
    the module-level trigram_dictionary.
    """
    yield from (trigram_dictionary.doc2bow(tokens)
                for tokens in LineSentence(filepath))

In [195]:
# Stream the bag-of-words vectors straight into a Matrix Market file.
MmCorpus.serialize(
    fname=trigram_bow_filepath,
    corpus=trigram_bow_generator(trigram_sentences_filepath),
)

# Re-open the serialized corpus from disk for the following steps.
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

In [196]:
# Path inside intermediate_directory for the saved LDA model.
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')

In [197]:
%%time

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
        
    # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
    lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=30,
                           id2word=trigram_dictionary,
                           workers=3)
    
lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

CPU times: user 2.15 s, sys: 640 ms, total: 2.79 s
Wall time: 2.93 s


In [198]:
def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms

    topic_number: index of the topic in the module-level `lda` model.
    topn: how many of the topic's terms to print (default 25).

    Prints a header line, a blank line, then one row per term with the
    term left-aligned in 20 columns and its frequency to three decimals.
    Returns None.
    """

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    # Forward the caller's topn: it used to be hard-coded to 25 here, so
    # the parameter was silently ignored.
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [199]:
# Print the top terms of topic 8.
explore_topic(topic_number=8)

term                 frequency

punkten              0.031
Beobachtung          0.025
Punkt                0.022
Messung              0.016
Instrument           0.013
Längenbestimmungen   0.011
folgen               0.010
listen               0.008
machen               0.008
Stuttgart            0.008
reihen               0.008
Excellenz            0.008
karten               0.007
berichten            0.007
früh                 0.007
ein-                 0.007
Grad                 0.006
Fehler               0.006
Berechnung           0.006
Apparat              0.006
Brocken              0.006
Herr                 0.006
Berlin               0.006
fahren               0.006
Kilometer            0.006


In [200]:
# Path inside intermediate_directory named for the prepared pyLDAvis data.
# NOTE(review): no visible cell writes to or reads from this path --
# confirm whether a save/load step for LDAvis_prepared is missing.
LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')

In [201]:
%%time
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


CPU times: user 9.06 s, sys: 30 ms, total: 9.09 s
Wall time: 49.5 s


In [202]:
# Render the prepared topic visualization inline in the notebook.
pyLDAvis.display(LDAvis_prepared)