File Preprocessing

In [77]:
# Install the notebook's extra dependencies: upgrade numpy and add pyLDAvis
# (pyLDAvis is imported further down for the topic-model visualisation).
# NOTE(review): versions are unpinned and `!pip` runs whichever pip is on PATH;
# consider `%pip install` with pinned versions — confirm against the target environment.
!pip install -U numpy
!pip install pyldavis

Requirement already up-to-date: numpy in /opt/conda/lib/python3.7/site-packages (1.16.3)


In [1]:
import os
import codecs
import spacy
import numpy
import gensim
from spacy import displacy
import pandas as pd
import itertools as it

In [2]:
%%time
# Load the spaCy French pipeline "fr_core_news_md" once; the cells below reuse `nlp`.
print('loading french model')
nlp = spacy.load("fr_core_news_md") 


loading french model
CPU times: user 24.6 s, sys: 730 ms, total: 25.3 s
Wall time: 25.5 s


We open the preprocessed text file and read its whole content into a single string.

In [3]:
# Location of the preprocessed corpus (one UTF-8 text file).
directory = "../hardy/processed/"
filename = directory + 'hardy.txt'

# Read the whole corpus into memory as a single string.
with codecs.open(filename, encoding='utf_8') as f:
    text = f.read()

# Show only a short preview: echoing the full corpus floods the notebook output.
text[:500]



In [5]:
%%time
nlp.max_length = 1695853
doc = nlp(text)

CPU times: user 24.9 s, sys: 10.8 s, total: 35.8 s
Wall time: 35.9 s


In [6]:
# Additional French function words that should be treated as stop words.
my_stop_words = ['du', 'de', 'un']

# Flag each word's vocabulary entry as a stop word; its text is printed
# as a simple progress check.
for word in my_stop_words:
    vocab_entry = nlp.vocab[word]
    print(vocab_entry.text)
    vocab_entry.is_stop = True


du
de
un


In [7]:
# Keep only content-bearing tokens: drop whitespace, stop words, punctuation
# and number-like tokens (the whitespace test used to be written twice).
tokens = [
    token for token in doc
    if not (token.is_space or token.is_stop or token.is_punct or token.like_num)
]

# Preview the first tokens only; echoing the whole list floods the notebook.
tokens[:100]

[Loisirs,
 Journal,
 événemens,
 parviennent,
 connoissance,
 ms,
 fr,
 mercredi,
 juillet,
 auteur,
 feuille,
 intitulée,
 Séances,
 c.,
 contredit,
 député,
 Vendosme,
 jour,
 trouvoit,
 chose,
 feuille,
 Journal,
 Paris,
 numéro,
 page,
 suiv,
 article,
 Etats,
 généraux,
 lettre,
 adressée,
 Clergé,
 juin,
 Roi,
 Clergé,
 engager,
 réunir,
 ordres,
 lettre,
 adressée,
 auteur,
 feuille,
 intitulée,
 Séances,
 juin,
 dattée,
 mois,
 signée,
 Créniere,
 député,
 Vendosme,
 étoit,
 cru,
 obligé,
 contredire,
 auteur,
 avoit,
 assuré,
 député,
 avoit,
 proposé,
 constituer,
 assemblée,
 Tiers,
 Etat,
 assemblée,
 représentans,
 commettans,
 c.,
 c.,
 soldats,
 délivrés,
 prison,
 Palais,
 Royal,
 quêtoit,
 soldats,
 régiment,
 gardes,
 françoises,
 délivrés,
 prison,
 veille,
 violence,
 étoient,
 Palais,
 Royal,
 avoit,
 placés,
 appartement,
 1er,
 étage,
 galeries,
 côté,
 caffé,
 Foi,
 tendoit,
 mal,
 propos,
 croisée,
 panier,
 corde,
 jardin,
 y,
 recueuillir,
 libéralités,
 publ

In [8]:
# Per-token surface form, part-of-speech tag and named-entity label.
# (The bare mid-cell expressions were removed: only the last expression of a
# cell is displayed, so they had no effect.)
token_text = [token.orth_ for token in tokens]
token_pos = [token.pos_ for token in tokens]
token_ent_type = [token.ent_type_ for token in tokens]

# Tabulate each token's text against its entity label. The zip is materialised
# into a list because older pandas versions reject a bare iterator, and only
# the first rows are displayed.
pd.DataFrame(
    list(zip(token_text, token_ent_type)),
    columns=['text', 'ent type'],
).head(10)

Unnamed: 0,text,ent type
0,Loisirs,
1,Journal,MISC
2,événemens,MISC
3,parviennent,
4,connoissance,
5,ms,
6,fr,MISC
7,mercredi,
8,juillet,
9,auteur,


In [9]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [11]:
# Directory that receives the intermediate artefacts written by this notebook
# (sentence files, phrase models, dictionary, corpus, LDA model).
intermediate_directory = '../hardy/lemmatized/'
def discardable(token):
    """
    Tell whether a token should be left out of the lemmatized corpus.

    A token is dropped when it is punctuation, whitespace, a stop word
    or number-like.

    token: object exposing the attributes `is_punct`, `is_space`,
        `is_stop` and `like_num`.
    returns: a truthy value if the token should be skipped, falsy otherwise.
    """
    return token.is_punct or token.is_space or token.is_stop or token.like_num

def line_review(filename):
    """
    Lazily read a UTF-8 text file, one line at a time.

    Every yielded line has its escaped line breaks (a backslash followed
    by the letter n) turned back into real newline characters.
    """
    with codecs.open(filename, encoding='utf_8') as source:
        yield from (raw_line.replace('\\n', '\n') for raw_line in source)

def lemmatized_sentence_corpus(filename):
    """
    generator function that parses the file with spaCy line by line,
    lemmatizes the text, and yields one string per sentence

    filename: path handed to `line_review`, which supplies the lines to parse.
    yields: the lemmas of each sentence joined by single spaces, leaving out
        every token for which `discardable` is truthy.
    """
    # `n_threads` is no longer passed: spaCy deprecated it in v2.1 (where it
    # has no effect) and later versions of `Language.pipe` do not accept it.
    for parsed_line in nlp.pipe(line_review(filename), batch_size=10000):

        for sent in parsed_line.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not discardable(token)])


unigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'unigram_sentences_all.txt')

# Create the output directory if it is missing, so the write below does not
# fail on a fresh checkout.
os.makedirs(intermediate_directory, exist_ok=True)

# Write one lemmatized sentence per line.
with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
    for sentence in lemmatized_sentence_corpus(filename):
        f.write(sentence + '\n')

In [12]:
# Re-iterable view over the sentence file: each item is one line split into its words.
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [13]:
# Peek at the first 240 unigram sentences, each followed by an empty line.
for words in it.islice(unigram_sentences, 240):
    print(u' '.join(words))
    print(u'')

loisir Journal événemen parvenir connoissance m

fr

mercredi juillet

auteur feuille intitulée séance c. contredire député Vendosme

jour trouvoit chose feuille journal Paris numéro page suiv

article etat général lettre adresser Clergé juin roi clergé engager réunir ordre lettre adresser auteur feuille intitulée séance juin datter mois signée Créniere député Vendosme étoit croire obliger contredire auteur avoit assurer député avoit proposer constituer assemblée Tiers

etat assemblée représentan commettan

c. c.

soldat délivrer prison palais royal quêtoit

soldat régiment garde françoise délivré prison veille violence étoient palais royal avoit placer appartement premier étage galerie côté caffé Foi tendoit mal propos croisée panier corde jardin y recueuillir libéralité public jugeoit propos faire

/p

374/ faire

sollicitoit faveur auprès roi amnistie avoit entreprendre égard démarche auprès Assemblée national prudence vouloit mêler affaire

fermentation continuoit malheureusement j

In [14]:
# File in which the trained bigram Phrases model is stored.
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model')

In [15]:
# Train a Phrases model on the unigram sentences; it is used below to merge
# frequently co-occurring word pairs into single tokens.
bigram_model = Phrases(unigram_sentences)

# Persist the trained model so it can be reloaded without retraining.
bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

In [16]:
bigram_sentences_filepath = os.path.join(
    intermediate_directory, 'bigram_sentences_all.txt')

# Run every unigram sentence through the bigram model and store the result,
# one sentence per line.
with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as out_file:
    for unigram_tokens in unigram_sentences:
        merged_tokens = bigram_model[unigram_tokens]
        out_file.write(u' '.join(merged_tokens) + '\n')

In [17]:
# Re-iterable view over the bigram sentence file: each item is one line split into its tokens.
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [18]:
# Peek at the first 240 bigram sentences, each followed by an empty line.
for phrase_tokens in it.islice(bigram_sentences, 240):
    print(u' '.join(phrase_tokens))
    print(u'')

loisir Journal événemen parvenir connoissance m

fr

mercredi_juillet

auteur feuille intitulée séance c. contredire député Vendosme

jour_trouvoit chose feuille_journal Paris_numéro page_suiv

article_etat général lettre_adresser Clergé juin roi clergé engager réunir ordre lettre_adresser auteur feuille intitulée séance juin datter mois signée Créniere député Vendosme étoit croire obliger contredire auteur avoit assurer député avoit proposer constituer assemblée Tiers

etat assemblée représentan commettan

c._c.

soldat délivrer prison palais_royal quêtoit

soldat_régiment garde_françoise délivré prison veille violence étoient palais_royal avoit placer appartement premier étage galerie côté caffé Foi tendoit mal_propos croisée panier corde jardin y recueuillir libéralité public jugeoit propos faire

/p

374/ faire

sollicitoit faveur auprès roi amnistie avoit entreprendre égard démarche auprès Assemblée_national prudence vouloit mêler affaire

fermentation continuoit malheureusement j

In [19]:
# Second Phrases pass: train on the bigram sentences so that already-merged
# pairs can combine with a further neighbouring token.
trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all')
trigram_model = Phrases(bigram_sentences)
trigram_model.save(trigram_model_filepath)

# Reload the persisted model from disk.
trigram_model = Phrases.load(trigram_model_filepath)

trigram_sentences_filepath = os.path.join(
    intermediate_directory, 'trigram_sentences_all.txt')

# Apply the trigram model to every bigram sentence and store one sentence per line.
with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as out_file:
    for bigram_tokens in bigram_sentences:
        out_file.write(u' '.join(trigram_model[bigram_tokens]) + '\n')

In [20]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

In [21]:
# File in which the gensim Dictionary built from the trigram sentences is stored.
trigram_dictionary_filepath = os.path.join(intermediate_directory,
                                           'trigram_dict_all.dict')

In [33]:
# Vocabulary pruning thresholds handed to Dictionary.filter_extremes.
no_below = 20
no_above = 0.5

trigram_pages = LineSentence(trigram_sentences_filepath)

# Build the vocabulary from every line of the trigram file, drop tokens that
# are very rare or too common, then reassign compact integer ids.
trigram_dictionary = Dictionary(trigram_pages)
trigram_dictionary.filter_extremes(no_below=no_below, no_above=no_above)
trigram_dictionary.compactify()

# Persist the dictionary and read it back from disk.
trigram_dictionary.save(trigram_dictionary_filepath)
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [34]:
# File in which the serialized bag-of-words corpus is stored.
trigram_bow_filepath = os.path.join(intermediate_directory,
                                    'trigram_bow_corpus_all.mm')

In [44]:
def trigram_bow_generator(filepath):
    """
    Lazily convert each line of a tokenized text file into its
    bag-of-words representation, using the global `trigram_dictionary`.
    """
    yield from (trigram_dictionary.doc2bow(line_tokens)
                for line_tokens in LineSentence(filepath))

In [45]:
# Stream the bag-of-words vectors of every trigram sentence to disk.
MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_sentences_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

In [46]:
# File in which the trained LDA model is stored.
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')

In [47]:
%%time

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
        
    # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
    lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=20,
                           id2word=trigram_dictionary,
                           workers=3)
    
lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

CPU times: user 1.97 s, sys: 700 ms, total: 2.67 s
Wall time: 2.79 s


In [48]:
def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms

    topic_number: index of the topic in the global `lda` model.
    topn: how many of the topic's top terms to print (default 25).
    """

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    # Honour the caller's `topn`; it used to be ignored in favour of a fixed 25.
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [50]:
# Print the top terms of topic 10, using the default number of terms.
explore_topic(topic_number=10)

term                 frequency

avoit                0.023
étoit                0.018
y                    0.015
avoyer               0.012
moyen                0.008
ordre                0.007
public               0.007
etat_général         0.006
grand                0.006
assemblée            0.006
Mr                   0.006
fort                 0.005
jour                 0.005
membre               0.005
prendre              0.005
Assemblée_national   0.005
personne             0.004
faire                0.004
citoyen              0.004
venir                0.004
député               0.004
mettre               0.004
lettre               0.004
point                0.004
Tiers                0.004


In [51]:
# Path intended for the prepared pyLDAvis data.
# NOTE(review): no visible cell writes to or reads from this path — confirm
# whether a save/load step for the prepared data is missing.
LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')

In [52]:
# Build the pyLDAvis visualisation data from the LDA model, the bag-of-words
# corpus and the dictionary.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [53]:
# Render the prepared topic-model visualisation inline in the notebook.
pyLDAvis.display(LDAvis_prepared)