In [None]:
import spacy
import pandas as pd
import os

In [None]:
from lxml import etree
from datetime import datetime
from article_selection import article_selection
import json

## Helper functions

These functions handle the data: they extract the wanted articles from the
'.xml' files and store them in an array of strings.

In [None]:
# Takes a start and an end date and returns the list of all the months in between, formatted as 'yyyy/mm'.
def month_dates(start, end):
    """
    List every month from `start` to `end` (both included) as 'yyyy/mm'.

    Only the year and month of the two dates are used. An empty list is
    returned when `end` falls in a month before `start`.
    """
    # Number the months continuously (year * 12 + zero-based month) so the
    # range can cross year boundaries.
    first = start.year * 12 + start.month - 1
    last = end.year * 12 + end.month - 1

    return ['{}/{:02d}'.format(index // 12, index % 12 + 1)
            for index in range(first, last + 1)]

In [None]:
# Takes an article element and returns its issue date as a datetime.
def get_date(article):
    """
    Return the issue date of an article as a datetime object.

    The date text is read from the first 'entity' child of the article,
    under meta/issue_date, and must be formatted as dd/mm/yyyy.
    """
    first_entity = article.find('entity')
    issue_date = first_entity.find('meta').find('issue_date')
    return datetime.strptime(issue_date.text, '%d/%m/%Y')

In [None]:
# Gets all the articles of a parsed '.xml' file that fall in the date range and stores them in an array of strings.
def get_articles_in_file(file, start_date, end_date):
    """
    Collect the text of the articles of one parsed '.xml' file.

    file: parsed XML tree; every 'article' element is visited.
    start_date, end_date: inclusive bounds compared with the article date
        returned by get_date.

    Returns a list with one string per selected article: the date as
    dd/mm/yyyy, a space, then the 'full_text' of each of its entities, each
    followed by a space. Articles without a direct 'entity' child are
    skipped. An entity without a 'full_text' child contributes an empty
    text instead of raising.
    """
    articles = []
    for article in file.iter('article'):
        if article.find('entity') is None:
            continue

        date = get_date(article)
        if not start_date <= date <= end_date:
            continue

        # default='' keeps a missing <full_text> from turning into
        # None + ' ', which would raise and abort the whole file.
        text = ''.join(entity.findtext('full_text', default='') + ' '
                       for entity in article.iter('entity'))
        articles.append(date.strftime('%d/%m/%Y') + ' ' + text)
    return articles

In [None]:
# Uses 'get_articles_in_file' on every monthly '.xml' file under 'path' and gathers the articles in one array of strings.
def get_articles(path, start_date, end_date):
    """
    Load the articles of every month between start_date and end_date.

    The file of a month is looked up at path + 'yyyy/mm' + '.xml' (plain
    string concatenation, so `path` must end with a separator). Months
    whose file is missing or cannot be read are skipped silently.

    Returns a flat list of article strings, in month order.
    """
    collected = []
    for month in month_dates(start_date, end_date):
        xml_path = path + month + '.xml'
        try:
            tree = etree.parse(xml_path)
            collected.extend(get_articles_in_file(tree, start_date, end_date))
        except (FileNotFoundError, IOError):
            # best effort: not every month has a file
            continue
    return collected

In [None]:
# Takes a box_id and a parsed '.xml' file and returns the text of the entity with that box_id in the file.
def get_entity_text(file, box_id):
    """
    Return the dated text of the first entity whose meta/box equals box_id.

    file: parsed XML tree; its 'article' elements are scanned in document
        order.
    box_id: string compared with the text of the entity's meta/box element.

    Returns the article date as dd/mm/yyyy, a space, then the entity's
    'full_text' (empty if that element is missing), or None when no entity
    matches. Entities without a meta/box element are skipped.
    """
    for article in file.iter('article'):
        if article.find('entity') is None:
            continue
        for entity in article.iter('entity'):
            if entity.findtext('meta/box') == box_id:
                # Return immediately: a `break` here would only leave the
                # inner loop, and the scan would continue over the remaining
                # articles and could overwrite the result.
                date = get_date(article)
                text = entity.findtext('full_text', default='')
                return date.strftime('%d/%m/%Y') + ' ' + text
    return None

In [None]:
# Root folder of the archive. It must end with '/', because get_articles
# builds file names by plain concatenation: path + 'yyyy/mm' + '.xml'.
path = '/home/mbanga/Desktop/JDG/'
# Inclusive date range of the articles to load.
start_date =  datetime(1990, 1, 1)
end_date = datetime(1998, 2, 28)

In [None]:
articles = get_articles(path, start_date, end_date)

In [None]:
len(articles)

In [None]:
import fr_core_news_sm
import enchant

In [None]:
nlp = fr_core_news_sm.load()

## Lemmatization of the articles

Now that we have our articles, we lemmatize them before running a classification algorithm on them, so that we get better results. These are the functions we use for the lemmatization.

In [None]:
# Helper function to spot tokens that are pure punctuation or whitespace.
def punct_space(token):
    """
    Tell whether a token is only punctuation or only whitespace.

    Reads the `is_punct` and `is_space` attributes of the token.
    """
    is_punctuation = token.is_punct
    return is_punctuation or token.is_space

In [None]:
# Helper function to spot tokens that are not French words.
# The 'fr_FR' dictionary is created once, on first use, and then reused.
_FR_DICT = None


def is_french(word):
    """
    Tell whether `word` is accepted by the 'fr_FR' enchant dictionary.

    word: string to check. An empty string is reported as not French
        instead of being passed to the dictionary, which rejects empty
        input.

    Returns the result of the dictionary's `check` for non-empty words,
    False for an empty word.
    """
    global _FR_DICT

    if not word:
        return False

    # Building the dictionary on every call is wasteful when this is used
    # per token, so it is cached at module level.
    if _FR_DICT is None:
        _FR_DICT = enchant.Dict('fr_FR')
    return _FR_DICT.check(word)

In [None]:
# Generator function that parses articles with spaCy and yields (date, lemmatized text) pairs.
def lemmatized_corpus(corpus):
    """
    Parse each article with spaCy and yield one (date, lemmas) pair per article.

    corpus: iterable of article strings, fed to `nlp.pipe`.

    `date` is the text of the first token of the parsed article.
    NOTE(review): this assumes the tokenizer keeps the leading dd/mm/yyyy
    date as a single token - confirm.
    `lemmas` is the space-joined lemmas of the tokens whose part of speech
    is a verb, proper noun, noun, adjective or adverb.
    """
    kept_pos = ['VERB', 'PROPN', 'NOUN', 'ADJ', 'ADV']

    for doc in nlp.pipe(corpus, batch_size=100, n_threads=5):
        # the article strings start with their date
        first_token = doc[0].text
        lemmas = [tok.lemma_ for tok in doc if tok.pos_ in kept_pos]

        yield (first_token, ' '.join(lemmas))

        # Alternative token filter, currently disabled:
        #   not punct_space(token) and is_french(token.text)
        #   and not token.is_stop and not token.is_digit
        #   and not token.like_num

## Naïve Selection

Given that we have a huge dataset of articles, we decided to first filter the articles with a simple keyword selection. We initialize an array of strings that are related to the theme of the 'Votation'. We might lose some articles that would be meaningful, but given the size of our dataset we are ready to make this concession. We also think that a keyword selection makes sense, because it would be hard to have an article about 'Votations' that does not contain any word of our keyword list.

In [None]:
# This function takes an array of articles and a list of keywords 'lemmas' and returns all the articles that contain one or more of the keywords.
def corpus_votation(articles, lemmas):
    """
    Keep the articles that contain at least one of the keywords.

    The match is a case-sensitive substring test. The order of `articles`
    is preserved in the returned list.
    """
    def mentions_keyword(text):
        return any(lemma in text for lemma in lemmas)

    return [article for article in articles if mentions_keyword(article)]

In [None]:
def corpus_votation_bis(articles, lemmas):
    """
    Keep the articles that contain a keyword once their spaces are removed.

    Same selection as a plain substring test, except that every ' ' is
    stripped from the article before matching, so a keyword broken up by
    spaces is still found. The articles are returned unmodified.
    """
    selected = []
    for article in articles:
        squeezed = article.replace(' ', '')
        if any(lemma in squeezed for lemma in lemmas):
            selected.append(article)
    return selected

In [None]:
# Naive selection (first filtering): keep the articles returned by the
# imported `article_selection` helper for the keyword list below.
# NOTE(review): 'referendum' is spelled without an accent; whether the
# accented French spelling is also matched depends on article_selection -
# confirm.
lems = ['votation', 'referendum']

articles_votation = article_selection(articles, lems)

In [None]:
len(articles_votation)

In [None]:
# Disabled cell (`if 0 == 1` never runs): lemmatize the full text of the
# articles kept by the naive selection and split the result into two
# parallel lists, `dates` and `corpus`.
if 0 == 1:
    # NOTE(review): `%%time` is a cell magic and is normally only valid as
    # the first line of a cell; nested in an `if` it is not plain Python -
    # confirm how the notebook handles it before enabling this cell.
    %%time
    # Time consuming !!
    # NOTE(review): this rebinds the name `lemmatized_corpus` from the
    # generator function to a list, so any later call to
    # lemmatized_corpus(...) in the same session would fail.
    lemmatized_corpus = [(date, lemmas) for date, lemmas in lemmatized_corpus(articles_votation)]

    # first element of each pair: what lemmatized_corpus yields as the date
    dates = [pair[0] for pair in lemmatized_corpus]

    # second element of each pair: the space-joined lemmas of the article
    corpus = [pair[1] for pair in lemmatized_corpus]

In [None]:
# Disabled cell (`if 0 == 1` never runs): write the lemmatized articles held
# in `corpus` to a '.txt' file, one article per line.
if 0 == 1:
    project_path = '/home/mbanga/Epfl/AppliedDataAnalysis/ADA2017_GroupWork/Project/'

    # NOTE(review): no encoding is passed to open(), so the platform default
    # is used for the French text - confirm this is intended.
    with open(os.path.join(project_path, 'lemmatized articles 1990-1998.txt'), 'w') as file:
        for article in corpus:
            file.write(article + '\n')

In [None]:
# Disabled cell (`if 0 == 1` never runs): dump the (date, lemmas) pairs held
# in `lemmatized_corpus` as JSON.
if 0 == 1:
    project_path = '/home/mbanga/Epfl/AppliedDataAnalysis/ADA2017_GroupWork/Project/'

    # NOTE(review): the file name ends in ' json' rather than '.json' -
    # presumably a typo; confirm before enabling this cell.
    with open(os.path.join(project_path, 'lemmatized articles 1990-1998 json'), 'w') as file:
        json.dump(lemmatized_corpus, file)

In [None]:
if 0 == 1:
    # Disabled sanity check of the lemmatizer: fetch one entity by its box id,
    # lemmatize it and print the lemmas next to the original text.
    # NOTE(review): this calls lemmatized_corpus as a function; it fails if a
    # cell that rebinds `lemmatized_corpus` to a list has already run.
    file = etree.parse('/home/mbanga/Desktop/JDG/1990/01.xml')
    box_id = '24 123 1446 2167'

    # get_entity_text returns None when the box id is not found, in which
    # case this list holds a single None.
    original_text = [get_entity_text(file, box_id)]

    for lemmatized in lemmatized_corpus(original_text):
        # index 1 of each yielded pair is the lemmatized text
        print(lemmatized[1], '\n')
    print(original_text)

In [None]:
if 0 == 1:
    # Disabled sanity check of the naive selection: run corpus_votation on a
    # single entity fetched by its box id. The result is stored in `res` and
    # is not displayed by this cell.
    file = etree.parse('/home/mbanga/Desktop/JDG/1990/01.xml')
    box_id = '50 163 1090 888'

    # get_entity_text returns None when the box id is not found.
    original_text = [get_entity_text(file, box_id)]
    lemmas = ['vote', 'voter', 'votation', 'referendum']
    res = corpus_votation(original_text, lemmas)

# Filtering articles about votations

> Assumption: The subject of a votation is most likely to be found in
the neighbourhood of the terms 'votation' or 'referendum' in the article. 
So we decided to extract the sentence that contains the keywords along with the sentences before and after it. We consider that a sentence begins and ends with a '.', which is usually the case, but since our dataset is not perfectly clean some errors occur and we collect sentences that are not really complete. 

In [None]:
import re

In [None]:
# For every selected article, keep only the phrases that mention a keyword,
# each together with the phrase just before and just after it.
keywords = ['votation']


def keyword_context(article, keywords):
    """
    Reduce an article to the neighbourhood of its keyword mentions.

    article: string that starts with the article date, followed by a space
        and the text.
    keywords: substrings to look for (case-sensitive).

    The text is split on '.'; for every phrase containing a keyword, the
    previous phrase, the phrase itself and the next phrase are kept (when
    they exist). Windows of neighbouring matches may overlap, in which case
    a phrase appears more than once.

    Returns the date, a space, then the concatenated windows (empty if no
    phrase matches).
    """
    # Split the date off first so that it is never part of a window; taking
    # it off only when the match is in the first phrase would let it leak
    # into the text whenever the first phrase is used as a neighbour.
    date, _, body = article.partition(' ')
    phrases = body.split('.')

    pieces = []
    for i, phrase in enumerate(phrases):
        if any(keyword in phrase for keyword in keywords):
            window = phrases[max(i - 1, 0):i + 2]
            # a window that starts the article gets no leading separator
            separator = ' ' if i > 0 else ''
            pieces.append(separator + ' '.join(window))

    return date + ' ' + ''.join(pieces)


articles_votation_sents = [keyword_context(article, keywords)
                           for article in articles_votation]

In [None]:
len(articles_votation_sents)

In [None]:
articles_votation_sents[501]

In [None]:
if 1 == 1:
    # Lemmatize the keyword neighbourhoods and split the result into two
    # parallel lists, `dates` and `corpus`.
    # NOTE(review): `%%time` is a cell magic and is normally only valid as
    # the first line of a cell; nested in an `if` it is not plain Python -
    # confirm how the notebook handles it.
    %%time
    # Time consuming !!
    # NOTE(review): this rebinds the name `lemmatized_corpus` from the
    # generator function to a list of pairs. Re-running this cell, or calling
    # lemmatized_corpus(...) afterwards, fails until the function is
    # defined again.
    lemmatized_corpus = [(date, lemmas) for date, lemmas in lemmatized_corpus(articles_votation_sents)]

    # first element of each pair: what lemmatized_corpus yields as the date
    dates = [pair[0] for pair in lemmatized_corpus]

    # second element of each pair: the space-joined lemmas of the article
    corpus = [pair[1] for pair in lemmatized_corpus]

In [None]:
len(lemmatized_corpus)

In [None]:
lemmatized_corpus[501]

# Latent Dirichlet Allocation

Since all the articles in our dataset are in French, it was quite difficult to find a training dataset to fit a model that would be able to classify our articles. We decided to use Latent Dirichlet Allocation as our natural language processing tool. Our aim was to minimize the bias of our topic classification of the articles we extracted. We could have assigned the mainstream votation topics (i.e. army, economy, education...) and tried to extract statistics regarding a well-defined set, but we did not want to make these kinds of assumptions about the existence or the importance of topics.

In [None]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary, MmCorpus

import pyLDAvis
import pyLDAvis.gensim
import warnings
#import cPickle as pickle

In [None]:
# Build the token dictionary from the lemmatized articles, each split on
# whitespace.
dico = Dictionary([article.split() for article in corpus])

# Drop the tokens that are too common: no_above=0.4 removes those present in
# more than 40% of the articles. With no_below=0 no lower bound is applied,
# so rare tokens are kept.
dico.filter_extremes(no_below=0, no_above=0.4)

# Reassign the integer ids of the remaining tokens so that they have no gaps.
dico.compactify()

In [None]:
# Generator function that yields a bag-of-words representation of each article.
def bow_generator(corpus):
    """
    Yield the bag-of-words representation of every article in `corpus`.

    corpus: iterable of strings; each one is split on whitespace and
    converted with the `doc2bow` method of the module-level dictionary
    `dico`.
    """
    for text in corpus:
        tokens = text.split()
        yield dico.doc2bow(tokens)

In [None]:
# Generate the bag-of-words representation of all the articles and save it
# to 'corpus.mm' in the project folder.
project_path = '/home/mbanga/Epfl/AppliedDataAnalysis/ADA2017_GroupWork/Project/'
MmCorpus.serialize(os.path.join(project_path, 'corpus.mm'),
                                bow_generator(corpus))

# Reopen the file that was just written.
bow_corpus = MmCorpus(os.path.join(project_path, 'corpus.mm'))

In [None]:
# Path of the file where the LDA model is saved and loaded from
# (nothing is written here).
lda_model_filepath = os.path.join(project_path, 'lda_model_all')

In [None]:
if 1 == 1:
    # Train a 50-topic LDA model on the bag-of-words corpus and save it.
    # Warnings raised during training are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        # NOTE(review): no random_state is passed, so topic numbering
        # presumably differs between runs - confirm before relying on
        # fixed topic numbers elsewhere.
        lda = LdaMulticore(bow_corpus,
                           num_topics=50,
                           id2word=dico,
                           workers=5)
        
        lda.save(lda_model_filepath)

# Load the finished LDA model from disk; this runs whether or not the
# training branch above is enabled.
lda = LdaMulticore.load(lda_model_filepath)

In [None]:
# Accepts a topic number and prints a formatted list of its top terms.
def explore_topic(topic_number, topn=25):
    """
    Print the `topn` top terms of a topic of the module-level `lda` model.

    A header line is printed first, then one line per term with the value
    returned by `lda.show_topic` for it, rounded to three decimals.
    """
    header = u'{:20} {}'.format(u'term', u'frequency') + u'\n'
    print(header)

    row_format = u'{:20} {:.3f}'
    for term, frequency in lda.show_topic(topic_number, topn):
        print(row_format.format(term, round(frequency, 3)))

In [None]:
explore_topic(topic_number=15, topn=10)

In [None]:
# The goal is to find all the documents related to the same topic
def articles_topic(lda, bow_corpus, corpus, topic):
    """
    Return the articles whose dominant topic is `topic`.

    lda: model exposing `get_topics()` and `get_document_topics(bow,
        minimum_probability=...)`, the latter returning (topic id,
        probability) pairs.
    bow_corpus: bag-of-words documents, aligned by position with `corpus`.
    corpus: the documents to return; must have the same length as
        `bow_corpus` (checked with an assertion).
    topic: topic number; an empty list is returned when it is not in
        [0, number of topics).

    The dominant topic of a document is the topic id of the pair with the
    highest probability (the first such pair on ties). The id carried by the
    pair is used rather than the pair's position in the list, so the result
    stays correct if the model leaves some topics out of the list.
    """
    assert len(bow_corpus) == len(corpus)
    nb_topics = len(lda.get_topics())

    if not 0 <= topic < nb_topics:
        return []

    documents = []
    for bow_article, document in zip(bow_corpus, corpus):
        dist = lda.get_document_topics(bow_article, minimum_probability=0)
        dominant_topic, _ = max(dist, key=lambda pair: pair[1])
        if dominant_topic == topic:
            documents.append(document)

    return documents

In [None]:
docs = articles_topic(lda, bow_corpus, articles_votation_sents, 10)

In [None]:
docs[1]

In [None]:
# Prepare the pyLDAvis data from the model, the bag-of-words corpus and the
# dictionary.
if 1 == 1:     
        LDAvis_prepared = pyLDAvis.gensim.prepare(lda, bow_corpus, dico)

In [None]:
pyLDAvis.display(LDAvis_prepared)