# Read in Corpus

In [1]:
corpus_path = ('/Users/Kyle/Documents/Box Sync/old/'
               'MARTIN-french-corpus-github/'
               '5_french-news-corpus.txt')
RAW_CORPUS = open(corpus_path, 'r').read()
print(len(RAW_CORPUS)) # how much did I start with?

141713828


## Remove non-interesting information

For better n-grams!

1. Remove a `cookie_message` on all of the sites that sort of messes with our ability to pull meaningful n-grams out of the corpus.

2. Remove a slogan (`lemonde_slogan`) that's at the bottom of all articles by [lemonde.fr](https://lemonde.fr)

3. Related info message (`lemonde_info`) also removed

In [2]:
import re
undesirables = {}
undesirables['cookie_message'] = ("En poursuivant votre navigation sur ce site, "
                                  "vous acceptez nos CGV et l\’utilisation de cookies "
                                  "pour vous proposer des contenus et services adaptés"
                                  " à vos centres d’intérêts et vous permettre "
                                  "l\'utilisation de boutons de partages sociaux."
                                  r"\s+En savoir plus et gérer ces paramètres\.")

undesirables['lemonde_slogan'] = (r"Découvrez chaque jour toute l\'info en\s+"
                                  r"direct \(de la politique à l\'économie en "
                                  r"passant par le sport et la\s+"
                                  r"météo\) sur Le Monde\.fr, le site de news "
                                  r"leader de la presse française\s+"
                                  r"en ligne\.")

undesirables['lemonde_info'] = ("Journal d\'information en ligne, "
                                "Le Monde.fr offre à ses visiteurs "
                                r"un\s+panorama complet de l\'actualité.")


In [3]:
def remove_undesirables(message, name, corpus):
    """ receives a dictionary organized by 
        phenom and message to be removed. """
    amount = len(re.findall(message, corpus, flags=re.IGNORECASE))
    filtered_corpus = re.sub(message, # to be subbed
                             "", # remove this
                             corpus, # current state of corpus
                             flags=re.IGNORECASE) # just to be sure
    print(f'removed {amount} instances of {name} `{message[:10]}(...)`')
    
    return filtered_corpus

In [4]:
len(re.findall(undesirables['lemonde_info'], RAW_CORPUS, flags=re.IGNORECASE))

7142

## Corpus filtration pipeline

In [5]:
def filtration_pipeline(RAW_CORPUS):
    """ filters the corpus removing the boogers for a breath of fresh air """
    
    no_cookies = remove_undesirables(undesirables['cookie_message'],
                                     'cookie_message', 
                                     RAW_CORPUS)

    no_lemonde = remove_undesirables(undesirables['lemonde_slogan'],
                                     'lemonde_slogan',
                                     no_cookies)

    no_info = remove_undesirables(undesirables['lemonde_info'],
                                  'lemonde_info',
                                  no_lemonde)

    filtered = no_info

    return filtered

In [6]:
raw_filtered = filtration_pipeline(RAW_CORPUS)

removed 7156 instances of cookie_message `En poursui(...)`
removed 7142 instances of lemonde_slogan `Découvrez (...)`
removed 7142 instances of lemonde_info `Journal d'(...)`


# Post-baptism n-grams

To complete this aspect of the project I will use the NLTK n-gram feature. I will first tokenize all of the text. Then I will pickle the tokens so that I can avoid reprocessing the mass of text each time I run the script. There is a simple implementation of n- grams I used in a previous assignment, and will be able to just reduplicate that code for this step.

In [7]:
from collections import Counter

from nltk.util import ngrams
from nltk.tokenize import word_tokenize

## `nltk.word_tokenize` everything

In [8]:
nltk_tokens = word_tokenize(raw_filtered)

## Calculate n-grams

In [9]:
bigrams = list(Counter(ngrams(nltk_tokens, 2)).items())
trigrams = list(Counter(ngrams(nltk_tokens, 3)).items())
fourgrams = list(Counter(ngrams(nltk_tokens, 4)).items())
fivegrams = list(Counter(ngrams(nltk_tokens, 5)).items())

grams = [bigrams, trigrams, fourgrams, fivegrams]

In [10]:
i = 2
for gram in grams:
    print(f'{i}-grams')
    print(len(gram), 'grams')
    i += 1

2-grams
4304622 grams
3-grams
12909766 grams
4-grams
19879549 grams
5-grams
23048889 grams


## Display Results  

In [11]:
len(set(fourgrams))

19879549

In [12]:
sorted(fivegrams, key=lambda x: x[1], reverse=True)[:1000]

[(('•', 'Mis', 'à', 'jour', 'le'), 5063),
 (('--', '--', '--', '--', '--'), 2971),
 (('ce', 'n', '’', 'est', 'pas'), 1239),
 (('!', '!', '!', '!', '!'), 1143),
 (('il', 'n', '’', 'y', 'a'), 1134),
 (('En', 'poursuivant', 'votre', 'navigation', 'sur'), 1133),
 (('qu', '’', 'il', 'n', '’'), 1042),
 (("d'un", 'droit', "d'accès", ',', 'de'), 1040),
 (('disposez', "d'un", 'droit', "d'accès", ','), 1026),
 (('droit', "d'accès", ',', 'de', 'rectification'), 1022),
 (('.', 'Conformément', 'à', 'la', 'loi'), 1012),
 (("d'accès", ',', 'de', 'rectification', 'et'), 991),
 ((',', 'de', 'rectification', 'et', "d'opposition"), 991),
 (('de', 'rectification', 'et', "d'opposition", 'aux'), 991),
 (('rectification', 'et', "d'opposition", 'aux', 'données'), 991),
 (('et', "d'opposition", 'aux', 'données', 'vous'), 991),
 (("d'opposition", 'aux', 'données', 'vous', 'concernant'), 991),
 (('aux', 'données', 'vous', 'concernant', 'en'), 991),
 (('.', 'Aujourd', '’', 'hui', ','), 967),
 (('les', 'newsletter

# Markov Chains

I will pass my corpus through [`markovify`](https://github.com/jsvine/markovify).



In [13]:
import markovify

In [14]:
text_model = markovify.Text(raw_filtered)

In [15]:
for i in range(10):
    print(text_model.make_sentence(tries=100))
    print()

Instaurez une atmosphère d'incompréhension, de défiance, j'avoue ne plus attribuer d’épreuves à la loi Informatique et Libertés du 6 janvier 1978 relative à l'informatique, aux fichiers et aux voies légales pour ramener le jeune Laurent, à seulement 38 ans, passent la plus corrompue de Poutine avec admiration.

On a vu des personnes hébergées à l’hôtel, raconte Mamadou Ba.

Il indique l'acidité par une zone marécageuse.

De même, alors que toute nouvelle version du plaignant principal est-elle crédible?

Sinon, je vais le garder en tête que les faucons sont la Chine et le Jiangsu.

« Amorcée le 15 février 2018Dossier Faire parler de son père: décidemment la fréquentation des maîtres les plus consultés actuellement ? Le goulet de quelque façon que la tête du classement des pays ACP constitue un avantage considérable à nos ados, exhibant des corps était paralysé.

» Si elle resurgit pendant les fêtes de cour, le personnel d'un édile lui est faite de la forme ultime du monde économique.



# Collocates

Finding differences in collocates in four different words that mean 'exit' in French.

In [61]:
tok_enum = list(enumerate(nltk_tokens))

In [16]:
len(nltk_tokens)

26445037

## All the verb forms

In [16]:
sortir_forms = set(('sortir sors sort sortons sortez sortent sorti '
                    'sortis sortais sortait sortiez sortiez sortaient '
                    'sortis sortit sortîtes sortîmes sortirent sortirai '
                    'sortiras sortira sortirons sortirez sortiront '
                    'sortirais sortirait sortirions sortiriez sortiraient '
                    'sorte sortes sorte sortions sortiez sortent sortisse sortisses '
                    'sortît sortissions sortissiez sortissent sortant').split())

partir_forms = set(('partir pars part partons partez partent parti partis partais '
                    'partait partions partiez partaient partîmes partirent '
                    'partirai partiras partira partirons partirez partiront '
                    'partirais partirait partirions partiriez partiraient '
                    'parte partes parte partions partiez partisse partisses '
                    'partît partissions partissiez partissent partant').split())

quitter_forms = set(('quitter quitte quittes quitte quittons quittez quittent quittais '
                     'quittais quittait quittai quittas quitta quittâmes quittâmes '
                     'quittâtes quittèrent quitté quitterai quitteras quittera '
                     'quitterons quitterez quitteron quitterais quitterais '
                     'quitterait quitterions quitteriez quitteraient quitte '
                     'quittes quittions quittiez quittent quittant quittasse '
                     'quittasses quittât quittassions quittassiez quittassent').split())

laisser_forms = set(('laisser laisse lasses laisse laissons laissez laissent laissais '
                     'laissé laissait laissions laissiez laissaient laissai laissas '
                     'laissa laissâmes laissâtes laissèresnt laisserai laisseras laissera '
                     'laisserons laisserez laisseront laisserais laisserais laisserait '
                     'laisserions laisseriez laisseraient laisse laisses laisse laissions '
                     'laissiez laissasse laissesses laissât laissassions laissassiez '
                     'laissassent laissant').split())

In [98]:
def finder(nltk_tokens, verb_forms, n_words):
    """ pass in all the verb forms and return the context surrounding each verb"""
    indexes = [i for i, word in enumerate(nltk_tokens) if word in verb_forms]
    results = []
    for i in indexes:
        context = (nltk_tokens[i - n_words:i] +         # words to the left
                  # ['__' + nltk_tokens[i] + '__'] +       # search word
                  [nltk_tokens[i]] +                    # search word
                  nltk_tokens[i + 1:i + n_words + 1])   # words to the right
        results.append(context)
    print('len of results', len(results))
    return results

## Find and store the four verbs and their contexts

In [100]:
sortir_concordance = finder(nltk_tokens, sortir_forms, 5)

len of results 10484


In [101]:
partir_concordance = finder(nltk_tokens, partir_forms, 5)

len of results 22075


In [102]:
quitter_concordance = finder(nltk_tokens, quitter_forms, 5)

len of results 3886


In [103]:
laisser_concordance = finder(nltk_tokens, laisser_forms, 5)

len of results 8290


In [106]:
all_concordances = [sortir_concordance,
                    partir_concordance,
                    quitter_concordance,
                    laisser_concordance]

## Process the concordance results

Modifying `spaCy` to tag for part of speech, but disabling `parser` and `ner`.

In [113]:
import spacy
nlp = spacy.load('fr', disable=['parser', 'ner'])

In [52]:
len(sortir_concordance)

10484

In [108]:
UNDESIRED = ['PUNCT', 'NUM'] # could add determiners 'DET' here

In [118]:
def framer(concordance_list):
    """ takes the concordance list and 
        returns a list of relevant pairs"""
    i = 0
    results = []
    for phrase in concordance_list:
        spacyd = nlp(' '.join(phrase))

        psd = [(token.text, token.pos_) for token in spacyd if \
               token.pos_ not in UNDESIRED]
        results.append(psd)

    i += 1
    if i % 1000 == 0:
        print(i)
    
    return results

In [119]:
tagged_concs = []
for i, c in enumerate(all_concordances):
    tagged_concs.append(framer(c))
    print('done with ', i)

done with  0
done with  1
done with  2
done with  3


In [120]:
import pickle

### Pickle the processed results

This notebook is getting really big, and slow to run. Pickling allows me to close kernal, and reload the variables that I want at this point.

In [122]:
with open('tagged_verb_concs.pkl', 'wb') as f:
    pickle.dump(tagged_concs, f)

### Open the processed results

In [3]:
import pandas as pd
import pickle # reopen after closing the kernal

In [4]:
with open('tagged_verb_concs.pkl', 'rb') as f:
    tagged_concs = pickle.load(f)

## Align, create, and display `FreqDists` 

In [47]:
from nltk.probability import FreqDist

Note the format of `tagged_concs`:

0. sortir_concordance
    - phrase
        - `(word, POS)`
        - `(word, POS)`
        - `(word, POS)`
    - phrase
        - `(word, POS)`
        - `(word, POS)`
        - `(word, POS)`
1. partir_concordance
    - phrase (etc.)
2. quitter_concordance
3. laisser_concordance

In [160]:
sortir = tagged_concs[0]
partir = tagged_concs[1]
quitter = tagged_concs[2]
laisser = tagged_concs[3]

In [228]:
def aligner(tagged_conc, verb_forms, display='pos'):
    """ Pass in `tagged_conc` and `verb_forms` 
        process them and print the results. 
        
        `display` can be set to `pos` or `word`
        default for `display` == 'pos'
        """
    
    aligned = []
    # iterate over phrases
    for phrase in tagged_conc:
        # iterate over items in phrase
        for i, (word, pos) in enumerate(phrase):
            # align words around node verb form
            if word in verb_forms and pos == 'VERB':
                context = {}
                # select everything to the left
                context['left'] = phrase[:i]
                # select the found form of the verb
                context['node'] = phrase[i]
                # select everything to the right
                context['right'] = phrase[i + 1:]
                aligned.append(context)
                
    """ Pass in aligned searches and display the context """
    
    # ************************************************************
    # Build the lists
    # ************************************************************
    
    # (l|r)_dist is the left or right distribution of words
    l_dist = {}
    # verb forms
    node = []
    # r_dist
    r_dist = {}

    # ************************************************************
    # Organize the results
    # ************************************************************

    # `display` from kwargs
    if display == 'pos':
        n = 1
    elif display == 'word':
        n = 0
    
    CARE_ABOUT = ['NOUN' 'ADJ', 'PROPN']
    
    for phrase in aligned:
        # left context `reversed` to work from right to left
        for i, (word, pos) in reversed(list(enumerate(phrase['left']))):
            choose = (word, pos)
            if pos in CARE_ABOUT:
                try:
                    l_dist[i] += [choose[n]]
                except KeyError:
                    l_dist[i] = []
                    l_dist[i] += [choose[n]]
        
        # save node words
        node += [(phrase['node'])]

        # right context    
        for i, (word, pos) in enumerate(phrase['right']):
            choose = (word, pos)
            if pos in CARE_ABOUT:
                try:
                    r_dist[i] += [choose[n]]
                except KeyError:
                    r_dist[i] = []
                    r_dist[i] += [choose[n]]
    
    # ************************************************************
    # `FreqDist` of the left and right-hand contexts
    # ************************************************************

    
    dists = [l_dist, r_dist]

    # iterate over each r and l distribution
    which = ['l_dist', 'r_dist']
    
    # `ch` type(int) is which channel, in a given `dist` or distribution
    for ch, dist in enumerate(dists):
        # go over the distribution for each position in order
        total_dist = {}
        
        reverse = True # control flow (big to small)
        # if the channel selected is `'r_dist'` then go small to big
        if which[ch] == 'r_dist':
            reverse = False

        # build `FreqDist` objects from the `value_lists`
        for index, value_list in sorted(dist.items(), reverse=reverse):
            # print(f'{which[ch]}-{key}')
            v_list = []
            # `dist[index]` gets the appropriate `value_list` 
            for k, v in FreqDist(dist[index]).items():
                v_list.append((k, v))
                total_dist[f'{which[ch]}-{index}'] = v_list

    # ************************************************************
    # Display the results
    # ************************************************************

            print(f'{which[ch]}-{index}', sorted(v_list,
                                               reverse=True,
                                               key=lambda x: x[1])[:20])
        if reverse:
            print()
            print('NODE WORD:', sorted(FreqDist(node).items(),
                                       reverse=True,
                                       key=lambda x: x[1])[:10])
            print()
#     return total_dist

In [229]:
aligner(sortir, sortir_forms, display='word')

l_dist-8 [('États-Unis', 1)]
l_dist-7 [('iPad', 1)]
l_dist-5 [('Carax', 1), ('Makhachkala', 1), ('Congrès', 1), ('ENA', 1), ('Cantorbéry', 1), ('Europe', 1), ('Monde', 1), ('Australie', 1), ('OCDE', 1), ('Russie', 1), ('UMP', 1)]
l_dist-4 [('Allemagne', 3), ('iPad', 2), ('UMP', 2), ('Mercy', 1), ('Richet', 1), ('Anzhi', 1), ('Johnson', 1), ('Etats-Unis', 1), ('Clichy', 1), ('Agricole', 1), ('Européens', 1), ('USA', 1), ('Oury', 1), ('Baer', 1), ('Biarritz', 1), ('Bouches-du-Rhône', 1), ('Glénat', 1), ('Monde', 1), ('Croisette', 1), ('Beart', 1)]
l_dist-3 [('France', 10), ('Grèce', 4), ('UMP', 3), ('Madagascar', 3), ('Aubry', 2), ('Bauer', 2), ('Pers', 2), ('Europe', 2), ('Hollywood', 2), ('Russie', 2), ('Manon', 2), ('Demy', 2), ('Allemagne', 2), ('Rwanda', 2), ('Blum', 1), ('JACKSON', 1), ('DSK', 1), ('PC', 1), ('résistance', 1), ('Nosferatu', 1)]
l_dist-2 [('France', 13), ('’', 4), ('Vaincre', 4), ('Europe', 4), ('Amérique', 3), ('Jacques', 3), ('Jean', 2), ('Google', 2), ('Domenech'

In [230]:
aligner(partir, partir_forms, display='word')

l_dist-7 [('Gaulle', 1)]
l_dist-6 [('Dijoux', 1), ('Majesté', 1), ('Massy', 1), ('au', 1), ('Laissez', 1)]
l_dist-5 [('Europe', 3), ('Seine', 2), ('Laure', 1), ('Charles', 1), ('France', 1), ('Loomis', 1), ('Evry', 1), ('Arnaud', 1), ('Cousteau', 1), ('Tirez', 1), ('Sénart', 1), ('Angkor', 1), ('Sarkozy', 1), ('Borloo', 1), ('PCF', 1)]
l_dist-4 [('Hollandais', 2), ('Louis', 2), ('Kim', 1), ('Anne', 1), ('Occident', 1), ('Midi-Pyrénées', 1), ('Sartre', 1), ('Bazar', 1), ('Paris', 1), ('Mendès', 1), ('Ohio', 1), ('Edith', 1), ('Iran', 1), ('Road', 1), ('Rochette', 1), ('Point', 1), ('Fangio', 1), ('Sarkozy', 1), ('UE', 1), ('Lyon', 1)]
l_dist-3 [('France', 18), ('Beltrame', 8), ('Paris', 8), ('Europe', 5), ('Chine', 4), ('Allemagne', 3), ('Laissez', 3), ('Kong', 2), ('Magnan', 2), ('Maurice', 2), ('FRÉQUENTES', 2), ('USA', 2), ('Kyoto', 2), ('Lyon', 2), ('Aborigènes', 2), ('Pompidou', 2), ('Quesada', 2), ('AQMI', 2), ('Gaulle', 2), ('Madagascar', 2)]
l_dist-2 [('France', 15), ('Paris', 1

In [232]:
aligner(laisser, laisser_forms, display='word')

l_dist-6 [('Hahaha', 1), ('Océan', 1)]
l_dist-5 [('Avastin', 1), ('Etats-Unis', 1), ('Loire', 1), ('UPF', 1), ('Amri', 1), ('Zita', 1), ('Poveda', 1), ('Jourdain', 1), ('Afghanistan', 1), ('Evra', 1), ('Macron', 1), ('Villepin', 1)]
l_dist-4 [('Figaro', 1), ('Buffon', 1), ('Guibert', 1), ('Esther', 1), ('Fig', 1), ('Israel', 1), ('Express', 1), ('Haute', 1), ('ONU', 1), ('ÉGALITÉ', 1), ('Picamoles', 1), ('United', 1), ('Vinci', 1), ('Europe', 1), ('PC', 1), ('Unis', 1), ('Orient', 1), ('Durand', 1), ('Bourguiba', 1), ('Jos', 1)]
l_dist-3 [('France', 8), ('Sarkozy', 4), ('Paris', 4), ('Brésil', 3), ('Chalosse', 2), ('Gage', 2), ('Mediaset', 2), ('Morinière', 2), ('Berlusconi', 2), ('Ligonnès', 2), ('Russie', 2), ('Gaulle', 2), ('Mexicaine', 2), ('Stone', 2), ('Rien', 2), ('Cambus', 2), ('Chirac', 2), ('Jean', 2), ('Europe', 2), ('Eric', 2)]
l_dist-2 [('France', 4), ('’', 4), ('Bruxelles', 4), ('Chine', 3), ('Paris', 3), ('Paul', 3), ('Hitler', 3), ('Sarkozy', 3), ('Clermont-Ferrand', 2)

In [233]:
aligner(quitter, quitter_forms, display='word')

l_dist-5 [('Oui', 1), ('Laâge', 1), ('Blake', 1), ('Gomez', 1)]
l_dist-4 [('Allemagne', 2), ('Turquie', 1), ('PSA', 1), ('Nagui', 1), ('France', 1), ('Iran', 1), ('Chirac', 1), ('B.', 1), ('Kadhafi', 1), ('Jospin', 1), ('Aquitaine', 1), ('Kenpo', 1), ('Ivoire', 1), ('Octavio', 1), ('Selena', 1), ('Samuel', 1), ('CIA', 1), ('Elysée', 1), ('Wehrmacht', 1)]
l_dist-3 [('Allemagne', 4), ('Dempsey', 3), ('Preynat', 2), ('France', 2), ('Paris', 2), ('Kadhafi', 2), ('Lionel', 2), ('Gear', 2), ('Blanches', 2), ('Walker', 2), ('Marci', 1), ('Espanyol', 1), ('Staël', 1), ('Carvalho', 1), ('Sanogo', 1), ('Daw', 1), ('Thierry', 1), ('Bayern', 1), ('Raynaud', 1), ('Laurioux', 1)]
l_dist-2 [('France', 7), ('François', 5), ('Patrick', 3), ('’', 3), ('Facebook', 3), ('Bernard', 2), ('ONU', 2), ('Nicolas', 2), ('Paul', 2), ('Jamais', 2), ('Trévedy', 2), ('Chine', 2), ('Beaudoin', 2), ('Sétois', 1), ('Scampia', 1), ('Ottomans', 1), ('Lolo', 1), ('Biya', 1), ('Love', 1), ('Poursin', 1)]
l_dist-1 [('Paris'

# NER Analysis

Frequency distribution of named entities for each domain in corpus

In [259]:
from glob import glob
import justext 
import spacy
import re
nlp = spacy.load('fr_core_news_sm')

In [252]:
PAGE_GLOB = glob('/Users/Kyle/Documents/archives/Winter 2018/'
                 'corpus-linguistics/french-news-html/'
                 'goscraper/20mar-links/*.html')

In [243]:
def get_text(page):
    """This function takes an html page from a glob, for exmaple,
       and reads it and uses the justext module remove all boilerplate"""

    # reads the file
    page_string = open(page, 'rb').read()
    # creates a justext object
    paragraphs = justext.justext(page_string,
                                 justext.get_stoplist("French"))
    pageText = ''
    # if not boilerplate, adds to `pageText`
    for p in paragraphs:
        if not p.is_boilerplate:
            pageText += p.text + ' '
    return pageText

## Link Index

Create an index link to scrape number.

In [244]:
index_file = open('golang-link-index.txt', 'r').readlines()

url_index = []
for line in index_file:
    divided = line.split()
    url = re.sub(',', '', divided[0])
    index = re.findall(r'\d{2}mar-links/(\d+\.html)', divided[1])[0]
    url_index.append((url, index))

In [256]:
def get_url(page_number):
    """ take the indexed filename and return the actual link """
    url = [url for url, index in url_index if index == page_number]
    return url[0]

In [246]:
def domain_analyze(url):
    """ determines what domain the url is from """
    
    re_domain = r'https?://((?:www\.)?.*?\..*?)/'
    domain = re.search(re_domain, url).group(1)

    return domain

In [247]:
class ArticleData:
    """ A place to store all of the things we find in each article 
    
        `nes` is 'named enetities' """
    def __init__(self, article_text, url, domain, entity_labels, nes):
        self.article_text = article_text
        self.url = url
        self.domain = domain
        self.entity_labels = entity_labels
        self.nes = nes

In [260]:
# `PAGE_GLOB` is a list of scraped pages `1.html`, `2.html`, etc.
# `indexed_path_html` is a file path to the stored html on my computer
result_list = []

TOTAL = len(PAGE_GLOB)
i = 0
for indexed_path_html in PAGE_GLOB:
    # get text
    article_text = get_text(indexed_path_html)
    # get the page number out of the path
    page_number = re.findall(r'(\d+\.html)', indexed_path_html)[0]    
    # get the url
    url = get_url(page_number)
    # get the domain
    domain = domain_analyze(url)

    # analyze the text
    doc = nlp(article_text)
    
    # store the labels
    labels = []
    nes = []
    
    # get the named entities
    for ent in doc.ents:
        labels.append(ent.label_)
        nes.append(ent.text)
    
    Results = ArticleData(article_text, url, domain, labels, nes)

    if i % 1000 == 0:
        print(i)
        
    i += 1
    
    result_list.append(Results)

    

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000


In [268]:
len(result_list)

23257

In [265]:
with open('french_corpus_info.pkl', 'wb') as f:
    pickle.dump(result_list, f)

In [269]:
with open('french_corpus_info.pkl', 'rb') as fa:
    asdf = pickle.load(fa)

In [270]:
len(asdf) # proves that the pickle worked.

23257

In [273]:
# article_text, url, domain, entity_labels, nes

domain_distribution = {}

for Res in result_list:
    if Res.domain not in domain_distribution.keys():
        domain_distribution[Res.domain] = 1    
    else:
        domain_distribution[Res.domain] += 1

In [275]:
len(domain_distribution.keys())

561

## Most common named entities

In [299]:
most_common_nes = {}
for domain in domain_distribution.keys():
    for Res in result_list:
        if Res.domain == domain:
            if domain not in most_common_nes.keys():
                most_common_nes[domain] = Res.nes
            else:
                most_common_nes[domain] += Res.nes

# take the 30 most common domains
most_common_domains = FreqDist(domain_distribution).most_common(30)    

for domain, count in most_common_domains:
    for domain2, nes in most_common_nes.items():
        if domain == domain2:
            print(count, domain, FreqDist(nes).most_common(11))
            print()

7168 www.lemonde.fr [('\n', 130153), ('Le Monde.fr', 43052), ('Découvrez', 21580), ('CGV', 21466), ('–', 16523), ('Monde\n|', 14476), ('France', 11790), ('Etat', 10478), ('Paris', 8773), ('\n|\n', 7452), ('Etats-Unis', 6474)]

2348 www.lefigaro.fr [('\n', 40859), ('France', 8927), ('la France', 6543), ('Paris', 5029), ('Français', 4750), ('Jean', 2661), ('Europe', 2583), ('Etat', 2523), ('Russie', 2451), ('État', 2163), ('Allemagne', 2052)]

740 www.liberation.fr [('C’', 2796), ('France', 1722), ('Paris', 1606), ('AFP', 1446), ('J’', 1443), ('m’', 1359), ('Etat', 1272), ('la France', 954), ('jusqu’', 846), ('Français', 837), ('–', 801)]

727 www.futura-sciences.com [('Moteur de recherche', 1297), ('Futura Publié', 888), ('Futura', 864), ('la Terre', 729), ('Vous avez', 330), ('Terre', 330), ('France', 330), ('États-Unis', 318), ('Modifié', 306), ('Lune', 297), ('CO2', 294)]

609 www.sudouest.fr [('C’', 1420), ('Bordeaux', 987), ('J’', 781), ('Jean', 774), ('jusqu’', 642), ('–', 539), (

## Most common categories of NEs

In [291]:
most_common_nes = {}
for domain in domain_distribution.keys():
    for Res in result_list:
        if Res.domain == domain:
            if domain not in most_common_nes.keys():
                most_common_nes[domain] = Res.entity_labels
            else:
                most_common_nes[domain] += Res.entity_labels

most_common_domains = FreqDist(domain_distribution).most_common(30)    

for domain, count in most_common_domains:
    for domain2, nes in most_common_nes.items():
        if domain == domain2:
            print(count, domain, FreqDist(nes).most_common(11))

7168 www.lemonde.fr [('LOC', 164805), ('PER', 113350), ('ORG', 67072), ('MISC', 57506)]
2348 www.lefigaro.fr [('LOC', 73784), ('PER', 53645), ('MISC', 30309), ('ORG', 29653)]
740 www.liberation.fr [('LOC', 16917), ('PER', 12913), ('ORG', 7800), ('MISC', 7309)]
727 www.futura-sciences.com [('LOC', 5383), ('PER', 4079), ('MISC', 3349), ('ORG', 2019)]
609 www.sudouest.fr [('LOC', 9432), ('PER', 7176), ('MISC', 3041), ('ORG', 2184)]
513 www.lexpress.fr [('PER', 13015), ('LOC', 12176), ('MISC', 5729), ('ORG', 4425)]
501 www.huffingtonpost.fr [('PER', 5118), ('LOC', 4279), ('MISC', 3216), ('ORG', 2541)]
498 www.lesechos.fr [('LOC', 8545), ('PER', 6344), ('ORG', 4228), ('MISC', 2854)]
463 www.parismatch.com [('PER', 11843), ('LOC', 9500), ('MISC', 4522), ('ORG', 2613)]
416 www.la-croix.com [('LOC', 20074), ('PER', 15622), ('MISC', 9077), ('ORG', 6129)]
336 www.jeuneafrique.com [('LOC', 8961), ('PER', 6654), ('ORG', 2975), ('MISC', 2481)]
315 www.ladepeche.fr [('LOC', 6269), ('PER', 4666), ('M

## Average article length

In [293]:
from statistics import mean

In [298]:
most_common_nes = {}
for domain in domain_distribution.keys():
    for Res in result_list:
        if Res.domain == domain:
            if domain not in most_common_nes.keys():
                most_common_nes[domain] = [len(Res.article_text)]
            else:
                most_common_nes[domain] += [len(Res.article_text)]

most_common_domains = FreqDist(domain_distribution).most_common(30)    

for domain, count in most_common_domains:
    for domain2, lengths in most_common_nes.items():
        if domain == domain2:
            print('{:5} {:<25} {:<20}'.format(count, domain, mean(lengths)))

 7168 www.lemonde.fr            5660.9189453125     
 2348 www.lefigaro.fr           9214.779386712094   
  740 www.liberation.fr         6408.097297297298   
  727 www.futura-sciences.com   3276.977991746905   
  609 www.sudouest.fr           3494.630541871921   
  513 www.lexpress.fr           8097.479532163743   
  501 www.huffingtonpost.fr     3886                
  498 www.lesechos.fr           5606.31124497992    
  463 www.parismatch.com        5486.088552915767   
  416 www.la-croix.com          12341.701923076924  
  336 www.jeuneafrique.com      5378.440476190476   
  315 www.ladepeche.fr          5472.355555555556   
  312 www.lepoint.fr            4192.75             
  311 www.slate.fr              7220.5337620578775  
  297 www.rfi.fr                5564.461279461279   
  280 www.capital.fr            6249.425            
  256 www.20minutes.fr          3051.62109375       
  240 www.contrepoints.org      9598.045833333334   
  210 www.europe1.fr            2955.095238095

## Average TTR

In [300]:
from nltk.tokenize import word_tokenize

In [306]:
def ttr(article_text):
    """ Returns the nltk-word_tokenized type to token ratio of the article """
    tokens = word_tokenize(article_text)
    types = set(tokens)
    
    try:
        ttr = len(types) / len(tokens)
    except ZeroDivisionError:
        ttr = 0

    return ttr

In [307]:
most_common_dict = {}
for domain in domain_distribution.keys():
    for Res in result_list:
        if Res.domain == domain:
            if domain not in most_common_dict.keys():
                most_common_dict[domain] = [ttr(Res.article_text)]
            else:
                most_common_dict[domain] += [ttr(Res.article_text)]

most_common_domains = FreqDist(domain_distribution).most_common(30)    

for domain, count in most_common_domains:
    for domain2, lengths in most_common_dict.items():
        if domain == domain2:
            print('{:5} {:<25} {:<20}'.format(count, domain, mean(lengths)))

 7168 www.lemonde.fr            0.4661329469209219  
 2348 www.lefigaro.fr           0.43289061416688746 
  740 www.liberation.fr         0.4202459404368624  
  727 www.futura-sciences.com   0.5196529997634026  
  609 www.sudouest.fr           0.4568850933436745  
  513 www.lexpress.fr           0.4407010383993757  
  501 www.huffingtonpost.fr     0.5106185259951724  
  498 www.lesechos.fr           0.46647912801407854 
  463 www.parismatch.com        0.4395420449712281  
  416 www.la-croix.com          0.20066893782128942 
  336 www.jeuneafrique.com      0.43589807879973336 
  315 www.ladepeche.fr          0.4850935107965183  
  312 www.lepoint.fr            0.5178235479838548  
  311 www.slate.fr              0.42808490388224946 
  297 www.rfi.fr                0.46192844432233415 
  280 www.capital.fr            0.43275189344074005 
  256 www.20minutes.fr          0.48937338437936606 
  240 www.contrepoints.org      0.37149578826822655 
  210 www.europe1.fr            0.493045611108