# Extraction de Keywords

## Imports

In [None]:
import os
import yake

from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


## Préparation

### Input

In [None]:
# Year to analyse; it is compared against the year part of each file name
year_selected = 1969

### Nombre de fichiers par mois

In [None]:
# List the .txt files belonging to the selected year
data_path = "../../data/txt/"

files = []
for name in os.listdir(data_path):
    if not name.endswith('.txt'):
        continue
    # The third "_"-separated field is split on "-"; its first part is the year
    if name.split('_')[2].split('-')[0] == str(year_selected):
        files.append(name)

# Print how many files were identified
print(len(files))

In [None]:
# Count the selected files per year and per month
count_year = defaultdict(int)
count_month = defaultdict(int)

for fname in files:
    # The third "_"-separated field holds year, month and day separated by "-"
    date_fields = fname.split("_")[2].split("-")
    year, month, day = date_fields[0], date_fields[1], date_fields[2]
    count_year[year] += 1
    count_month[month] += 1
print(f"Il y a {count_year[str(year_selected)]} fichiers distribués sur {len(count_month)} mois pour l'année choisie")

In [None]:
# Order the monthly counts by numeric month, then draw them as a bar chart
count_month_sorted = {month: count_month[month] for month in sorted(count_month, key=int)}

index = np.arange(len(count_month_sorted))
plt.bar(index, count_month_sorted.values())
plt.xticks(index, count_month_sorted.keys(), fontsize=8, rotation=30)
plt.xlabel('Mois')
plt.ylabel('# documents')
plt.title('Nombre de documents par mois')
plt.show()

### Création d'un fichier contenant le texte de tous les journaux de l'année choisie

In [None]:
# Gather every line of every selected file into a single list
contents = []
for filename in files:
    # The handle gets its own name so the loop variable is not rebound to a file object
    with open(os.path.join(data_path, filename), 'r', encoding='utf-8') as handle:
        contents.extend(handle.readlines())

In [None]:
# Write the whole content to a temporary file
# NOTE(review): data_path is "../../data/txt/" while this path starts at "../data" — confirm the relative depth is intended
temp_path = '../data/tmp'
# makedirs also creates missing parent directories and accepts an already existing one
os.makedirs(temp_path, exist_ok=True)
with open(os.path.join(temp_path, f'{year_selected}.txt'), 'w', encoding='utf-8') as f:
    f.write(' '.join(contents))

# Read the file back and look at the raw text, including its "noise"
with open(os.path.join(temp_path, f'{year_selected}.txt'), 'r', encoding='utf-8') as f:
    before = f.read()

before[:500]

### Nettoyage du fichier

In [None]:
# NLTK French stopwords extended with additional words, stored as a set
sw = set(stopwords.words("french"))
sw.update([
    "les", "plus", "cette", "fait", "faire", "être", "deux", "comme", "dont", "tout",
    "ils", "bien", "sans", "peut", "tous", "après", "ainsi", "donc", "cet", "sous",
    "celle", "entre", "encore", "toutes", "pendant", "moins", "dire", "cela", "non",
    "faut", "trois", "aussi", "dit", "avoir", "doit", "contre", "depuis", "autres",
    "van", "het", "autre", "jusqu",
])

In [None]:
# Define the cleaning function
def clean_text(year, folder=None):
    """Write a cleaned copy of ``<year>.txt`` to ``<year>_clean.txt``.

    Each input line is tokenized with ``nltk.wordpunct_tokenize``. A token is
    kept only if it is longer than two characters, purely alphabetic and its
    lower-cased form is not in the module-level stopword collection ``sw``.
    Kept tokens are upper-cased and written space-separated, one output line
    per input line.

    Args:
        year: Year used to build the input and output file names.
        folder: Directory containing the files; when ``None`` the bare file
            names are used (relative to the current directory).

    Returns:
        A message naming the output path.
    """
    # Build the paths from the `year` argument rather than a global variable
    if folder is None:
        input_path = f"{year}.txt"
        output_path = f"{year}_clean.txt"
    else:
        input_path = f"{folder}/{year}.txt"
        output_path = f"{folder}/{year}_clean.txt"
    # Context managers guarantee the output is flushed and closed on return
    with open(input_path, 'r', encoding='utf-8') as source:
        with open(output_path, "w", encoding='utf-8') as output:
            for line in source:
                words = nltk.wordpunct_tokenize(line)
                kept = [w.upper() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
                output.write(" ".join(kept) + '\n')
    return f'Output has been written in {output_path}!'

# Clean the file of the selected year stored in temp_path
clean_text(year_selected, folder=temp_path)

In [None]:
# File names of the concatenated text and of its cleaned version
integrated_file = f'{year_selected}.txt'
cleaned_file = f'{year_selected}_clean.txt'

# Load the cleaned text to check the result
cleaned_path = os.path.join(temp_path, cleaned_file)
with open(cleaned_path, 'r', encoding='utf-8') as f:
    after = f.read()

after[:500]

### Analyse du vocabulaire

In [None]:
# Load the content of both files (before and after cleaning) as strings
with open(os.path.join(temp_path, integrated_file), 'r', encoding='utf-8') as f:
    text = f.read()

with open(os.path.join(temp_path, cleaned_file), 'r', encoding='utf-8') as f:
    text_cleaned = f.read()

In [None]:
# Tokenize the text before cleaning and report the number of tokens
words = nltk.wordpunct_tokenize(text)
print(f"{len(words)} words found in file before cleaning")

# Tokenize the cleaned text and report the number of tokens
words_cleaned = nltk.wordpunct_tokenize(text_cleaned)
print(f"{len(words_cleaned)} words found in cleaned file")

In [None]:
# Preview the first ten tokens of the text before cleaning
words[:10]

In [None]:
# Preview the first ten tokens of the cleaned text
words_cleaned[:10]

In [None]:
# Drop stopwords, short tokens and non-alphabetic tokens; keep the rest lower-cased
kept = []
for token in words:
    lowered = token.lower()
    if len(token) > 2 and token.isalpha() and lowered not in sw:
        kept.append(lowered)
voc = set(kept)
print(f"{len(kept)} words kept ({len(voc)} different word forms)")

In [None]:
# Build a frequency distribution of the kept words and show the 10 most common
fdist = nltk.FreqDist(kept)
fdist.most_common(10)

In [None]:
# Plot the n most frequent words as a cumulative curve
n = 10
fdist.plot(n, cumulative=True)

In [None]:
# Show the first 30 hapaxes reported by the frequency distribution
fdist.hapaxes()[:30]

In [None]:
# Show the n longest word forms of the vocabulary
n = 30
sorted(voc, key=len, reverse=True)[:n]

## Extraire les mots clés de tous les articles d'une année choisie avec Yake

https://github.com/LIAAD/yake

In [None]:
# Instantiate the keyword extractor (lan="fr", top=50)
kw_extractor = yake.KeywordExtractor(lan="fr", top=50)
kw_extractor

In [None]:
# Find keywords per file linked to the specific year
for f in sorted(files):
    # The context manager closes each file instead of leaking one handle per iteration
    with open(os.path.join(data_path, f), 'r', encoding="utf-8") as handle:
        text = handle.read()
    keywords = kw_extractor.extract_keywords(text)
    print(keywords)

In [None]:
# Find keywords from the integrated file linked to the specific year
# The context manager closes the file once its content has been read
with open(os.path.join(temp_path, integrated_file), 'r', encoding="utf-8") as handle:
    text = handle.read()
keywords_int = kw_extractor.extract_keywords(text)
print(text[:500])
print(keywords_int)

In [None]:
# Find keywords from the cleaned file linked to the specific year
# The context manager closes the file once its content has been read
with open(os.path.join(temp_path, cleaned_file), 'r', encoding="utf-8") as handle:
    text = handle.read()
keywords_cleaned = kw_extractor.extract_keywords(text)
print(text[:500])
print(keywords_cleaned)

In [None]:
# Keep the two-word keywords (bigrams) of each file
for f in sorted(files):
    # Extract this file's own keywords: a `keywords` value left over from a
    # previous loop would report the same keywords for every file.
    with open(os.path.join(data_path, f), 'r', encoding="utf-8") as handle:
        file_keywords = kw_extractor.extract_keywords(handle.read())
    kept = [kw for kw, _score in file_keywords if len(kw.split()) == 2]
    print(f"{f} mentions these keywords: {', '.join(kept)}...")

In [None]:
# Keep the two-word keywords (bigrams) of the integrated file
kept = [kw for kw, _score in keywords_int if len(kw.split()) == 2]
print(f"{integrated_file} mentions these keywords: {', '.join(kept)}...")

In [None]:
# Keep the two-word keywords (bigrams) of the cleaned file
kept = [kw for kw, _score in keywords_cleaned if len(kw.split()) == 2]
print(f"{cleaned_file} mentions these keywords: {', '.join(kept)}...")