In [267]:
import collections
import os
import string
import sys

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [268]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oussa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [269]:
data_path = "../data/txt/"

# Choix d'une décennie et du nombre de clusters

In [270]:
DECADE = '1910'
N_CLUSTERS = 6

# Chargement des fichiers de la décennie

In [271]:
files = [f for f in sorted(os.listdir(data_path)) if f"_{DECADE[:-1]}" in f]

In [272]:
# Exemple de fichiers
files[:5]

['Bxl_1910_Tome_I1_Part_1.txt',
 'Bxl_1910_Tome_I1_Part_2.txt',
 'Bxl_1910_Tome_I1_Part_3.txt',
 'Bxl_1910_Tome_I1_Part_4.txt',
 'Bxl_1910_Tome_I1_Part_5.txt']

In [273]:
texts = [open(data_path + f,encoding='utf-8').read() for f in files]

In [274]:
# Exemple de textes
texts[0][:400]

'\x0cV I L L E\n\nD E\n\nB R U X E L L E S .\no\n\nBULLETIN COMMUNAL\nA N N É E\n\nP R E M I È R E\n\nTOME\n\nC O M P T E\n\nR E N D U\n\n1910.\n\nP A R T I E .\n\nI.\n\nD E S\n\nS É A N C E S .\n\nB R U X E L L E S\nTYPOGRAPHIE E T LITHOGRAPHIE E . G U Y O T ,\n\nR U E P A C H E C O , 18\n\n\x0c\x0cN°l.\n\nCOMPTE RENDU D E L A SÉANCE D U 1 7 JANVIER 1 9 1 0 .\n\nVILLE\n\nDE\n\nBULLETIN\n\nBRUXELLES\n\nCOMMUNAL\n\nAnnée\n\nCONSEIL\n\n1910\n\nC O M M U N A L\n\n'

# Vectorisation du texte

In [275]:
def process_text(text, stem=True):
    """ Tokenize text and remove punctuation """
    text = text.translate(string.punctuation)
    tokens = word_tokenize(text)
    return tokens

In [276]:
vectorizer = TfidfVectorizer(tokenizer=process_text,
                                stop_words=stopwords.words('french'),
                                max_df=0.5,
                                min_df=0.1,
                                lowercase=True)

In [277]:
%time tfidf_vectors = vectorizer.fit_transform(texts)

Wall time: 42.6 s


In [278]:
tfidf_vectors

<170x7097 sparse matrix of type '<class 'numpy.float64'>'
	with 252382 stored elements in Compressed Sparse Row format>

In [279]:
# Exemple de vecteur TFIDF
pd.Series(
    tfidf_vectors[0].toarray()[0],
    index=vectorizer.get_feature_names()
    ).sort_values(ascending=False)

enseignant            0.194609
perfectionnement      0.160739
theodor               0.151363
l'honorable           0.140094
sacrés                0.129844
                        ...   
les+conditions        0.000000
les+conseillers.+m    0.000000
les+cours             0.000000
les+enfants           0.000000
'+                    0.000000
Length: 7097, dtype: float64

# Comprendre les vecteurs et leurs "distances"

In [280]:
from scipy.spatial.distance import cosine

In [281]:
cosine([1, 2, 3], [1, 2, 3])

0.0

In [282]:
cosine([1, 2, 3], [1, 2, 2])

0.02004211298777725

In [283]:
cosine([1, 2, 3], [2, 2, 2])

0.07417990022744858

In [284]:
tfidf_array = tfidf_vectors.toarray()

In [285]:
tfidf_array[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [286]:
texts[0][:200]

'\x0cV I L L E\n\nD E\n\nB R U X E L L E S .\no\n\nBULLETIN COMMUNAL\nA N N É E\n\nP R E M I È R E\n\nTOME\n\nC O M P T E\n\nR E N D U\n\n1910.\n\nP A R T I E .\n\nI.\n\nD E S\n\nS É A N C E S .\n\nB R U X E L L E S\nTYPOGRAPHIE E T '

In [287]:
tfidf_array[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [288]:
cosine(tfidf_array[0], tfidf_array[1])

0.712899401391456

# Clustering des vecteurs TFIDF

Article intéressant sur le KMeans clustering :
- https://medium.com/dataseries/k-means-clustering-explained-visually-in-5-minutes-b900cc69d175

In [289]:
km_model = KMeans(n_clusters=N_CLUSTERS)

In [290]:
km_model.fit(tfidf_vectors)

KMeans(n_clusters=7)

In [291]:
clustering = collections.defaultdict(list)

for idx, label in enumerate(km_model.labels_):
    clustering[label].append(files[idx])

In [292]:
pprint(dict(clustering))

{0: ['Bxl_1910_Tome_I1_Part_8.txt',
     'Bxl_1910_Tome_I2_1_Part_11.txt',
     'Bxl_1910_Tome_I2_1_Part_12.txt',
     'Bxl_1910_Tome_I2_1_Part_7.txt',
     'Bxl_1910_Tome_I2_1_Part_8.txt',
     'Bxl_1910_Tome_I2_1_Part_9.txt',
     'Bxl_1910_Tome_I2_2_Part_12.txt',
     'Bxl_1911_Tome_I1_Part_13.txt',
     'Bxl_1911_Tome_I2_1_Part_10.txt',
     'Bxl_1911_Tome_I2_1_Part_4.txt',
     'Bxl_1911_Tome_I2_1_Part_5.txt',
     'Bxl_1911_Tome_I2_1_Part_6.txt',
     'Bxl_1911_Tome_I2_1_Part_7.txt',
     'Bxl_1911_Tome_I2_1_Part_8.txt',
     'Bxl_1911_Tome_I2_2_Part_1.txt',
     'Bxl_1911_Tome_I2_2_Part_11.txt',
     'Bxl_1912_Tome_I1_2_Part_7.txt',
     'Bxl_1912_Tome_I2_1_Part_10.txt',
     'Bxl_1912_Tome_I2_1_Part_5.txt',
     'Bxl_1912_Tome_I2_1_Part_6.txt',
     'Bxl_1912_Tome_I2_1_Part_7.txt',
     'Bxl_1912_Tome_I2_1_Part_9.txt',
     'Bxl_1912_Tome_I2_2_Part_12.txt',
     'Bxl_1913_Tome_I2_1_Part_5.txt',
     'Bxl_1913_Tome_I2_1_Part_6.txt',
     'Bxl_1913_Tome_I2_1_Part_8.txt',
     'Bx