In [1]:
import collections
import os
import string
import sys

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oussa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data_path = "../data/txt/"

# Choix d'une décennie et du nombre de clusters

In [31]:
DECADE = '1910'
N_CLUSTERS = 10

# Chargement des fichiers de la décennie

In [5]:
files = [f for f in sorted(os.listdir(data_path))]

In [6]:
# Exemple de fichiers
files[:5]

['Bxl_1847_Tome_I1_Part_1.txt',
 'Bxl_1847_Tome_I1_Part_2.txt',
 'Bxl_1847_Tome_I1_Part_3.txt',
 'Bxl_1847_Tome_I1_Part_4.txt',
 'Bxl_1847_Tome_I1_Part_5.txt']

In [7]:
texts = [open(data_path + f,encoding='utf-8').read() for f in files]

In [8]:
# Exemple de textes
texts[0][:400]

"V I L L E DE\n\nBRUXELLES\n\nbulletin ires 8éanas\nDl!\n\nCONSEIL\n\nCOMMUNAL\n\nANNÉE\n\n1847.\n\n\x0cAU\n\n\x0cVILLE DE B R U X E L L E S .\n\nbulletin\n\nCONSEIL\n\nàes\n\nSéances\n\nCOMMUNAL.\n\nANNÉE\n\n1847.\n\nBRUXELLES,\nIMPRIMERIE\n\nD E J . H. B R I A R D ,\n\nRITE N E U V E , 3 1 , FAUBOURG DE N A M U R ,\n\n1 84 8\n\n\x0cDE!\n\nDU CONSEI\nDîBÏ\n\nE. - Communication\nconclusions de la section des\ndu nouvel hospice pour les av\n\nEnraisonde l'ab"

# Vectorisation du texte

In [9]:
def process_text(text, stem=True):
    """ Tokenize text and remove punctuation """
    text = text.translate(string.punctuation)
    tokens = word_tokenize(text)
    return tokens

In [11]:
vectorizer = TfidfVectorizer(tokenizer=process_text,
                                stop_words=stopwords.words('french'),
                                max_df=0.5,
                                min_df=0.1,
                                lowercase=True)

In [12]:
%time tfidf_vectors = vectorizer.fit_transform(texts)

Wall time: 21min 5s


In [13]:
tfidf_vectors

<2824x6863 sparse matrix of type '<class 'numpy.float64'>'
	with 4064664 stored elements in Compressed Sparse Row format>

In [14]:
# Exemple de vecteur TFIDF
pd.Series(
    tfidf_vectors[0].toarray()[0],
    index=vectorizer.get_feature_names()
    ).sort_values(ascending=False)

marchés          0.439471
couverts         0.182487
l'honorable      0.180322
mètre            0.168858
pieds            0.166052
                   ...   
le+service       0.000000
le+territoire    0.000000
leblanc          0.000000
leclercq         0.000000
#                0.000000
Length: 6863, dtype: float64

# Comprendre les vecteurs et leurs "distances"

In [15]:
from scipy.spatial.distance import cosine

In [16]:
cosine([1, 2, 3], [1, 2, 3])

0.0

In [17]:
cosine([1, 2, 3], [1, 2, 2])

0.02004211298777725

In [18]:
cosine([1, 2, 3], [2, 2, 2])

0.07417990022744858

In [19]:
tfidf_array = tfidf_vectors.toarray()

In [20]:
tfidf_array[0]

array([0.        , 0.00298509, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [21]:
texts[0][:200]

'V I L L E DE\n\nBRUXELLES\n\nbulletin ires 8éanas\nDl!\n\nCONSEIL\n\nCOMMUNAL\n\nANNÉE\n\n1847.\n\n\x0cAU\n\n\x0cVILLE DE B R U X E L L E S .\n\nbulletin\n\nCONSEIL\n\nàes\n\nSéances\n\nCOMMUNAL.\n\nANNÉE\n\n1847.\n\nBRUXELLES,\nIMPRIMERIE\n'

In [22]:
tfidf_array[0]

array([0.        , 0.00298509, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [23]:
cosine(tfidf_array[0], tfidf_array[1])

0.5920833051953662

# Clustering des vecteurs TFIDF

Article intéressant sur le KMeans clustering :
- https://medium.com/dataseries/k-means-clustering-explained-visually-in-5-minutes-b900cc69d175

In [32]:
km_model = KMeans(n_clusters=N_CLUSTERS)

In [33]:
km_model.fit(tfidf_vectors)

KMeans(n_clusters=10)

In [34]:
clustering = collections.defaultdict(list)

for idx, label in enumerate(km_model.labels_):
    clustering[label].append(files[idx])

In [35]:
pprint(dict(clustering))

{0: ['Bxl_1896_Tome_I1_2_Part_1.txt',
     'Bxl_1939_Tome_III_Part_2.txt',
     'Bxl_1939_Tome_III_Part_4.txt',
     'Bxl_1939_Tome_III_Part_5.txt',
     'Bxl_1939_Tome_I_Part_8.txt',
     'Bxl_1949_Tome_II_Part_3.txt',
     'Bxl_1949_Tome_I_Part_1.txt',
     'Bxl_1949_Tome_I_Part_5.txt',
     'Bxl_1953_Tome_II_Part_3.txt',
     'Bxl_1953_Tome_I_Part_12.txt',
     'Bxl_1953_Tome_I_Part_4.txt',
     'Bxl_1953_Tome_I_Part_8.txt',
     'Bxl_1958_Tome_II_Part_10.txt',
     'Bxl_1960_Tome_II1_Part_3.txt',
     'Bxl_1962_Tome_II1_Part_3.txt',
     'Bxl_1963_Tome_II1_Part_5.txt',
     'Bxl_1963_Tome_II1_Part_6.txt',
     'Bxl_1963_Tome_II1_Part_8.txt',
     'Bxl_1963_Tome_II1_Part_9.txt',
     'Bxl_1963_Tome_II2_Part_1.txt',
     'Bxl_1963_Tome_II2_Part_10.txt',
     'Bxl_1963_Tome_II2_Part_2.txt',
     'Bxl_1963_Tome_II2_Part_3.txt',
     'Bxl_1963_Tome_II2_Part_4.txt',
     'Bxl_1963_Tome_II2_Part_5.txt',
     'Bxl_1963_Tome_II2_Part_8.txt',
     'Bxl_1963_Tome_II2_Part_9.txt',
     'Bxl_19