In [None]:
import collections
import os
import string
import sys

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import nltk

# Fetch the NLTK resources this notebook relies on (already up-to-date packages are not re-downloaded).
nltk.download('punkt')      # tokenizer models used by word_tokenize
# NOTE(review): newer NLTK releases load 'punkt_tab' instead of 'punkt' — confirm against the installed version
nltk.download('punkt_tab')
nltk.download('stopwords')  # corpus behind stopwords.words('french'), needed when the vectorizer is built

In [None]:
# Folder (relative path) holding the text files that are listed and read below
data_path = "../data/txt/"

# Choix d'une décennie et du nombre de clusters

In [None]:
# Decade to analyse, written as its first year; file names are matched on it without the last digit
DECADE = '1870'
# Number of clusters requested from KMeans
N_CLUSTERS = 5

# Chargement des fichiers de la décennie

In [None]:
# Keep, in sorted order, the entries of data_path whose name contains "_" followed by
# the decade without its last digit (e.g. "_187" for '1870').
decade_marker = f"_{DECADE[:-1]}"
files = [name for name in sorted(os.listdir(data_path)) if decade_marker in name]

In [None]:
# Sample of the selected file names (first five)
files[:5]

In [None]:
# Read every selected file into memory, one string per document, in the same order as `files`.
# The context manager closes each handle immediately instead of leaving it to the garbage collector.
texts = []
for name in files:
    # NOTE(review): assumes the corpus is UTF-8 encoded — confirm. Without an explicit
    # encoding, open() uses the platform default, which differs between systems.
    with open(os.path.join(data_path, name), encoding="utf-8") as handle:
        texts.append(handle.read())

In [None]:
# Sample of text: first 400 characters of the first document
texts[0][:400]

# Vectorisation du texte

In [None]:
# Translation table mapping every ASCII punctuation character to a space, built once.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def process_text(text, stem=True):
    """Tokenize text after stripping ASCII punctuation.

    Args:
        text: Raw document string.
        stem: Unused; kept so existing callers that pass it keep working.

    Returns:
        List of tokens produced by ``word_tokenize``.
    """
    # str.translate() needs a table from str.maketrans(); handing it string.punctuation
    # directly strips nothing and instead rewrites control characters (e.g. "\n" -> "+").
    # Punctuation becomes a space rather than being deleted so that elided or hyphenated
    # forms ("l'homme") split into separate tokens instead of fusing together.
    text = text.translate(_PUNCT_TO_SPACE)
    tokens = word_tokenize(text)
    return tokens

In [None]:
# TF-IDF vectorizer built on the custom tokenizer and NLTK's French stop-word list.
# max_df / min_df are document-frequency bounds given as proportions of the corpus:
# terms found in more than 50% or in fewer than 10% of the documents are dropped.
vectorizer = TfidfVectorizer(
    tokenizer=process_text,
    stop_words=stopwords.words('french'),
    max_df=0.5,
    min_df=0.1,
    lowercase=True,
)

In [None]:
# Learn the vocabulary and compute the TF-IDF matrix for all documents; %time reports how long it takes
%time tfidf_vectors = vectorizer.fit_transform(texts)

In [None]:
# Display the resulting TF-IDF matrix object
tfidf_vectors

In [None]:
# Example TF-IDF vector: term weights of the first document, highest first
pd.Series(
    tfidf_vectors[0].toarray()[0],
    # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
    # (available since 1.0) is its replacement.
    index=vectorizer.get_feature_names_out()
    ).sort_values(ascending=False)

# Comprendre les vecteurs et leurs "distances"

In [None]:
from scipy.spatial.distance import cosine

In [None]:
# Same vector twice: the cosine distance is 0 (up to floating-point rounding)
cosine([1, 2, 3], [1, 2, 3])

In [None]:
# One component changed: the direction shifts slightly, so the distance becomes small but positive
cosine([1, 2, 3], [1, 2, 2])

In [None]:
# [2, 2, 2] points further away from [1, 2, 3] than [1, 2, 2] does, giving a larger distance
cosine([1, 2, 3], [2, 2, 2])

In [None]:
# Dense array version of the TF-IDF matrix, one row per document
tfidf_array = tfidf_vectors.toarray()

In [None]:
# TF-IDF weights of the first document
tfidf_array[0]

In [None]:
# First 200 characters of the first document
texts[0][:200]

In [None]:
# TF-IDF weights of the first document
# NOTE(review): this repeats the expression of an earlier cell — confirm whether both are needed
tfidf_array[0]

In [None]:
# Cosine distance between the TF-IDF vectors of the first two documents
cosine(tfidf_array[0], tfidf_array[1])

# Clustering des vecteurs TFIDF

Article intéressant sur le KMeans clustering :
- https://medium.com/dataseries/k-means-clustering-explained-visually-in-5-minutes-b900cc69d175

In [None]:
# Fix the seed: k-means starts from randomly chosen centroids, so without random_state
# the cluster assignments can change from one run to the next.
km_model = KMeans(n_clusters=N_CLUSTERS, random_state=42)

In [None]:
# Run k-means on the TF-IDF matrix; the per-document cluster labels are then read from km_model.labels_
km_model.fit(tfidf_vectors)

In [None]:
# Group file names by cluster: labels_[i] is the cluster assigned to the document read from files[i]
clustering = collections.defaultdict(list)

for doc_index, cluster_id in enumerate(km_model.labels_):
    cluster_members = clustering[cluster_id]
    cluster_members.append(files[doc_index])

In [None]:
# Pretty-print the cluster label -> file names mapping as a plain dict
pprint(dict(clustering))