In [9]:
import nltk
# Fetch the NLTK resources the later cells rely on: the "punkt" tokenizer
# models, the stopword lists and the WordNet corpus.
for resource in ("punkt", "stopwords"):
    nltk.download(resource)
# Left as the final expression so the cell still displays the call's return value.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lorenzo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

Implementazione TFIDF

In [46]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as sw
import string
import re

class LemmaTokenizer(object):
    """Callable tokenizer that turns a document into a list of filtered WordNet lemmas.

    An instance is passed as the ``tokenizer`` argument of the TF-IDF
    vectorizer in this notebook; calling it with a document string returns
    the lemmas that survive the filters described in ``__call__``.
    """

    # Compiled once for the class instead of once per document.
    # It is applied with ``match``, so it only tests whether a lemma STARTS
    # with a digit; digits later in the lemma are not detected.
    _LEADING_DIGITS = re.compile("[0-9]+")

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, document):
        """Tokenize ``document`` and return the lemmas that pass every filter.

        A lemma is kept when all of the following hold:

        * it is not a substring of ``string.punctuation``;
        * its length is between 4 and 15 characters, both included;
        * it does not start with a digit.

        :param document: text of a single document
        :return: list of lemma strings, in order of appearance
        """
        lemmas = []

        for token in word_tokenize(document):
            lemma = self.lemmatizer.lemmatize(token.strip())
            if (
                lemma not in string.punctuation
                and 3 < len(lemma) < 16
                and not self._LEADING_DIGITS.match(lemma)
            ):
                lemmas.append(lemma)

        return lemmas

Trovo i percorsi di tutti i file dei documenti

In [47]:
# Corpus folder, given relative to the current working directory.
rel_path = "data_sets/T-newsgroups/"
# Entries are ordered by the integer value of their name (key=int), i.e.
# numerically rather than alphabetically, and then joined onto the folder path.
# key=int raises ValueError if any entry name is not a valid integer.
documents = [
    os.path.join(rel_path, entry)
    for entry in sorted(os.listdir(rel_path), key=int)
]

Tokenizzo tutte le parole dei documenti tramite il LemmaTokenizer eliminando le stopwords

In [48]:
# NLTK's English stopword list extended with extra tokens: contraction pieces
# such as "'ll" / "n't" and short forms such as 'wa' / 'ha' / 'doe'
# (presumably what the tokenizer + lemmatizer emit for "was" / "has" / "does" - TODO confirm).
stopwords = sw.words('english') + [
    "'d", "'ll", "'re", "'s", "'ve",
    'could', 'doe', 'ha', 'might', 'must',
    "n't", 'need', 'sha', 'wa', 'wo', 'would',
]
lemmaTokenizer = LemmaTokenizer()
# Turns the token lists into numbers stored in a numpy float matrix so that they can be analysed.
# input="filename" means the items handed to fit_transform are paths to read, not raw text.
vectorizer = TfidfVectorizer(
    input="filename",
    tokenizer=lemmaTokenizer,
    stop_words=stopwords,
)
tfidf_X = vectorizer.fit_transform(documents)
tfidf_X

<4000x40684 sparse matrix of type '<class 'numpy.float64'>'
	with 357446 stored elements in Compressed Sparse Row format>

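PRINCIPAL COMPONENT ANALYSIS (PCA) - approximated here with Truncated SVD, which works directly on the sparse TF-IDF matrix without centering it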

In [57]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
# Reduce the TF-IDF matrix to 2000 SVD components; the seed is fixed so the
# decomposition is repeatable.
svd = TruncatedSVD(n_components=2000, random_state=42)
svd_X = svd.fit_transform(tfidf_X)
# Sum of the per-component explained-variance ratios.
print(f"Total variance explained: {svd.explained_variance_ratio_.sum():.2f}")
# Last expression of the cell: displays (n_documents, n_components).
svd_X.shape

Total variance explained: 0.86


(4000, 2000)

In [50]:
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

In [51]:
def generate_wordclouds(X, X_tfidf, k, word_positions):
    """Cluster X with K-means with the specified k, and generate one wordcloud per cluster.

    For every cluster, the TF-IDF weights of its documents are summed term by
    term and the ``top_count`` terms with the largest totals are drawn, each
    sized by its summed weight. One matplotlib figure is created per cluster.

    :input X: numpy.array or numpy.matrix to cluster
    :input X_tfidf: sparse matrix with TFIDF values; row i must describe the same
        document as row i of X, because the cluster labels of X are used to select its rows
    :input k: the k to be used in K-means
    :input word_positions: dictionary with pairs as word index (in vocabulary) -> word
    :return cluster_ids: set with the clusters ids
    """
    # n_jobs is deliberately not passed: KMeans deprecated it in scikit-learn
    # 0.23 and removed it in 1.0, where it raises TypeError. It only controlled
    # parallelism, so the clustering result is unaffected.
    model = KMeans(n_clusters=k, random_state=42)
    y_pred = model.fit_predict(X)
    cluster_ids = set(y_pred)
    top_count = 100
    # sorted() makes the figure order explicit instead of relying on set iteration order.
    for cluster_id in sorted(cluster_ids):
        # compute the total tfidf for each term in the cluster
        tfidf = X_tfidf[y_pred == cluster_id]
        tfidf_sum = np.sum(tfidf, axis=0)  # 2-D result with a single row
        # flatten to a 1-D numpy.array with one entry per column of X_tfidf
        tfidf_sum = np.asarray(tfidf_sum).reshape(-1)
        # argsort is ascending, so the last top_count positions hold the largest totals
        top_indices = tfidf_sum.argsort()[-top_count:]

        term_weights = {word_positions[idx]: tfidf_sum[idx] for idx in top_indices}
        wc = WordCloud(width=1200, height=800, background_color="white")
        wordcloud = wc.generate_from_frequencies(term_weights)
        fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
        ax.imshow(wordcloud, interpolation='bilinear')

        ax.axis("off")
        fig.suptitle(f"Cluster {cluster_id}")
    return cluster_ids

In [58]:
# Invert the fitted vocabulary (term -> column index) into column index -> term.
word_positions = {index: term for term, index in vectorizer.vocabulary_.items()}
#cluster_ids = generate_wordclouds(svd_X, tfidf_X, 4, word_positions)

In [59]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth

# n_jobs is deliberately not passed: KMeans deprecated it in scikit-learn 0.23
# and removed it in 1.0, where it raises TypeError. It only controlled
# parallelism, so the clustering result is unaffected.
model = KMeans(n_clusters=4, random_state=42)
y_pred = model.fit_predict(svd_X)
cluster_ids2 = set(y_pred)
min_support = 0.3
# Column labels ordered by vocabulary index, so label i names column i of
# tfidf_X by construction (rather than relying on alphabetical order matching it).
dist_words = [term for _, term in sorted(word_positions.items())]

# The loop variable is called cluster_id (not id) so the builtin id() is not
# shadowed for the rest of the kernel session.
for cluster_id in sorted(cluster_ids2):
    print(f"FP-Growth results on Cluster {cluster_id} with min support {min_support}")

    # Rows of the documents assigned to this cluster. Boolean-mask indexing
    # returns a new matrix, so the assignment below does not modify tfidf_X.
    tfidf = tfidf_X[y_pred == cluster_id]
    # to create a dataframe needs a matrix with just 0 (if absent) and 1 (if present)
    tfidf[tfidf > 0] = 1
    df = pd.DataFrame.sparse.from_spmatrix(tfidf, columns=dist_words)
    # Frequent itemsets with support >= min_support, most frequent first.
    fset = fpgrowth(df, min_support, use_colnames=True).sort_values(by='support', ascending=False)
    print(fset)
    

FP-Growth results on Cluster 0 with min support 0.3
     support           itemsets
4   0.649874            (space)
2   0.619647           (writes)
3   0.574307          (article)
13  0.511335  (writes, article)
5   0.380353             (like)
8   0.360202             (also)
9   0.345088             (much)
12  0.345088    (space, writes)
6   0.324937            (orbit)
14  0.324937   (space, article)
0   0.319899           (system)
1   0.317380          (shuttle)
7   0.317380             (year)
11  0.304786            (think)
10  0.302267             (time)
FP-Growth results on Cluster 1 with min support 0.3
     support                 itemsets
0   0.669675                 (writes)
1   0.617329                   (game)
2   0.588448                (article)
16  0.555957        (writes, article)
12  0.509025                   (year)
3   0.447653                   (last)
4   0.440433                   (team)
5   0.398917                  (think)
15  0.391697           (writes, game)
6   