In [None]:
# ! pip install gensim
# ! pip install numpy
# ! pip install nltk
# ! pip install pyenchant
# ! pip install seaborn

In [None]:
import nltk
import numpy as np
import enchant
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Fetch the NLTK data packages used by the preprocessing code in this notebook.
nltk.download('punkt')  # tokenizer models; presumably for word_tokenize (imported above) - confirm still needed
nltk.download('wordnet')  # WordNet data read by WordNetLemmatizer
nltk.download('stopwords')  # word lists behind stopwords.words('english')
nltk.download('omw-1.4')  # Open Multilingual Wordnet; presumably required by the WordNet reader - confirm
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag
nltk.download('brown')  # NOTE(review): the Brown corpus is not referenced in the visible cells - confirm it is needed

In [None]:
# Read every message stored under ./data/<category>/ into [text, category] rows.
data = []
# Root directory holding one sub-folder per category.
p = os.path.join(os.getcwd(), 'data')
# os.listdir() returns entries in arbitrary, filesystem-dependent order; sorting
# keeps the row order - and therefore the seeded train/test split - stable
# across machines and runs.
for folder in sorted(os.listdir(p)):
    folder_path = os.path.join(p, folder)
    # Stray files next to the category folders are not categories: skip them.
    if os.path.isfile(folder_path):
        continue
    # Everything else is treated as a category folder full of messages.
    mails = sorted(os.listdir(folder_path))
    for mail in mails:
        path = os.path.join(folder_path, mail)
        # A nested directory cannot be opened as a message; ignore it.
        if not os.path.isfile(path):
            continue
        # latin-1 maps every byte to a character, so decoding never raises.
        with open(path, encoding="latin-1") as file:
            text = file.read()
        # Column 0 is the raw text, column 1 the folder name (the category).
        data.append([text, folder])
df = pd.DataFrame(data)

In [None]:
# Display the loaded corpus: column 0 holds the raw message text, column 1 the source folder name.
df

In [None]:
from nltk.corpus import wordnet
# używaliśmy tego w różnych wersjach preprocesingu ostatecznie porzuciliśmy
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag (lower-cased) selects adjective, noun, verb or
    adverb. Any other tag falls back to noun.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADV``.
    """
    tag = nltk.pos_tag([word])[0][1][0].lower()
    # The default tagger emits Penn Treebank tags, where adjectives are
    # JJ/JJR/JJS. The lookup key must therefore be "j"; the previous "a" key
    # never matched, so every adjective was silently treated as a noun.
    tag_dict = {"j": wordnet.ADJ,
                "n": wordnet.NOUN,
                "v": wordnet.VERB,
                "r": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [None]:
def Preprocesing(data, lematize = False, stem = True, eng = False):
    """Turn one raw message into a list of normalised word tokens.

    Steps, in order: drop everything before the first blank line (the
    technical header), lower-case, delete e-mail addresses and dotted
    expressions, replace every non-letter with a space, tokenise, remove
    English stop words and one-character tokens, then apply the optional
    lemmatisation, stemming and dictionary filter.

    Parameters
    ----------
    data : str
        Raw text of a single message.
    lematize : bool, default False
        Lemmatise each token with WordNet, using ``get_wordnet_pos`` for the
        part of speech.
    stem : bool, default True
        Apply the Porter stemmer. Stemming alone gave the best results, hence
        the defaults.
    eng : bool, default False
        Keep only tokens accepted by the enchant ``en_US`` dictionary. The
        check runs after lemmatisation/stemming, so it sees the reduced forms.

    Returns
    -------
    list of str
        The processed tokens; empty when the text contains no blank line.
    """
    # Keep everything after the first blank line, i.e. after the technical header.
    formated = data.split('\n\n')[1:]
    formated = ' '.join(formated).lower()

    # Remove e-mail addresses and dotted expressions (paths, host names, ...).
    # The address alternative comes first and covers the whole address;
    # previously "user@example.com" lost only "user@example" and the leftover
    # "com" survived as a token. Raw strings avoid invalid-escape warnings.
    formated = re.sub(r'[\w.+-]+@[\w.-]+|(?:\w+\.)+\w+', '', formated)

    # Replace punctuation, digits and underscores with spaces so only letters remain.
    formated = re.sub(r'[\W\d_]', ' ', formated)

    # Tokenisation
    tokenizer = RegexpTokenizer(r"\w+")
    formated = tokenizer.tokenize(formated)

    # Drop stop words and single-character tokens.
    stop_words = set(stopwords.words('english'))
    formated = [word for word in formated if word not in stop_words and len(word) > 1]

    # Lemmatisation
    if lematize:
        lemmatizer = WordNetLemmatizer()
        formated = [lemmatizer.lemmatize(word, pos = get_wordnet_pos(word)) for word in formated]

    # Stemming
    if stem:
        ps = PorterStemmer()
        formated = [ps.stem(word) for word in formated]

    # Dictionary filter
    if eng:
        d = enchant.Dict("en_US")
        formated = [word for word in formated if d.check(word)]

    return formated

In [None]:
# Rebuild the frame from the raw rows so this cell still works after later
# cells have rebound the name `df` to a label/category table.
df = pd.DataFrame(data)

# Hold out 40% of the messages; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[0],
    df[1],
    test_size=0.4, random_state=213)

# Preprocessing: every training message becomes a list of tokens.
X_train = [Preprocesing(text) for text in X_train]
# Preview a few documents only; echoing the whole list would print every
# token of every training message into the notebook.
X_train[:3]

In [None]:
def Tf_idf(data, max_t = 0.2, min_t = 0.01, only_short = 'True'):
    """Compute a TF-IDF matrix for tokenised documents.

    Parameters
    ----------
    data : iterable of list of str
        One token list per document; the tokens are joined with spaces before
        vectorising.
    max_t : float, default 0.2
        Passed to ``TfidfVectorizer`` as ``max_df``; with the default, terms
        present in more than 20% of the texts are dropped.
    min_t : float, default 0.01
        Passed as ``min_df``; with the default, terms present in fewer than
        1% of the texts are dropped.
    only_short : str, default 'True'
        Accepted but never read by this function.

    Returns
    -------
    (pandas.DataFrame, TfidfVectorizer)
        The dense TF-IDF weights with one column per retained term, and the
        fitted vectorizer.
    """
    documents = [" ".join(tokens) for tokens in data]
    vectorizer = TfidfVectorizer(
        strip_accents='unicode',
        stop_words='english',
        max_df=max_t,
        min_df=min_t,
    )
    weights = vectorizer.fit_transform(documents)
    table = pd.DataFrame(weights.toarray(), columns=vectorizer.get_feature_names_out())
    return table, vectorizer

In [None]:
# Number of clusters to look for; it also sets the document-frequency ceiling below.
true_k = 6
# Drop terms found in more than 1/true_k of the documents or in fewer than 5% of them.
X, vectorizer = Tf_idf(X_train, max_t=1/true_k, min_t=0.05)

In [None]:
# Display the TF-IDF table: one row per training document, one column per retained term.
X

In [None]:
from sklearn.decomposition import NMF

# Factorise the TF-IDF table into 20 non-negative components.
model = NMF(n_components=20).fit(X)
# Express every training document in terms of those components.
nmf_features = model.transform(X)


In [None]:
from sklearn.cluster import KMeans

# KMeans starts from randomly chosen centroids, so the seed is pinned;
# otherwise the table below changes every time the cell is run.
km = KMeans(n_clusters=true_k, random_state=0)
# Fit on the NMF features and keep the cluster label of every training document.
labels = km.fit_predict(nmf_features)

# Put each document's cluster label next to its true category and
# cross-tabulate them (categories as rows after the transpose).
df = pd.DataFrame({'label': labels, 'category': y_train})
pd.crosstab(df['label'], df['category']).T

In [None]:
# Both steps are randomised, so each gets a fixed seed to make the
# resulting clusters reproducible across runs.
# Step 1: project the TF-IDF table onto 50 components.
svd = TruncatedSVD(n_components=50, random_state=0)

# Step 2: cluster in the reduced space, using the same cluster count as the
# other KMeans runs in this notebook.
kmeans = KMeans(n_clusters=true_k, random_state=0)

# Chain the two steps so they can be fitted and applied together.
pipeline = make_pipeline(svd, kmeans)

In [None]:
# Fit the SVD + KMeans pipeline on the TF-IDF table, then label the same documents.
labels = pipeline.fit(X).predict(X)

# Cluster label versus true category, categories as rows.
df = pd.DataFrame({'label': labels, 'category': y_train})
pd.crosstab(df['label'], df['category']).T

In [None]:
# KMeans directly on the TF-IDF table. The seed is pinned because the random
# initialisation would otherwise give a different table on every run, and the
# cluster count comes from true_k like the other KMeans runs.
km = KMeans(n_clusters=true_k, random_state=0)
labels = km.fit_predict(X)

# Cluster label versus true category, categories as rows.
df = pd.DataFrame({'label': labels, 'category': y_train})
pd.crosstab(df['label'], df['category']).T

## Jeszcze ciężko stwierdzić, co działa najlepiej, więc odpalimy na samym tf_idf

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the TF-IDF table, with the library defaults throughout.
clustering = DBSCAN()
clustering.fit(X)

In [None]:
# How many documents DBSCAN put under each label.
plt.figure(figsize=(14,8)).clf()
# Labels are integers, not a continuous variable: discrete=True gives every
# label its own unit-wide bar centred on the label value, instead of
# automatically chosen bins that can merge or offset neighbouring labels.
ax = sns.histplot(clustering.labels_, discrete=True)
ax.set(xlabel='DBSCAN label', ylabel='Number of documents',
       title='Documents per DBSCAN label')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Seeded KMeans on the TF-IDF table; the labels come straight from the fitted model.
km = KMeans(n_clusters=true_k, random_state=0).fit(X)
labels = km.labels_

In [None]:
# How many documents fell into each KMeans cluster.
plt.figure(figsize=(14,8)).clf()
# Cluster labels are integers: discrete=True draws one unit-wide bar per
# cluster instead of binning the labels as if they were continuous.
ax = sns.histplot(labels, discrete=True)
ax.set(xlabel='Cluster label', ylabel='Number of documents',
       title='Documents per KMeans cluster')
plt.show()

## Najważniejsze słowa, wokół których umiejscowione są środki klastrów. Metaforycznie, bo to nadal kilkaset wymiarów

In [None]:
# Per cluster: column indices of the centroid, ordered from the largest weight to the smallest.
centroids = km.cluster_centers_.argsort()[:, ::-1]
# Column index -> term, as produced by the fitted vectorizer.
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    # The ten terms carrying the most weight in this cluster's centroid.
    top_terms = [terms[ind] for ind in centroids[i, :10]]
    print("Cluster %d:" % i, *top_terms)

In [None]:
# A w praktyce wygląda to tak:
def count_clustering_scores(X, cluster_num, model, score_fun, **model_kwargs):
    """Score a clustering model for one or several cluster counts.

    Parameters
    ----------
    X
        Data handed unchanged to ``fit_predict`` and to ``score_fun``.
    cluster_num : int or iterable of int
        A single cluster count, or the counts to try in order.
    model : callable
        Called as ``model(n_clusters=k, **model_kwargs)``; the result must
        provide ``fit_predict(X)``.
    score_fun : callable
        Called as ``score_fun(X, labels)``; its return value is the score.
    **model_kwargs
        Extra keyword arguments forwarded to ``model`` on every call, e.g.
        ``random_state=0`` to make a randomised model reproducible.

    Returns
    -------
    A single score when ``cluster_num`` is an integer, otherwise a list with
    one score per requested cluster count.
    """
    # Local import: the notebook's import cell does not provide `numbers`.
    import numbers

    # numbers.Integral also recognises NumPy integer scalars (e.g. a value
    # taken from np.arange), which a plain isinstance(..., int) check rejects.
    single = isinstance(cluster_num, numbers.Integral)
    cluster_num_iter = [cluster_num] if single else cluster_num

    scores = []
    for k in cluster_num_iter:
        model_instance = model(n_clusters=k, **model_kwargs)
        labels = model_instance.fit_predict(X)
        scores.append(score_fun(X, labels))

    return scores[0] if single else scores

In [None]:
from functools import partial

cluster_num_seq = range(2, 60) # Some metrics do not work with a single cluster, so start at 2
# KMeans starts from random centroids; binding a fixed seed keeps the curve
# identical from run to run.
seeded_kmeans = partial(KMeans, random_state=0)
silhouette_vec = count_clustering_scores(X, cluster_num_seq, seeded_kmeans, silhouette_score)
plt.plot(cluster_num_seq, silhouette_vec, 'bx-')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.title('Silhouette score by number of clusters')
plt.show()

In [None]:
def count_wcss_scores(X, k_max):
    """Return the within-cluster sum of squares (WCSS) for k = 1 .. k_max.

    For every k a ``KMeans`` model seeded with ``random_state=0`` is fitted
    on ``X``. ``KMeans.score`` returns the negated WCSS, so its sign is
    flipped before the value is stored.

    Parameters
    ----------
    X
        Data to cluster.
    k_max : int
        Largest number of clusters to try (inclusive).

    Returns
    -------
    list
        One WCSS value per k, in increasing order of k.
    """
    def wcss_for(k):
        fitted = KMeans(n_clusters=k, random_state=0)
        fitted.fit(X)
        return fitted.score(X) * -1  # score returns -WCSS

    return [wcss_for(k) for k in range(1, k_max + 1)]

In [None]:
# WCSS for k = 1 .. 60, plotted against k to look for the "elbow".
wcss_vec = count_wcss_scores(X, k_max=60)
x_ticks = [position + 1 for position in range(len(wcss_vec))]
plt.plot(x_ticks, wcss_vec, 'bx-')
plt.title('The Elbow Method showing the optimal k')
plt.xlabel('k')
plt.ylabel('Within-cluster sum of squares')
plt.show()