In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Wczytanie danych

In [None]:
# Load the data frame holding the word data.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.head(n=5)

## Skalowanie ramki za pomocą TF-IDF

In [None]:
cols = df.columns
# Rebuild one pseudo-text per row: every column name is emitted as many times
# as the integer value stored for it in that row, each occurrence prefixed with
# a single space (so a non-empty text starts with a space).
texts = [
    ''.join(' ' + word for word, count in zip(cols, row) for _ in range(int(count)))
    for row in df.to_numpy()
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TF-IDF weighting of the rebuilt texts. Terms present in more than 90% of the
# documents (max_df) or in fewer than 2 documents (min_df) are discarded, as are
# English stop words; the token pattern only accepts tokens without digits.
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, use_idf=True, stop_words='english', token_pattern=r"\b[^\d\W]+\b")

tfidf = tfidf_vectorizer.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Dense frame with one column per retained vocabulary term.
df_tfidf = pd.DataFrame(tfidf.toarray(), columns=list(tfidf_feature_names))

In [None]:
# Display the TF-IDF frame built above (one column per vocabulary term kept by the vectorizer).
df_tfidf

## Wczytanie i standaryzacja statystyk tekstów

In [None]:
# Data frame with per-text statistics; the helper columns are removed so that
# only the remaining columns are passed on to the scaling step.
stats = pd.read_csv('stats_df.csv').drop(columns=['Unnamed: 0', 'index', 'text'])
stats.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every statistics column and wrap the resulting array back into a
# data frame that keeps the original column names.
scaler = StandardScaler()
stat_scale = scaler.fit_transform(stats)

stats_scale = pd.DataFrame(stat_scale, columns=stats.columns)
stats_scale.head(n=3)

## Stworzenie zbioru do klasteryzacji

In [None]:
# Join the scaled statistics and the TF-IDF features row by row: both frames
# expose their row index as an 'index' column, are merged on it, and the helper
# column is dropped again afterwards.
# NOTE(review): assumes both frames list the texts in the same order - confirm.
X = (
    stats_scale.reset_index()
    .merge(df_tfidf.reset_index(), on='index')
    .drop(columns='index')
)
X.head()

## Stworzenie ramki z odpowiedziami

In [None]:
# Only the unnamed first column of the labelled file is kept; the part of each
# value before the first underscore becomes the label.
# NOTE(review): presumably the values look like '<Book>_<chapter>' - confirm.
Y = pd.read_csv('AllBooks_baseline_DTM_Labelled.csv')[['Unnamed: 0']]
Y['label'] = Y['Unnamed: 0'].map(lambda row_id: row_id.split('_')[0])


def add_religion(label):
    """Map a book label to the name of the religion it is grouped under.

    Parameters
    ----------
    label : str
        Book label, e.g. "Buddhism", "TaoTeChing", "Upanishad" or "YogaSutra".

    Returns
    -------
    str
        "Buddhism", "Taoism" or "Hinduism" for the labels listed above;
        every other label falls back to "Old testament".
    """
    if label == "Buddhism":
        return "Buddhism"
    if label == "TaoTeChing":
        return "Taoism"
    if label in ("Upanishad", "YogaSutra"):
        return "Hinduism"
    # Any label not matched above is treated as an Old Testament book.
    return "Old testament"

    
# Add the religion of every text, then drop the raw identifier column so that
# only the 'label' and 'rel' answer columns remain.
Y['rel'] = Y['label'].apply(add_religion)
Y = Y.drop(columns='Unnamed: 0')
Y

# Klasteryzacja bez redukcji wymiarów

## Wyznaczenie liczby klastrów

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering

In [None]:
# metoda łokcia dla KMeans

def KMeansElbow(X, k_max):
    """Draw the elbow plot of K-Means for k = 1 .. k_max.

    For every k a KMeans model (random_state=0) is fitted on X and its
    within-cluster sum of squares (WCSS) is plotted against k.

    Parameters
    ----------
    X : array-like
        Data to cluster.
    k_max : int
        Largest number of clusters to try (inclusive).
    """
    ks = list(range(1, k_max + 1))
    # KMeans.score returns the negated WCSS, hence the sign flip.
    wcss_per_k = [
        -KMeans(n_clusters=k, random_state=0).fit(X).score(X)
        for k in ks
    ]

    plt.plot(ks, wcss_per_k, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Within-cluster sum of squares')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

In [None]:
# Elbow plot on the full feature set for k = 1 .. 10.
KMeansElbow(X, k_max=10)

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, rand_score, adjusted_mutual_info_score, mutual_info_score

In [None]:
#metoda silhouette

def silhouetteClusterNum(X, cluster_num, score_fun):
    """Plot a clustering quality score of K-Means for k = 2 .. cluster_num.

    Despite its name the function works with any metric that is called as
    ``score_fun(X, labels)`` and returns a number.

    Parameters
    ----------
    X : array-like
        Data to cluster.
    cluster_num : int
        Largest number of clusters to try (inclusive); the scan starts at 2.
    score_fun : callable
        Metric evaluated on ``(X, labels)`` for every k.
    """
    ks = range(2, cluster_num + 1)
    scores = []
    for k in ks:
        labels = KMeans(n_clusters=k, random_state=0).fit_predict(X)
        scores.append(score_fun(X, labels))

    plt.figure(figsize=[8, 6])
    plt.plot(ks, scores, 'bx-')
    plt.xlabel('k')
    # Label the axis with the metric's name; formatting the callable itself
    # would print its repr ("<function ... at 0x...>").
    plt.ylabel(getattr(score_fun, '__name__', str(score_fun)))
    plt.show()

In [None]:
# Silhouette score for k = 2 .. 10: the higher the score, the better.
silhouetteClusterNum(X, cluster_num=10, score_fun=silhouette_score)

In [None]:
# Davies-Bouldin score for k = 2 .. 10: the lower the score, the better.
silhouetteClusterNum(X, cluster_num=10, score_fun=davies_bouldin_score)

Biorąc pod uwagę wyniki różnych metryk, sprawdzimy podział na 2, 3, 4 i 5 klastrów.

## Stworzenie ramek z redukcją wymiarów

### PCA

In [None]:
from sklearn.decomposition import PCA

# Fit a full PCA and plot how much variance the first components explain
# cumulatively.
pca = PCA().fit(X)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = range(1, len(cumulative_variance) + 1)

plt.figure(figsize=(9, 6))
plt.plot(component_counts, cumulative_variance, marker='x')
plt.xlabel('number of components')
# Zoom in on the first 40 components.
plt.xlim(0, 40)
plt.ylabel('cumulative explained variance')

Dla 3 składowych głównych otrzymujemy 85% wyjaśnionej wariancji.

In [None]:
# Project the data onto the first two principal components and keep the
# coordinates together with both answer columns for plotting.
pca2_coords = PCA(n_components=2).fit_transform(X)
X_pca2 = pd.DataFrame({
    'x': pca2_coords[:, 0],
    'y': pca2_coords[:, 1],
    'label': Y['label'],
    'rel': Y['rel'],
})

# Side by side: points coloured by book label (left) and by religion (right).
f, (ax1, ax2) = plt.subplots(1, 2, figsize=[18, 6])
for hue_col, ax in (('label', ax1), ('rel', ax2)):
    sns.scatterplot(data=X_pca2, x='x', y='y', hue=hue_col, ax=ax)
plt.show()

In [None]:
import plotly.graph_objs as go
from sklearn import preprocessing

# 3-D PCA projection, points coloured by the integer-encoded book label.
X_pca3 = PCA(n_components=3).fit_transform(X)
le = preprocessing.LabelEncoder()

Scene = {
    'xaxis': {'title': 'PC1'},
    'yaxis': {'title': 'PC2'},
    'zaxis': {'title': 'PC3'},
}
labels = le.fit_transform(Y['label'])
trace = go.Scatter3d(
    x=X_pca3[:, 0],
    y=X_pca3[:, 1],
    z=X_pca3[:, 2],
    mode='markers',
    marker={'color': labels, 'size': 10, 'line': {'color': 'gray', 'width': 5}},
)
layout = go.Layout(margin={'l': 0, 'r': 0}, scene=Scene, height=600, width=600)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Same 3-D PCA projection, this time coloured by the integer-encoded religion.
Scene = {
    'xaxis': {'title': 'PC1'},
    'yaxis': {'title': 'PC2'},
    'zaxis': {'title': 'PC3'},
}
labels = le.fit_transform(Y['rel'])
trace = go.Scatter3d(
    x=X_pca3[:, 0],
    y=X_pca3[:, 1],
    z=X_pca3[:, 2],
    mode='markers',
    marker={'color': labels, 'size': 10, 'line': {'color': 'gray', 'width': 5}},
)
layout = go.Layout(margin={'l': 0, 'r': 0}, scene=Scene, height=600, width=600)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.show()

## Klasteryzacja bez redukcji wymiarów, ale zwizualizowana na PCA

In [None]:
def KMeansClustering(data, reduction, actual_labels):
    """Run K-Means for 2 to 5 clusters, plot every partition and score it.

    Parameters
    ----------
    data : array-like
        Features the KMeans models are fitted on and the internal metrics
        (silhouette, Davies-Bouldin) are computed on.
    reduction : pandas.DataFrame
        Frame with 'x' and 'y' columns used only to draw the scatter plots.
        NOTE(review): rows are assumed to be aligned with ``data`` - confirm.
    actual_labels : array-like
        Reference labels for the external metrics (Rand index, adjusted
        mutual information, mutual information).

    Returns
    -------
    pandas.DataFrame
        One row per cluster count with the columns 'clusters',
        'silhouette_score', 'davies_bouldin_score', 'rand_score',
        'adjusted_mutual_info_score' and 'mutual_info_score'.
    """
    columns = ['clusters', 'silhouette_score', 'davies_bouldin_score',
               'rand_score', 'adjusted_mutual_info_score', 'mutual_info_score']
    cluster_counts = range(2, 6)
    # Collect plain records and build the frame once at the end; this avoids
    # concatenating onto an empty frame in a loop and gives rows a unique index.
    records = []

    fig, axs = plt.subplots(1, len(cluster_counts), figsize=(18, 5))

    for ax, n_clusters in zip(axs, cluster_counts):
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        kmeans.fit(data)
        y_kmeans = kmeans.predict(data)

        records.append({
            'clusters': n_clusters,
            'silhouette_score': silhouette_score(data, y_kmeans),
            'davies_bouldin_score': davies_bouldin_score(data, y_kmeans),
            'rand_score': rand_score(actual_labels, y_kmeans),
            'adjusted_mutual_info_score': adjusted_mutual_info_score(actual_labels, y_kmeans),
            'mutual_info_score': mutual_info_score(actual_labels, y_kmeans),
        })

        sns.scatterplot(data=reduction, x='x', y='y',
                        hue=y_kmeans, legend=False,
                        ax=ax)
        # Title the subplot that was just drawn, not some axis from the
        # notebook's global scope.
        ax.set_title(f'{n_clusters} clusters')

    plt.show()
    return pd.DataFrame(records, columns=columns)

In [None]:
# K-Means on the full feature set, visualised in the 2-D PCA projection.
KMeansClustering(data=X, reduction=X_pca2, actual_labels=Y['label'])

In [None]:
def AggClustering(data, reduction, actual_labels):
    """Run agglomerative clustering for three linkages and 2 to 5 clusters.

    Every combination of linkage ('ward', 'complete', 'single') and cluster
    count is fitted, drawn in its own subplot and scored.

    Parameters
    ----------
    data : array-like
        Features the models are fitted on and the internal metrics
        (silhouette, Davies-Bouldin) are computed on.
    reduction : pandas.DataFrame
        Frame with 'x' and 'y' columns used only to draw the scatter plots.
        NOTE(review): rows are assumed to be aligned with ``data`` - confirm.
    actual_labels : array-like
        Reference labels for the external metrics (Rand index, adjusted
        mutual information, mutual information).

    Returns
    -------
    pandas.DataFrame
        One row per (linkage, cluster count) pair with the columns
        'clusters', 'linkage', 'silhouette_score', 'davies_bouldin_score',
        'rand_score', 'adjusted_mutual_info_score' and 'mutual_info_score'.
    """
    columns = ['clusters', 'linkage', 'silhouette_score', 'davies_bouldin_score',
               'rand_score', 'adjusted_mutual_info_score', 'mutual_info_score']
    linkages = ['ward', 'complete', 'single']
    cluster_counts = range(2, 6)
    # Collect plain records and build the frame once at the end; this avoids
    # concatenating onto an empty frame in a loop and gives rows a unique index.
    records = []

    fig, axs = plt.subplots(len(linkages), len(cluster_counts), figsize=(18, 15))

    for row, linkage in enumerate(linkages):
        for col, n_clusters in enumerate(cluster_counts):
            aggClus = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
            y_aggClus = aggClus.fit_predict(data)

            records.append({
                'clusters': n_clusters,
                'linkage': linkage,
                'silhouette_score': silhouette_score(data, y_aggClus),
                'davies_bouldin_score': davies_bouldin_score(data, y_aggClus),
                'rand_score': rand_score(actual_labels, y_aggClus),
                'adjusted_mutual_info_score': adjusted_mutual_info_score(actual_labels, y_aggClus),
                'mutual_info_score': mutual_info_score(actual_labels, y_aggClus),
            })

            ax = axs[row, col]
            sns.scatterplot(data=reduction, x='x', y='y',
                            hue=y_aggClus, legend=False,
                            ax=ax)
            ax.set_title(f'{n_clusters} clusters, {linkage} linkage')

    plt.show()
    return pd.DataFrame(records, columns=columns)

In [None]:
# Agglomerative clustering on the full feature set, visualised in the 2-D PCA projection.
AggClustering(data=X, reduction=X_pca2, actual_labels=Y['label'])

## Klastrowanie po PCA

In [None]:
# Elbow plot on the two PCA coordinates for k = 1 .. 10.
KMeansElbow(X_pca2[['x', 'y']], k_max=10)

In [None]:
# Silhouette score on the two PCA coordinates for k = 2 .. 10.
silhouetteClusterNum(X_pca2[['x', 'y']], cluster_num=10, score_fun=silhouette_score)


In [None]:
# Davies-Bouldin score on the two PCA coordinates for k = 2 .. 10.
silhouetteClusterNum(X_pca2[['x', 'y']], cluster_num=10, score_fun=davies_bouldin_score)

Po PCA dalej wygląda na to, że będziemy szukać tej samej liczby klastrów.

In [None]:
# K-Means fitted on the two PCA coordinates and visualised in the same projection.
KMeansClustering(data=X_pca2[['x', 'y']], reduction=X_pca2, actual_labels=Y['label'])

In [None]:
# Agglomerative clustering fitted on the two PCA coordinates and visualised in the same projection.
AggClustering(data=X_pca2[['x', 'y']], reduction=X_pca2, actual_labels=Y['label'])

# T-SNE

In [None]:
from sklearn.manifold import TSNE

# Embed the feature set in two dimensions with t-SNE (fixed random_state) and
# keep the coordinates together with both answer columns for plotting.
# No separate plt.figure() call is made here: plt.subplots below creates the
# figure that is actually drawn, so an extra call would only emit an empty one.
tSNE = TSNE(random_state=0, verbose=1)
tsne_coords = tSNE.fit_transform(X)
X_tsne = pd.DataFrame({
    'x': tsne_coords[:, 0],
    'y': tsne_coords[:, 1],
    'label': Y['label'],
    'rel': Y['rel'],
})

# Side by side: points coloured by book label (left) and by religion (right).
f, (ax1, ax2) = plt.subplots(1, 2, figsize=[18, 6])
sns.scatterplot(data=X_tsne, x='x', y='y', hue='label', ax=ax1)
sns.scatterplot(data=X_tsne, x='x', y='y', hue='rel', ax=ax2)
plt.show()

## Klasteryzacja bez redukcji wymiarów, wizualizacja na T-SNE

In [None]:
# K-Means on the full feature set, visualised in the t-SNE embedding.
KMeansClustering(data=X, reduction=X_tsne, actual_labels=Y['label'])

In [None]:
# Agglomerative clustering on the full feature set, visualised in the t-SNE embedding.
AggClustering(data=X, reduction=X_tsne, actual_labels=Y['label'])

## Klasteryzacja po T-SNE

In [None]:
# K-Means fitted on the two t-SNE coordinates and visualised in the same embedding.
KMeansClustering(data=X_tsne[['x', 'y']], reduction=X_tsne, actual_labels=Y['label'])

In [None]:
# Agglomerative clustering fitted on the two t-SNE coordinates and visualised in the same embedding.
AggClustering(data=X_tsne[['x', 'y']], reduction=X_tsne, actual_labels=Y['label'])