In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Wczytanie Danych

In [None]:
# Count matrix: each cell is converted to an int below and used as the number
# of times the column name is repeated.
df = pd.read_csv('data.csv')

# Labelled variant of the data; not referenced again in the cells shown here.
data_lab = pd.read_csv('AllBooks_baseline_DTM_Labelled.csv')

# Rebuild one pseudo-document per row by repeating every column name as many
# times as its count (each occurrence is prefixed with a single space).
cols = df.columns
texts = []
for _, row_counts in df.iterrows():
    repeated_words = [' ' + word
                      for word, count in zip(cols, row_counts)
                      for _rep in range(int(count))]
    texts.append(''.join(repeated_words))

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TF-IDF over the reconstructed texts: skip terms found in more than 90% of the
# documents or in fewer than 2, drop English stop words, and accept only tokens
# made of word characters that are not digits.
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, use_idf=True, stop_words='english', token_pattern=r"\b[^\d\W]+\b")

tfidf = tfidf_vectorizer.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that do not have get_feature_names_out() yet.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

df_tfidf = pd.DataFrame(tfidf.toarray(), columns=list(tfidf_feature_names))

# Data frame with per-text statistics; the identifier columns and the raw text
# are dropped so that only the remaining columns are scaled.
stats = pd.read_csv('stats_df.csv')
stats = stats.drop(columns=['Unnamed: 0', 'index', 'text'])

from sklearn.preprocessing import StandardScaler
# Standardise every statistic before combining it with the TF-IDF weights.
scaler = StandardScaler()
stat_scale = scaler.fit_transform(stats)

stats_scale = pd.DataFrame(stat_scale, columns=stats.columns)

# Join both feature sets row by row: the positional index is exposed as an
# 'index' column, used as the merge key and removed again.
stats_keyed = stats_scale.reset_index()
tfidf_keyed = df_tfidf.reset_index()
X = stats_keyed.merge(tfidf_keyed, on='index').drop(columns='index')
X.head()

In [None]:
# Keep only the 'Unnamed: 0' column of the labelled file; the part of each
# value before the first underscore becomes the label.
Y = pd.read_csv('AllBooks_baseline_DTM_Labelled.csv')[['Unnamed: 0']]
Y['label'] = [row_id.split('_')[0] for row_id in Y['Unnamed: 0']]

def add_religion(label):
  """Map a book label to the religion it is grouped under.

  Args:
    label: Book label such as "Buddhism", "TaoTeChing", "Upanishad" or
      "YogaSutra".

  Returns:
    "Buddhism", "Taoism" or "Hinduism" for the labels listed above; every
    other label falls through to "Old testament".
  """
  if label == "Buddhism":
    return "Buddhism"
  if label == "TaoTeChing":
    return "Taoism"
  if label in ("Upanishad", "YogaSutra"):
    return "Hinduism"
  # Catch-all: anything not matched above is treated as an Old Testament book.
  return "Old testament"

    
# Add the religion of every label, then drop the raw identifier column.
Y['rel'] = Y['label'].map(add_religion)
Y = Y.drop(columns='Unnamed: 0')
Y.head()

In [None]:
from sklearn.decomposition import PCA

# PCA

In [None]:
# Fit a full PCA to see how the explained variance accumulates with the
# number of components.
pca = PCA().fit(X)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = range(1, len(cumulative_variance) + 1)

plt.figure(figsize=(9, 6))
plt.plot(component_counts, cumulative_variance)
plt.xlim(0, 100)  # show the first 100 components only
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

Dla 3 komponentów mamy wyjaśnione 85% wariancji, 90% jest wyjaśnione przez 45 komponentów.

In [None]:
# Keep the first 45 principal components. The seed matters whenever
# scikit-learn selects a randomized SVD solver (possible with the default
# svd_solver='auto'); 0 matches the random_state used for t-SNE and k-means.
X_pca45 = PCA(n_components=45, random_state=0).fit_transform(X)

In [None]:
from sklearn.manifold import TSNE

In [None]:
# 2-D t-SNE embedding of the 45 PCA components; X_tsne supplies the plotting
# coordinates together with the book and religion labels.
# plt.subplots() below creates the figure, so no separate plt.figure() call is
# needed (it would only render as an extra empty figure).
tSNE = TSNE(random_state=0, verbose=1)
tsne_coords = tSNE.fit_transform(X_pca45)
X_tsne = pd.DataFrame({'x': tsne_coords[:, 0], 'y': tsne_coords[:, 1], 'label': Y['label'], 'rel' : Y['rel']})

# Same embedding twice: coloured by book (left) and by religion (right).
f, (ax1, ax2) = plt.subplots(1, 2, figsize=[18, 6])
sns.scatterplot(data=X_tsne, x='x', y='y', hue='label', ax = ax1)
sns.scatterplot(data=X_tsne, x='x', y='y', hue='rel', ax = ax2)
ax1.set_title('t-SNE of PCA features, coloured by book')
ax2.set_title('t-SNE of PCA features, coloured by religion')
plt.show()

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, rand_score, adjusted_mutual_info_score, mutual_info_score

In [None]:
def KMeansClustering(data, reduction, actual_labels):
    """Cluster `data` with k-means for k = 2..5, score each result and plot it.

    Args:
        data: Feature matrix that is clustered and used for the internal
            scores (silhouette, Davies-Bouldin).
        reduction: Table with 'x' and 'y' columns, one row per sample of
            `data`; only used as scatter-plot coordinates.
        actual_labels: Reference labels for the external scores (Rand index,
            adjusted and plain mutual information).

    Returns:
        pandas.DataFrame with one row per k (index 0..n-1) and the columns
        'clusters', 'silhouette_score', 'davies_bouldin_score', 'rand_score',
        'adjusted_mutual_info_score' and 'mutual_info_score'.
    """
    columns = ['clusters', 'silhouette_score', 'davies_bouldin_score',
               'rand_score', 'adjusted_mutual_info_score', 'mutual_info_score']
    cluster_counts = range(2, 6)

    fig, axs = plt.subplots(1, len(cluster_counts), figsize = (18, 5))

    # Collect plain records and build the frame once: concatenating onto an
    # empty DataFrame inside the loop is quadratic and warns in recent pandas.
    rows = []
    for ax, i in zip(axs, cluster_counts):
        kmeans = KMeans(n_clusters=i, random_state=0)
        y_kmeans = kmeans.fit_predict(data)

        rows.append({'clusters': i,
                     'silhouette_score': silhouette_score(data, y_kmeans),
                     'davies_bouldin_score': davies_bouldin_score(data, y_kmeans),
                     'rand_score': rand_score(actual_labels, y_kmeans),
                     'adjusted_mutual_info_score': adjusted_mutual_info_score(actual_labels, y_kmeans),
                     'mutual_info_score': mutual_info_score(actual_labels, y_kmeans)})

        sns.scatterplot(data = reduction, x = 'x', y = 'y',
                        hue = y_kmeans, legend = False,
                        ax = ax, palette='viridis')
        # Title the panel that was just drawn; relying on a notebook-global
        # axis here would leave these panels untitled.
        ax.set_title(f'{i} clusters')

    plt.show()
    return pd.DataFrame(rows, columns=columns)

In [None]:
def AggClustering(data, reduction, actual_labels):
    """Run agglomerative clustering for three linkages and 2..5 clusters.

    Every combination of linkage ('ward', 'complete', 'single') and cluster
    count is fitted, scored and drawn in its own panel of a 3 x 4 grid.

    Args:
        data: Feature matrix that is clustered and used for the internal
            scores (silhouette, Davies-Bouldin).
        reduction: Table with 'x' and 'y' columns, one row per sample of
            `data`; only used as scatter-plot coordinates.
        actual_labels: Reference labels for the external scores (Rand index,
            adjusted and plain mutual information).

    Returns:
        pandas.DataFrame with one row per (linkage, cluster count) pair
        (index 0..n-1), ordered by linkage and then by cluster count, with the
        columns 'clusters', 'linkage', 'silhouette_score',
        'davies_bouldin_score', 'rand_score', 'adjusted_mutual_info_score'
        and 'mutual_info_score'.
    """
    columns = ['clusters', 'linkage', 'silhouette_score', 'davies_bouldin_score',
               'rand_score', 'adjusted_mutual_info_score', 'mutual_info_score']
    linkage = ['ward', 'complete', 'single']
    cluster_counts = range(2, 6)

    fig, axs = plt.subplots(len(linkage), len(cluster_counts), figsize = (18, 15))

    # Collect plain records and build the frame once: concatenating onto an
    # empty DataFrame inside the loop is quadratic and warns in recent pandas.
    rows = []
    for j, link in enumerate(linkage):
        for col, i in enumerate(cluster_counts):
            aggClus = AgglomerativeClustering(n_clusters = i, linkage = link)
            y_aggClus = aggClus.fit_predict(data)

            rows.append({'clusters': i,
                         'linkage': link,
                         'silhouette_score': silhouette_score(data, y_aggClus),
                         'davies_bouldin_score': davies_bouldin_score(data, y_aggClus),
                         'rand_score': rand_score(actual_labels, y_aggClus),
                         'adjusted_mutual_info_score': adjusted_mutual_info_score(actual_labels, y_aggClus),
                         'mutual_info_score': mutual_info_score(actual_labels, y_aggClus)})

            ax = axs[j, col]
            sns.scatterplot(data = reduction, x = 'x', y = 'y',
                            hue = y_aggClus, legend = False,
                            ax = ax, palette='viridis')
            ax.set_title(f'{i} clusters, {link} linkage')

    plt.show()
    return pd.DataFrame(rows, columns=columns)

In [None]:
def GMMClustering(data, reduction, actual_labels, random_state=0):
    """Fit Gaussian mixtures for three covariance types and 2..5 components.

    Every combination of covariance type ('full', 'tied', 'diag') and
    component count is fitted, scored and drawn in its own panel of a 3 x 4
    grid.

    Args:
        data: Feature matrix that is clustered and used for the internal
            scores (silhouette, Davies-Bouldin).
        reduction: Table with 'x' and 'y' columns, one row per sample of
            `data`; only used as scatter-plot coordinates.
        actual_labels: Reference labels for the external scores (Rand index,
            adjusted and plain mutual information).
        random_state: Seed passed to every GaussianMixture so that repeated
            runs give the same assignments and scores.

    Returns:
        pandas.DataFrame with one row per (covariance, component count) pair
        (index 0..n-1), ordered by covariance type and then by component
        count, with the columns 'clusters', 'covariance', 'silhouette_score',
        'davies_bouldin_score', 'rand_score', 'adjusted_mutual_info_score'
        and 'mutual_info_score'.
    """
    columns = ['clusters', 'covariance', 'silhouette_score', 'davies_bouldin_score',
               'rand_score', 'adjusted_mutual_info_score', 'mutual_info_score']
    cov = ['full', 'tied', 'diag']
    cluster_counts = range(2, 6)

    fig, axs = plt.subplots(len(cov), len(cluster_counts), figsize = (18, 15))

    # Collect plain records and build the frame once: concatenating onto an
    # empty DataFrame inside the loop is quadratic and warns in recent pandas.
    rows = []
    for j, cov_type in enumerate(cov):
        for col, i in enumerate(cluster_counts):
            gmm = GaussianMixture(n_components=i, covariance_type=cov_type,
                                  random_state=random_state)
            y_gmm = gmm.fit_predict(data)

            rows.append({'clusters': i,
                         'covariance': cov_type,
                         'silhouette_score': silhouette_score(data, y_gmm),
                         'davies_bouldin_score': davies_bouldin_score(data, y_gmm),
                         'rand_score': rand_score(actual_labels, y_gmm),
                         'adjusted_mutual_info_score': adjusted_mutual_info_score(actual_labels, y_gmm),
                         'mutual_info_score': mutual_info_score(actual_labels, y_gmm)})

            ax = axs[j, col]
            sns.scatterplot(data = reduction, x = 'x', y = 'y',
                            hue = y_gmm, legend = False,
                            ax = ax, palette='viridis')
            ax.set_title(f'{i} clusters, {cov_type} covariance')

    plt.show()
    return pd.DataFrame(rows, columns=columns)

In [None]:
# k-means sweep (2..5 clusters) on the PCA features, scored against religion.
a = KMeansClustering(data=X_pca45, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# k-means scores as a table shaded with the 'Blues' colormap.
(
    a.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

In [None]:
# Agglomerative sweep (three linkages, 2..5 clusters) on the PCA features.
b = AggClustering(data=X_pca45, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# Agglomerative scores as a table shaded with the 'Blues' colormap.
(
    b.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

In [None]:
# Gaussian-mixture sweep (three covariance types, 2..5 components) on the PCA features.
c = GMMClustering(data=X_pca45, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# Gaussian-mixture scores as a table shaded with the 'Blues' colormap.
(
    c.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

# TruncatedSVD

In [None]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

In [None]:
# Convert the feature table to CSR form before the decomposition.
X_csr = csr_matrix(X)

# TruncatedSVD uses a randomized solver by default; seed it so that the 50
# components (and everything clustered on them) are reproducible.
tsvd = TruncatedSVD(n_components=50, random_state=0)
tsvd.fit(X_csr)
X_tsvd_t = tsvd.transform(X_csr)

In [None]:
# 2-D t-SNE embedding of the TruncatedSVD features; X_tsne supplies the
# plotting coordinates together with the book and religion labels.
# plt.subplots() below creates the figure, so no separate plt.figure() call is
# needed (it would only render as an extra empty figure).
tSNE = TSNE(random_state=0, verbose=1)
tsne_coords = tSNE.fit_transform(X_tsvd_t)
X_tsne = pd.DataFrame({'x': tsne_coords[:, 0], 'y': tsne_coords[:, 1], 'label': Y['label'], 'rel' : Y['rel']})

# Same embedding twice: coloured by book (left) and by religion (right).
f, (ax1, ax2) = plt.subplots(1, 2, figsize=[18, 6])
sns.scatterplot(data=X_tsne, x='x', y='y', hue='label', ax = ax1)
sns.scatterplot(data=X_tsne, x='x', y='y', hue='rel', ax = ax2)
ax1.set_title('t-SNE of TruncatedSVD features, coloured by book')
ax2.set_title('t-SNE of TruncatedSVD features, coloured by religion')
plt.show()

In [None]:
# k-means sweep (2..5 clusters) on the TruncatedSVD features, scored against religion.
a = KMeansClustering(data=X_tsvd_t, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# k-means scores as a table shaded with the 'Blues' colormap.
(
    a.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

In [None]:
# Agglomerative sweep (three linkages, 2..5 clusters) on the TruncatedSVD features.
b = AggClustering(data=X_tsvd_t, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# Agglomerative scores as a table shaded with the 'Blues' colormap.
(
    b.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

In [None]:
# Gaussian-mixture sweep (three covariance types, 2..5 components) on the TruncatedSVD features.
c = GMMClustering(data=X_tsvd_t, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# Gaussian-mixture scores as a table shaded with the 'Blues' colormap.
(
    c.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

# NMF

In [None]:
from sklearn.decomposition import NMF

In [None]:
# NMF is fitted on the TF-IDF weights only (not on the combined matrix X).
# NOTE(review): presumably because NMF requires non-negative input and the
# standardised statistics in X can be negative - confirm.
X_tfidf_csr = csr_matrix(df_tfidf)
# Seeded so that the 8-component factorisation is reproducible.
nmf = NMF(n_components=8, random_state=0)
X_nmf_t = nmf.fit_transform(X_tfidf_csr)

In [None]:
# 2-D t-SNE embedding of the NMF features; X_tsne supplies the plotting
# coordinates together with the book and religion labels.
# plt.subplots() below creates the figure, so no separate plt.figure() call is
# needed (it would only render as an extra empty figure).
tSNE = TSNE(random_state=0, verbose=1)
tsne_coords = tSNE.fit_transform(X_nmf_t)
X_tsne = pd.DataFrame({'x': tsne_coords[:, 0], 'y': tsne_coords[:, 1], 'label': Y['label'], 'rel' : Y['rel']})

# Same embedding twice: coloured by book (left) and by religion (right).
f, (ax1, ax2) = plt.subplots(1, 2, figsize=[18, 6])
sns.scatterplot(data=X_tsne, x='x', y='y', hue='label', ax = ax1)
sns.scatterplot(data=X_tsne, x='x', y='y', hue='rel', ax = ax2)
ax1.set_title('t-SNE of NMF features, coloured by book')
ax2.set_title('t-SNE of NMF features, coloured by religion')
plt.show()

In [None]:
# k-means sweep (2..5 clusters) on the NMF features, scored against religion.
a = KMeansClustering(data=X_nmf_t, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# k-means scores as a table shaded with the 'Blues' colormap.
(
    a.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

In [None]:
# Agglomerative sweep (three linkages, 2..5 clusters) on the NMF features.
b = AggClustering(data=X_nmf_t, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# Agglomerative scores as a table shaded with the 'Blues' colormap.
(
    b.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

In [None]:
# Gaussian-mixture sweep (three covariance types, 2..5 components) on the NMF features.
c = GMMClustering(data=X_nmf_t, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# Gaussian-mixture scores as a table shaded with the 'Blues' colormap.
(
    c.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

# Sparse PCA

In [None]:
from sklearn.decomposition import SparsePCA

In [None]:
# Sparse PCA with 45 components on the combined feature matrix; seeded so the
# fitted components are reproducible between runs.
spca = SparsePCA(n_components=45, random_state=0)
X_spca_t = spca.fit_transform(X)


In [None]:
# 2-D t-SNE embedding of the Sparse PCA features; X_tsne supplies the plotting
# coordinates together with the book and religion labels.
# plt.subplots() below creates the figure, so no separate plt.figure() call is
# needed (it would only render as an extra empty figure).
tSNE = TSNE(random_state=0, verbose=1)
tsne_coords = tSNE.fit_transform(X_spca_t)
X_tsne = pd.DataFrame({'x': tsne_coords[:, 0], 'y': tsne_coords[:, 1], 'label': Y['label'], 'rel' : Y['rel']})

# Same embedding twice: coloured by book (left) and by religion (right).
f, (ax1, ax2) = plt.subplots(1, 2, figsize=[18, 6])
sns.scatterplot(data=X_tsne, x='x', y='y', hue='label', ax = ax1)
sns.scatterplot(data=X_tsne, x='x', y='y', hue='rel', ax = ax2)
ax1.set_title('t-SNE of Sparse PCA features, coloured by book')
ax2.set_title('t-SNE of Sparse PCA features, coloured by religion')
plt.show()

In [None]:
# k-means sweep (2..5 clusters) on the Sparse PCA features, scored against religion.
a = KMeansClustering(data=X_spca_t, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# k-means scores as a table shaded with the 'Blues' colormap.
(
    a.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

In [None]:
# Agglomerative sweep (three linkages, 2..5 clusters) on the Sparse PCA features.
b = AggClustering(data=X_spca_t, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# Agglomerative scores as a table shaded with the 'Blues' colormap.
(
    b.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

In [None]:
# Gaussian-mixture sweep (three covariance types, 2..5 components) on the Sparse PCA features.
c = GMMClustering(data=X_spca_t, reduction=X_tsne, actual_labels=Y['rel'])

In [None]:
# Gaussian-mixture scores as a table shaded with the 'Blues' colormap.
(
    c.reset_index(drop=True)
    .style
    .background_gradient(cmap='Blues')
)

# Najlepsze klastrowania

In [None]:
 from sklearn.metrics import calinski_harabasz_score

In [None]:
def metrics(data, actual_labels, predict):
    """Score one clustering and return the scores as a single-column table.

    Args:
        data: Feature matrix the clustering was computed on (used by the
            silhouette, Davies-Bouldin and Calinski-Harabasz scores).
        actual_labels: Reference labels (used by the Rand index and the
            adjusted and plain mutual-information scores).
        predict: Cluster assignment to evaluate.

    Returns:
        pandas.DataFrame indexed by score name with a single column `0`.
    """
    scores = {
        'silhouette_score': silhouette_score(data, predict),
        'davies_bouldin_score': davies_bouldin_score(data, predict),
        'rand_score': rand_score(actual_labels, predict),
        'adjusted_mutual_info_score': adjusted_mutual_info_score(actual_labels, predict),
        'mutual_info_score': mutual_info_score(actual_labels, predict),
        'calinski_harabasz_score': calinski_harabasz_score(data, predict),
    }
    # One-row frame (score names as columns), flipped so that the names become the index.
    one_row = pd.DataFrame({name: [value] for name, value in scores.items()})
    return one_row.transpose()

In [None]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

def show_wordcloud(data):
    """Draw a word cloud for `data`.

    Args:
        data: Either a dict mapping word -> weight (e.g. summed TF-IDF
            scores), in which case word sizes follow the weights, or any
            other object, whose `str()` is treated as raw text.
    """
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=100,
        max_font_size=30,
        scale=3,
        random_state=1)

    if isinstance(data, dict):
        # str(dict) would turn the mapping into text in which every word
        # occurs once, so the weights would be ignored; pass them directly.
        # generate_from_frequencies() does not apply `stopwords` and needs
        # positive weights, hence the explicit filter.
        frequencies = {word: weight for word, weight in data.items()
                       if weight > 0 and str(word).lower() not in stopwords}
        wordcloud = wordcloud.generate_from_frequencies(frequencies)
    else:
        wordcloud = wordcloud.generate(str(data))

    plt.figure(1, figsize=(12, 12))
    plt.axis('off')

    plt.imshow(wordcloud)
    plt.show()

### PCA45, 3 klastry, GMM, tied

### PCA45, 4 klastry, GMM, tied

In [None]:
# 4-component Gaussian mixture with tied covariance on the PCA features;
# seeded so that the cluster assignment shown below is reproducible.
gmm = GaussianMixture(n_components = 4, covariance_type='tied', random_state=0)
X_pca45_gmm_4 = gmm.fit_predict(X_pca45)

# Recompute the t-SNE embedding of the PCA features for plotting.
tSNE = TSNE(random_state=0, verbose=1)
tsne_coords = tSNE.fit_transform(X_pca45)
X_tmp = pd.DataFrame({'x': tsne_coords[:, 0], 'y': tsne_coords[:, 1], 'label': X_pca45_gmm_4})
X_tsne = pd.DataFrame({'x': tsne_coords[:, 0], 'y': tsne_coords[:, 1], 'label': Y['label'], 'rel' : Y['rel']})

# Left: embedding coloured by cluster. Right: number of samples per cluster.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=[18, 6])
sns.scatterplot(data = X_tsne, x = 'x', y = 'y', hue = X_pca45_gmm_4, legend = 'auto', palette='viridis', ax = ax1)
# Cluster ids are integers, so draw one bar per id; `palette` is omitted
# because it has no effect without a `hue` variable.
sns.histplot(X_tmp['label'], ax = ax2, discrete=True)
plt.show()

In [None]:
# Scores of the 4-cluster GMM assignment on the PCA features, against religion.
metrics(data=X_pca45, actual_labels=Y['rel'], predict=X_pca45_gmm_4)

In [None]:
# Word cloud for cluster 0, built from the per-term TF-IDF sums of its documents (largest first).
cluster_term_weights = df_tfidf.loc[X_pca45_gmm_4 == 0].sum().sort_values(ascending=False)
show_wordcloud(cluster_term_weights.to_dict())

In [None]:
# Word cloud for cluster 1, built from the per-term TF-IDF sums of its documents (largest first).
cluster_term_weights = df_tfidf.loc[X_pca45_gmm_4 == 1].sum().sort_values(ascending=False)
show_wordcloud(cluster_term_weights.to_dict())

In [None]:
# Word cloud for cluster 2, built from the per-term TF-IDF sums of its documents (largest first).
cluster_term_weights = df_tfidf.loc[X_pca45_gmm_4 == 2].sum().sort_values(ascending=False)
show_wordcloud(cluster_term_weights.to_dict())

In [None]:
# Word cloud for cluster 3, built from the per-term TF-IDF sums of its documents (largest first).
cluster_term_weights = df_tfidf.loc[X_pca45_gmm_4 == 3].sum().sort_values(ascending=False)
show_wordcloud(cluster_term_weights.to_dict())

### TruncatedSVD, 3 klastry, GMM, tied

In [None]:
# Convert the feature table to CSR form before the decomposition.
X_csr = csr_matrix(X)

# TruncatedSVD uses a randomized solver by default; seed it so that the 50
# components (and the clustering built on them) are reproducible.
tsvd = TruncatedSVD(n_components=50, random_state=0)
tsvd.fit(X_csr)
X_tsvd_t = tsvd.transform(X_csr)

In [None]:
# 3-component Gaussian mixture with tied covariance on the TruncatedSVD
# features; seeded so that the cluster assignment shown below is reproducible.
gmm = GaussianMixture(n_components = 3, covariance_type='tied', random_state=0)
X_tsvd_gmm_3 = gmm.fit_predict(X_tsvd_t)

# Recompute the t-SNE embedding of the TruncatedSVD features for plotting.
tSNE = TSNE(random_state=0, verbose=1)
tsne_coords = tSNE.fit_transform(X_tsvd_t)
X_tmp = pd.DataFrame({'x': tsne_coords[:, 0], 'y': tsne_coords[:, 1], 'label': X_tsvd_gmm_3})
X_tsne = pd.DataFrame({'x': tsne_coords[:, 0], 'y': tsne_coords[:, 1], 'label': Y['label'], 'rel' : Y['rel']})

# Left: embedding coloured by cluster. Right: number of samples per cluster.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=[18, 6])
sns.scatterplot(data = X_tsne, x = 'x', y = 'y', hue = X_tsvd_gmm_3, legend = 'auto', palette='viridis', ax = ax1)
# Cluster ids are integers, so draw one bar per id; `palette` is omitted
# because it has no effect without a `hue` variable.
sns.histplot(X_tmp['label'], ax = ax2, discrete=True)
plt.show()

In [None]:
# Scores of the 3-cluster GMM assignment on the TruncatedSVD features, against religion.
metrics(data=X_tsvd_t, actual_labels=Y['rel'], predict=X_tsvd_gmm_3)

In [None]:
# Word cloud for cluster 0, built from the per-term TF-IDF sums of its documents (largest first).
cluster_term_weights = df_tfidf.loc[X_tsvd_gmm_3 == 0].sum().sort_values(ascending=False)
show_wordcloud(cluster_term_weights.to_dict())

In [None]:
# Word cloud for cluster 1, built from the per-term TF-IDF sums of its documents (largest first).
cluster_term_weights = df_tfidf.loc[X_tsvd_gmm_3 == 1].sum().sort_values(ascending=False)
show_wordcloud(cluster_term_weights.to_dict())

In [None]:
# Word cloud for cluster 2, built from the per-term TF-IDF sums of its documents (largest first).
cluster_term_weights = df_tfidf.loc[X_tsvd_gmm_3 == 2].sum().sort_values(ascending=False)
show_wordcloud(cluster_term_weights.to_dict())