In [None]:
import silhouette_mod
import utils
from tabulate import tabulate
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN, OPTICS
from sklearn.metrics import silhouette_score, pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors

In [None]:
# Load the 'SOAD' department sentences (semantics of the helper live in utils).
df = utils.leer_dpto('SOAD')

# Build df_menus as a single non-mutating chain. Filtering a frame and then
# modifying the filtered result in place (inplace=True / column assignment) can
# trigger pandas' SettingWithCopyWarning and makes the cell awkward to re-run;
# each step below returns a fresh frame instead.
df_menus = (
    utils.leer_menus_labels("modified-menus", 1)
    # keep only the rows whose "prospecto" flag equals 1
    .loc[lambda menus: menus["prospecto"] == 1]
    # one row per distinct lemmatized sentence (first occurrence wins)
    .drop_duplicates(subset="OracionLematizada", keep="first")
    # running 0..n-1 position; the original index labels are left untouched
    .assign(idx=lambda menus: range(len(menus)))
)

print(f"There are {len(df)} items in df")
print(f"There are {len(df_menus)} items in df_menus")

In [None]:
# TF-IDF document-term matrix over the lemmatized sentences, using 1- to 3-grams.
MIN_DF, MAX_DF, MAX_FEATURES = 2, 0.95, 500

tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=MIN_DF,
    max_df=MAX_DF,
    max_features=MAX_FEATURES,
)
X_text = tfidf.fit_transform(df['OracionLematizada'].values)
print(f"dtm shape: {X_text.shape}")

In [None]:
# Raw term-count document-term matrix (default n-gram range of CountVectorizer).
# NOTE: this rebinds X_text, replacing the TF-IDF matrix built in the previous cell.
MIN_DF, MAX_DF, MAX_FEATURES = 2, 0.95, 500

cvtext = CountVectorizer(
    min_df=MIN_DF,
    max_df=MAX_DF,
    max_features=MAX_FEATURES,
)
X_text = cvtext.fit_transform(df['OracionLematizada'].values)
print(f"dtm shape: {X_text.shape}")

In [None]:
# Optional LSA step: reduce X_text with TruncatedSVD, keeping the number of
# components that utils.select_n_components picks for `expected_variance`.
use_svd = True
expected_variance = 0.90

if use_svd:
    # Largest decomposition tried: one component fewer than the feature count.
    max_components = X_text.shape[1] - 1

    # Seed the exploratory fit as well. TruncatedSVD's default solver is
    # randomized, so without a seed the variance ratios -- and therefore the
    # selected n_components -- can differ from run to run.
    full_svd = TruncatedSVD(n_components=max_components, random_state=42)
    # NOTE: fit() returns the estimator itself, not a transformed matrix; the
    # X_full_svd name is kept only so code elsewhere that reads it keeps working.
    X_full_svd = full_svd.fit(X_text)
    full_svd_ratios = full_svd.explained_variance_ratio_
    n_components = utils.select_n_components(full_svd_ratios, expected_variance)

    # Final pipeline: SVD down to n_components, then normalize each row in place.
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_2d = lsa.fit_transform(X_text)

    print(f"original components: {max_components}")
    print(f"original ratio: {round(sum(full_svd_ratios), 4)}")
    print(f"expected variance: {expected_variance}")
    print(f"X_2d shape: {X_2d.shape}")
    print(f"X_2d variance: {round(sum(svd.explained_variance_ratio_), 4)}")
else:
    # No reduction: work on a copy of the document-term matrix.
    X_2d = X_text.copy()
    print(f"X_2d shape: {X_2d.shape}")

In [None]:
from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Number of topics requested from NMF (passed as NMF(n_components=...) in the cells below).
# NOTE: this rebinds the n_components name that the SVD cell above also assigns.
n_components = 10
# Number of highest-weighted terms drawn per topic by plot_top_words.
n_top_words = 20

In [None]:
def plot_top_words(model, feature_names, n_top_words, title, n_cols=5):
    """Draw one horizontal bar chart per topic showing its highest-weighted terms.

    Parameters
    ----------
    model : object exposing ``components_``
        Fitted decomposition whose ``components_`` rows are indexed per topic
        and whose columns line up with ``feature_names``.
    feature_names : sequence of str
        Term label for each column of ``model.components_``.
    n_top_words : int
        How many of the largest-weight terms to draw for each topic.
    title : str
        Figure-level title.
    n_cols : int, default 5
        Panels per row. The number of rows is derived from the number of
        topics, so any topic count fits (the previous fixed 2x5 grid raised
        IndexError beyond 10 topics). Ten topics still yield the original
        2x5 grid with a (60, 25) figure.
    """
    n_topics = len(model.components_)
    # Ceiling division; keep at least one row so an empty model still renders.
    n_rows = max(1, -(-n_topics // n_cols))
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(60, 12.5 * n_rows),
        sharex=True,
        squeeze=False,  # always a 2-D array, so flatten() works for any grid
    )
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, in descending order.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        # Largest weight at the top of each panel.
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in ('top', 'right', 'left'):
            ax.spines[side].set_visible(False)

    # Hide the panels left over when n_topics is not a multiple of n_cols.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    # The title does not depend on the topic, so set it once rather than per loop.
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [None]:
# NMF with the default (Frobenius) loss, fitted on the current document-term matrix.
nmf = NMF(
    n_components=n_components,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
).fit(X_text)

In [None]:
# When the notebook is run top to bottom, X_text is the matrix produced by
# cvtext (the CountVectorizer cell rebinds it after the TF-IDF cell), so the
# column labels must come from cvtext. Taking them from tfidf would pair the
# NMF weights with terms from a different vocabulary.
feature_names = cvtext.get_feature_names()
# Cheap guard against a vectorizer/matrix mismatch caused by out-of-order runs.
assert len(feature_names) == nmf.components_.shape[1], (
    "feature_names does not match the matrix the NMF model was fitted on"
)
plot_top_words(nmf, feature_names, n_top_words, 'Topics in NMF model (Frobenius norm)')

In [None]:
# Document-topic weights: one row per document, one column per NMF topic.
# Column labels are derived from the fitted model rather than a hard-coded 10,
# so the cell keeps working when n_components is changed.
docsVStopics = pd.DataFrame(
    nmf.transform(X_text),
    columns=["Topic" + str(i + 1) for i in range(nmf.components_.shape[0])],
)
print("Created a (%dx%d) document-topic matrix." % (docsVStopics.shape[0], docsVStopics.shape[1]))
# Label each document with its highest-weight topic, then count documents per topic.
most_likely_topics = docsVStopics.idxmax(axis=1)
most_likely_topics.groupby(most_likely_topics).count()

---

In [None]:
# NMF with the generalized Kullback-Leibler loss (multiplicative-update solver).
# NOTE: this rebinds nmf, replacing the Frobenius-norm model fitted above.
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
).fit(X_text)

In [None]:
# When the notebook is run top to bottom, X_text is the matrix produced by
# cvtext (the CountVectorizer cell rebinds it after the TF-IDF cell), so the
# column labels must come from cvtext. Taking them from tfidf would pair the
# NMF weights with terms from a different vocabulary.
feature_names = cvtext.get_feature_names()
# Cheap guard against a vectorizer/matrix mismatch caused by out-of-order runs.
assert len(feature_names) == nmf.components_.shape[1], (
    "feature_names does not match the matrix the NMF model was fitted on"
)
plot_top_words(nmf, feature_names, n_top_words, 'Topics in NMF model (generalized Kullback-Leibler divergence)')

In [None]:
# Document-topic weights: one row per document, one column per NMF topic.
# Column labels are derived from the fitted model rather than a hard-coded 10,
# so the cell keeps working when n_components is changed.
docsVStopics = pd.DataFrame(
    nmf.transform(X_text),
    columns=["Topic" + str(i + 1) for i in range(nmf.components_.shape[0])],
)
print("Created a (%dx%d) document-topic matrix." % (docsVStopics.shape[0], docsVStopics.shape[1]))
# Label each document with its highest-weight topic, then count documents per topic.
most_likely_topics = docsVStopics.idxmax(axis=1)
most_likely_topics.groupby(most_likely_topics).count()