# Clustering
## Importación de librerías

In [None]:
import silhouette_mod
import utils
from tabulate import tabulate
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN, OPTICS
from sklearn.metrics import silhouette_score, pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors

## Lectura de dataset

In [None]:
# Load the department messages and the chatbot menu options through the
# project helpers.
df = utils.leer_dpto('SOAD')
df_menus = utils.leer_menus_labels("modified-menus", 1)
# Keep only the prospect menus and drop repeated lemmatized sentences.  The
# result is materialised as an explicit copy so that adding the 'idx' column
# below never writes into a filtered view of the original frame.
df_menus = (
    df_menus[df_menus["prospecto"] == 1]
    .drop_duplicates(subset="OracionLematizada", keep="first")
    .copy()
)
# Positional identifier (0..n-1) of every remaining menu option; later cells
# look menus up through this column.
df_menus['idx'] = range(len(df_menus))
print(f"There are {len(df)} items in df")
print(f"There are {len(df_menus)} items in df_menus")

In [None]:
# Preview the first rows of the message dataframe.
df.head()

## Vectorize

In [None]:
# TF-IDF document-term matrix over the lemmatized sentences, using uni-, bi-
# and tri-grams and the vocabulary limits below.
MIN_DF = 2
MAX_DF = 0.95
MAX_FEATURES = 500
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=MIN_DF,
    max_df=MAX_DF,
    max_features=MAX_FEATURES,
)
X_text = tfidf.fit_transform(df['OracionLematizada'].values)
print(f"dtm shape: {X_text.shape}")

In [None]:
# Alternative vectorization: raw term counts over the 'Cuerpo' column.
# NOTE: running this cell overwrites the X_text produced by the TF-IDF cell.
# The three constants below are only referenced by the commented-out variants;
# the active CountVectorizer() uses its defaults.
MIN_DF = 0.01
MAX_DF = 0.95
MAX_FEATURES = 500
#tfidf = TfidfVectorizer(ngram_range=(1,3), min_df=MIN_DF, max_df=MAX_DF, max_features=MAX_FEATURES)
#cvtext = CountVectorizer(min_df=MIN_DF, max_df=MAX_DF, max_features=MAX_FEATURES)
cvtext = CountVectorizer()
X_text = cvtext.fit_transform(df['Cuerpo'].values)
print(f"dtm shape: {X_text.shape}")

## Dimension reduction using TruncatedSVD

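We first fit a full SVD (one component fewer than the number of features) to obtain the explained-variance ratio of every component, and then keep only the number of components needed to reach the expected variance.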

In [None]:
use_svd = True
expected_variance = 0.90

if use_svd:
    # Fit a full SVD only to read the explained-variance ratio of every
    # component.  fit() returns the estimator itself, so its result is not
    # stored under a data-like name.  The seed matches the one used for the
    # final SVD so the selected number of components is reproducible.
    full_svd = TruncatedSVD(n_components=X_text.shape[1] - 1, random_state=42)
    full_svd.fit(X_text)
    full_svd_ratios = full_svd.explained_variance_ratio_
    # Project helper; presumably returns the smallest number of components
    # whose cumulative ratio reaches expected_variance -- verify in utils.
    n_components = utils.select_n_components(full_svd_ratios, expected_variance)

    # LSA pipeline: reduced SVD followed by row normalisation.
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_2d = lsa.fit_transform(X_text)

    print(f"original components: {X_text.shape[1]-1}")
    print(f"original ratio: {round(sum(full_svd_ratios), 4)}")
    print(f"expected variance: {expected_variance}")
    print(f"X_2d shape: {X_2d.shape}")
    print(f"X_2d variance: {round(sum(svd.explained_variance_ratio_), 4)}")
else:
    # No reduction: cluster directly on a copy of the document-term matrix.
    X_2d = X_text.copy()
    print(f"X_2d shape: {X_2d.shape}")

## KMEANS

### Silhouette visualization

In [None]:
%%time
# Optional silhouette sweep over candidate cluster counts; disabled by default.
plot_elbow = False

if plot_elbow:
    clusters_range = range(50, 1050, 50)
    visualizer = silhouette_mod.KElbowVisualizer(
        KMeans(random_state=42),
        metric='silhouette',
        k=clusters_range,
    )
    visualizer.fit(X_2d)
    visualizer.poof()

### Clustering

In [None]:
%%time
n_clusters_kmeans = 100
kmeans = KMeans(n_clusters=n_clusters_kmeans, random_state=42)
# fit_transform output is kept: the next cell takes its column-wise argmin to
# pick one representative sample per cluster.
intents = kmeans.fit_transform(X_2d)
df['cluster'] = kmeans.labels_
print(
    f"silhouette score: {silhouette_score(X_2d, kmeans.labels_, sample_size=1000, random_state=42)}"
)

### Representative centroids of clusters

In [None]:
# For every column of `intents` (one per cluster) take the row position with
# the smallest value, then pull those rows from the matrix and the dataframe.
representative_intents_idx = intents.argmin(axis=0)
representative_intents = X_text[representative_intents_idx]
representative_rows = df.iloc[representative_intents_idx]
representative = representative_rows.sort_values(by="cluster").copy()

### Text recommendation message to chatbot option (representative dataframe)

In [None]:
%%time
# Recommend, for every representative message, the most similar menu option.
k = 1
list_documents = df_menus['OracionLematizada'].values
list_query = representative['OracionLematizada'].values

vectorizer = TfidfVectorizer(ngram_range=(1, 3))
docs_tfidf = vectorizer.fit_transform(list_documents)
index_top_k, value_top_k = utils.get_tf_idf_query_similarity(vectorizer, docs_tfidf, list_query, k)
# case k = 1: unwrap the single best match of every query
if k == 1:
    index_top_k = [idx[0] for idx in index_top_k]
    value_top_k = [val[0] for val in value_top_k]

# A non-positive similarity means "no match": index -1 and score 0.
index_top_k_list = [idx if val > 0 else -1 for idx, val in zip(index_top_k, value_top_k)]
value_top_k_list = [val if val > 0 else 0 for val in value_top_k]

# Resolve menu texts through one dictionary instead of filtering df_menus
# once per recommended row.
menu_text_by_idx = dict(zip(df_menus['idx'], df_menus['OracionLematizada']))

recommendation_df = pd.DataFrame()
recommendation_df['message'] = representative['OracionLematizada'].values
recommendation_df['cluster'] = representative['cluster'].values
recommendation_df['index_top_k'] = index_top_k_list
recommendation_df['value_top_k'] = value_top_k_list
recommendation_df['menu_top_k'] = [
    menu_text_by_idx[idx] if idx >= 0 else "-NA-" for idx in index_top_k_list
]
recommendation_df.to_csv("recommendation_representative.csv", index=False)
recommendation_df.head()

### Stats for representative messages

In [None]:
# Aggregate only the score column.  Averaging the whole grouped frame would
# also try to average the text columns, which raises TypeError on
# pandas >= 2.0.
scores_by_menu = recommendation_df.groupby('index_top_k')['value_top_k']
means = scores_by_menu.mean()
counts = scores_by_menu.count()
idxs = means.index.values

# One row per menu option plus -1 ("no match"); options that were never
# recommended get a count and a mean of 0.
idxs_cons = list(range(-1, len(df_menus)))
means_cons = means.reindex(idxs_cons, fill_value=0).tolist()
counts_cons = counts.reindex(idxs_cons, fill_value=0).tolist()
df_representative = pd.DataFrame(
    {'idx': idxs_cons, 'count': counts_cons, 'mean': means_cons}
)
df_representative.to_csv("recommendation_representative_stats.csv", index=False)
df_representative

### Text recommendation message to chatbot option (prospect dataframe)

In [None]:
%%time
# Recommend, for every message in df, the most similar menu option.
k = 1
list_documents = df_menus['OracionLematizada'].values
list_query = df['OracionLematizada'].values

vectorizer = TfidfVectorizer(ngram_range=(1, 3))
docs_tfidf = vectorizer.fit_transform(list_documents)
index_top_k, value_top_k = utils.get_tf_idf_query_similarity(vectorizer, docs_tfidf, list_query, k)
# case k = 1: unwrap the single best match of every query
if k == 1:
    index_top_k = [idx[0] for idx in index_top_k]
    value_top_k = [val[0] for val in value_top_k]

# A non-positive similarity means "no match": index -1 and score 0.
index_top_k_list = [idx if val > 0 else -1 for idx, val in zip(index_top_k, value_top_k)]
value_top_k_list = [val if val > 0 else 0 for val in value_top_k]

# Resolve menu texts through one dictionary; filtering df_menus once per
# message is quadratic when the whole dataset is processed.
menu_text_by_idx = dict(zip(df_menus['idx'], df_menus['OracionLematizada']))

recommendation_df = pd.DataFrame()
recommendation_df['message'] = df['OracionLematizada'].values
recommendation_df['cluster'] = df['cluster'].values
recommendation_df['index_top_k'] = index_top_k_list
recommendation_df['value_top_k'] = value_top_k_list
recommendation_df['menu_top_k'] = [
    menu_text_by_idx[idx] if idx >= 0 else "-NA-" for idx in index_top_k_list
]
# Final column names used by the statistics cells that follow.
recommendation_df.columns = ['message', 'cluster', 'idx', 'score', 'menu_message']
recommendation_df.to_csv("recommendation_whole.csv", index=False)
recommendation_df.head()

### Stats for whole messages

In [None]:
# Mean score and number of messages for every (cluster, menu idx) pair.
# Only the numeric 'score' column is aggregated: taking the mean of the whole
# grouped frame would include the text columns and raises TypeError on
# pandas >= 2.0.
grouped = (
    recommendation_df
    .groupby(['cluster', 'idx'])['score']
    .agg(score='mean', count='count')
    .reset_index()
)

grouped.to_csv("recommendation_whole_cluster_stats.csv", index=False)
grouped

In [None]:
# Aggregate only the score column.  Averaging the whole grouped frame would
# also try to average the text columns, which raises TypeError on
# pandas >= 2.0.
scores_by_menu = recommendation_df.groupby('idx')['score']
means = scores_by_menu.mean()
counts = scores_by_menu.count()
idxs = means.index.values

# One row per menu option plus -1 ("no match"); options that were never
# recommended get a count and a mean of 0.
idxs_cons = list(range(-1, len(df_menus)))
means_cons = means.reindex(idxs_cons, fill_value=0).tolist()
counts_cons = counts.reindex(idxs_cons, fill_value=0).tolist()
df_whole = pd.DataFrame(
    {'idx': idxs_cons, 'count': counts_cons, 'mean': means_cons}
)
df_whole.to_csv("recommendation_whole_stats.csv", index=False)
df_whole

In [None]:
# df_menus[df_menus['idx'].isin(df_whole[df_whole['count'] <= 5]['idx'].values)]

## Analysis on Adjusted Rand Index of K Means against estimated true labels

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score

# Adjusted Rand Index between the K-Means cluster and the recommended menu
# idx, restricted to messages whose similarity score reaches each cut-off.
list_score = []
list_arr = []
list_count = []
for score in np.linspace(0, 0.8, 50):
    kept = recommendation_df[recommendation_df['score'] >= score]
    list_score.append(score)
    list_arr.append(adjusted_rand_score(kept['cluster'], kept['idx']))
    list_count.append(len(kept['cluster']))

ari_table = pd.DataFrame(
    list(zip(list_score, list_arr, list_count)),
    columns=['score', 'adjrandind', 'count'],
)
print(tabulate(ari_table, headers=['similarity score', 'adjrandind', 'count'], tablefmt='pretty'))

In [None]:
# Line plot of the Adjusted Rand Index against the similarity cut-off.
df_to_plot = pd.DataFrame(
    list(zip(list_arr, list_score)),
    columns=['AdjRandIndex', 'Score'],
)
fig = plt.gcf()
fig.set_size_inches(20, 13)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('Similarity Score', fontsize=18)
plt.ylabel('AdjRandIndex', fontsize=16)
sns.lineplot(data=df_to_plot, x='Score', y='AdjRandIndex', linewidth=3)

In [None]:
# Report on the recommendations whose score is above a cut-off.
# NOTE: round(0.0000001, 6) evaluates to 0.0, so the filter keeps every row
# with a strictly positive score.
score_to_filter = round(0.0000001, 6)
super_list = recommendation_df[recommendation_df['score'] > score_to_filter].copy()
# Number of kept messages per recommended menu idx.
super_list_grouped = super_list.groupby('idx').count()[['cluster']]
# Menu options that no kept message was matched to.
missing_list = [i for i in range(0, len(df_menus)) if i not in super_list_grouped.index.values]
print(f"similarityscore used for filtering: {score_to_filter}")
print(f"elements in filtered list: {len(super_list)}")
print(f"percentage of filtered elements from original dataframe: {round(len(super_list) / len(df) * 100, 6)}%")
print(f"")
print(f"missing indexes")
print(tabulate(pd.DataFrame(df_menus[df_menus['idx'].isin(missing_list)][['idx', 'TEXTO', 'OracionLematizada']], columns=['idx', 'TEXTO', 'OracionLematizada']), showindex=False, headers=['idx', 'texto', 'oracion lematizada'], tablefmt='pretty'))
print(f"")
print(f"top 10 intents by count")
# The 'cluster' column of super_list_grouped holds the per-idx message count.
print(tabulate(super_list_grouped.sort_values(by="cluster", ascending=False).join(df_menus.set_index('idx')).reset_index()[['idx', 'TEXTO', 'cluster']].head(10), showindex=False, headers=['idx', 'texto', 'count'], tablefmt='pretty'))
print(f"")
print(f"brief comparison of clustered text to the intent")
# Random sample of 10 kept messages next to the text of their matched menu.
print(tabulate(super_list.set_index('idx').join(df_menus.set_index('idx')).reset_index()[['idx', 'message', 'TEXTO']].sample(10), showindex=False, headers=['idx', 'texto prospecto', 'texto intent'], tablefmt='pretty'))

Similarity cluster 1vs1 on intents

- Hallar cuántos intents de los 54 se cubren bajo un cierto umbral de similitud
- Denotar los intents repetidos
- Obtener una precisión de # intents encontrados / 54 (total de intents)
- Ver qué se podría cambiar para mejorar

In [None]:
# Preview df again, now including the 'cluster' column added by the K-Means cell.
df.head()

In [None]:
def _new_score_figure(xtick_fontsize):
    """Size the current figure and set the tick/label fonts used by every plot."""
    fig = plt.gcf()
    fig.set_size_inches(20, 13)
    plt.xticks(fontsize=xtick_fontsize)
    plt.yticks(fontsize=16)
    plt.xlabel("cluster", fontsize=18)
    plt.ylabel("mean score", fontsize=16)
    return fig


def run_cluster_analysis(
    df, cluster_label, text_label, menus, tfidf, total, filter_zeros
):
    """Match every cluster to the menu intent with the highest mean similarity.

    For each cluster id in ``range(total)`` the texts of that cluster are
    compared (cosine similarity over ``tfidf`` vectors) against every menu
    sentence; the menu whose similarity, averaged over the cluster's texts,
    is highest becomes the cluster's intent.

    Parameters
    ----------
    df : pandas.DataFrame
        Messages; must contain the ``cluster_label`` and ``text_label`` columns.
    cluster_label : str
        Name of the column holding the cluster id of each message.
    text_label : str
        Name of the column holding the text to vectorize.
    menus : pandas.DataFrame
        Menu options; must contain ``"OracionLematizada"`` and ``"idx"``.
        The argmax position is looked up in ``"idx"``, so ``"idx"`` is
        expected to equal the row position.
    tfidf : vectorizer
        Object exposing ``fit_transform`` / ``transform``; it is (re)fitted
        on the menu sentences by this function.
    total : int
        Number of clusters; ids ``0 .. total - 1`` are analysed.
    filter_zeros : bool
        Accepted for interface compatibility; currently not used.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with columns ``cluster``, ``count``, ``intent``,
        ``intent text`` and ``mean score``.

    Side effects: prints the result table and ``DataFrame.info()``, and shows
    three matplotlib figures (bar plot and two histograms).
    """
    # MAIN FUNCTION
    # The menu corpus does not depend on the cluster: fit the vectorizer once
    # instead of refitting it on identical data in every iteration.
    list_documents = menus["OracionLematizada"].values
    docs_tfidf = tfidf.fit_transform(list_documents)

    list_cluster = []
    list_intent = []
    list_intent_text = []
    list_mean = []
    list_count = []
    with tqdm(
        total=total, bar_format="{bar}|{desc}{percentage:3.0f}% {r_bar}", leave=False
    ) as pbar:
        for cluster in range(0, total):
            list_query = df[df[cluster_label] == cluster][text_label].values
            query_tfidf = tfidf.transform(list_query)

            # Rows are menu options, columns are the messages of this cluster.
            cosine_similarities = cosine_similarity(docs_tfidf, query_tfidf)
            intent_means = cosine_similarities.mean(axis=1)
            best_intent = np.argmax(intent_means)

            list_cluster.append(cluster)
            list_intent.append(best_intent)
            list_intent_text.append(
                menus[menus["idx"] == best_intent]["OracionLematizada"].values[0]
            )
            list_mean.append(intent_means[best_intent])
            list_count.append(len(list_query))

            pbar.update(1)

    result_columns = ["cluster", "count", "intent", "intent text", "mean score"]
    df_sim = pd.DataFrame(
        list(zip(list_cluster, list_count, list_intent, list_intent_text, list_mean)),
        columns=result_columns,
    )

    # TABULATE
    print("mean score on cluster argmax")
    print(
        tabulate(
            df_sim,
            showindex=False,
            headers=result_columns,
            tablefmt="pretty",
        )
    )
    print()

    # PLOT AXES: mean score per cluster, with a guide line every 10 clusters
    _new_score_figure(xtick_fontsize=12)
    ax = sns.barplot(data=df_sim, x="cluster", y="mean score", linewidth=3)
    rango = range(0, total + 10, 10)
    ax.set_xticks(rango)
    for x in rango:
        ax.axvline(x, linestyle="-", color="#7f7f7f", linewidth=0.5)
    plt.show()

    # PLOT HIST: distribution of the mean scores
    _new_score_figure(xtick_fontsize=16)
    sns.histplot(data=df_sim, x="mean score")
    plt.show()

    # PLOT HIST zoomed: same distribution without the zero scores
    _new_score_figure(xtick_fontsize=16)
    sns.histplot(data=df_sim[df_sim["mean score"] > 0], x="mean score")
    plt.show()

    # DF INFO
    print()
    print("DF INFO:")
    df_sim.info()
    print("")

    return df_sim

In [None]:
# Match every K-Means cluster to its closest menu intent.
tfidf = TfidfVectorizer(ngram_range=(1, 3))
total = n_clusters_kmeans
filter_zeros = False
res = run_cluster_analysis(
    df=df,
    cluster_label='cluster',
    text_label='OracionLematizada',
    menus=df_menus,
    tfidf=tfidf,
    total=total,
    filter_zeros=filter_zeros,
)

In [None]:
# Persist the per-cluster summary.
# NOTE(review): head() is not the last expression of the cell, so the notebook
# presumably does not render it.
res.head()
res.to_csv("res.csv", index=False)

In [None]:
# Preview the per-cluster summary returned by run_cluster_analysis.
res.head()

In [None]:
# Propagate the intent matched to each cluster to every message of that
# cluster.  Mapping through the cluster label keeps the result on df's own
# index and avoids pre-filling helper Series with uninitialised memory.
# When several rows of `res` share a cluster id, the first one is used.
res_by_cluster = res.drop_duplicates(subset='cluster', keep='first').set_index('cluster')
df['intent'] = df['cluster'].map(res_by_cluster['intent'])
df['intent text'] = df['cluster'].map(res_by_cluster['intent text'])
df.to_csv("to_validate.csv")

In [None]:
# Per-cluster sample sizes; a later cell reads its 'cluster' and
# 'sample_80_10' columns.
sample_ref = pd.read_csv("res_sample_sizes.csv")

In [None]:
# Preview the sample-size reference table.
sample_ref.head()

In [None]:
from pathlib import Path

# Export one CSV per cluster with a random sample of its messages, sized by
# the 'sample_80_10' column of sample_ref.
RANDOM_STATE = 42
samples_dir = Path("tf-idf-500-300/samples-per-k")
# Create the output folder up front so to_csv cannot fail on a missing path.
samples_dir.mkdir(parents=True, exist_ok=True)
for k in range(n_clusters_kmeans):
    sample_size = sample_ref[sample_ref['cluster'] == k]['sample_80_10'].values[0]
    # The seed defined above is now actually passed, so the draw is reproducible.
    temp = df[df['cluster'] == k].sample(n=sample_size, random_state=RANDOM_STATE)
    # Zero-filled column, presumably to be filled in during manual validation.
    temp['x'] = np.zeros(len(temp))
    temp.to_csv(samples_dir / f"{k}.csv", index=False)

In [None]:
# Evaluate utils.run_precision over 50 evenly spaced thresholds in [0, 1] and
# collect the four returned values per threshold.
list_threshold, list_precision, list_count, list_count_binary = [], [], [], []
for threshold in np.linspace(0, 1, 50):
    val, count_sum, count_binary_sum, precision = utils.run_precision(
        res, df_menus, threshold, show_table=False
    )
    list_threshold += [val]
    list_precision += [precision]
    list_count += [count_sum]
    list_count_binary += [count_binary_sum]
precision_rows = list(zip(list_threshold, list_precision, list_count, list_count_binary))
df_precision = pd.DataFrame(
    precision_rows,
    columns=['threshold', 'precision', 'sum count', 'binary count'],
)
df_precision

In [None]:
# Draw a reproducible random sample of 383 rows and keep their index labels
# (note: `.index` yields labels, not positions).
samples = df.sample(n=383, random_state=42).index
samples

In [None]:
# Per-message similarity to the intent matched to its cluster.
# -len(df) is a sentinel for rows the loop never assigns.  The two similarity
# columns are created as floats so that writing cosine scores into them does
# not need an implicit int -> float upcast of the column.
df['intent_sim'] = -1.0 * len(df)
df['intent_idx'] = -1 * len(df)
df['intent_sim_overall'] = -1.0 * len(df)
list_intents_means = []
# The menu corpus is the same for every cluster: vectorize it once.
docs_tfidf = tfidf.fit_transform(df_menus['OracionLematizada'].values)
for k in range(n_clusters_kmeans):
    in_cluster = df['cluster'] == k
    list_query = df.loc[in_cluster, 'OracionLematizada'].values
    query_tfidf = tfidf.transform(list_query)
    # Rows are menu options, columns are the messages of cluster k.
    cosineSimilarities = cosine_similarity(docs_tfidf, query_tfidf)
    list_intents_means = cosineSimilarities.mean(axis=1)
    best_intent = np.argmax(list_intents_means)
    # Similarity of each message to the winning menu option.
    df.loc[in_cluster, 'intent_sim'] = cosineSimilarities[best_intent]
    df.loc[in_cluster, 'intent_idx'] = best_intent
    # Mean similarity of the whole cluster to the winning menu option.
    df.loc[in_cluster, 'intent_sim_overall'] = list_intents_means[best_intent]

In [None]:
# `samples` holds index labels (taken from `.index`), so select by label.
# With a default RangeIndex this returns the same rows as positional lookup.
df.loc[samples]

In [None]:
# `samples` holds index labels, so select by label; take an explicit copy
# because a 'validation' column is added to this frame in the next cell.
validation = df.loc[samples].copy()

In [None]:
validation['validation'] = [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# Summary statistics of the sampled rows, split by the manual label (0 vs 1).
display(validation[validation['validation'] == 0].describe())
display(validation[validation['validation'] == 1].describe())

In [None]:
# Rows labelled 0 whose similarity to the matched intent is above 0.5.
validation[(validation['validation'] == 0) & (validation['intent_sim'] > 0.5)]

In [None]:
# Clustering-agreement and classification metrics.
# NOTE(review): `text_propagated` is not defined anywhere in the visible
# notebook, and `recommendation_df` as built above has no
# 'similar_menu_label_index' column -- this cell looks carried over from
# another notebook; confirm its inputs before running.
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

threshold = 0.383831
# NOTE(review): `measured` is computed but not used below.
measured = validation[validation['intent_sim'] > threshold]

# Normalized mutual information, adjusted mutual information and adjusted
# Rand score between the matched intent and the propagated labels.
nmf = metrics.normalized_mutual_info_score(validation['intent_idx'].values.tolist(), list(text_propagated))
amf = metrics.adjusted_mutual_info_score(validation['intent_idx'].values.tolist(), list(text_propagated))
ars = metrics.adjusted_rand_score(validation['intent_idx'].values.tolist(), list(text_propagated))

print(nmf)
print(amf)
print(ars)

# Encode both label sets with the same encoder before the weighted scores.
le = LabelEncoder()
le = le.fit(recommendation_df['similar_menu_label_index'].values.tolist())

true = le.transform(recommendation_df['similar_menu_label_index'].values.tolist())
pred = le.transform(list(text_propagated))
accuracy = metrics.accuracy_score(true, pred)
precision = metrics.precision_score(true, pred, average='weighted')
recall = metrics.recall_score(true, pred, average='weighted')
f1 = metrics.f1_score(true, pred, average='weighted')

print(accuracy)
print(precision)
print(recall)
print(f1)

# Hierarchical

In [None]:
import scipy.cluster.hierarchy as shc

# Ward-linkage dendrogram of the reduced document vectors.
plt.figure(figsize=(10, 7))
plt.title("Customer Dendograms")
ward_linkage = shc.linkage(X_2d, method='ward')
dend = shc.dendrogram(ward_linkage)

## DBSCAN

### Application of algorithm

In [None]:
MIN_SAMPLES = 10
EPS = 0.075
db = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, metric="cosine")
db.fit(X_2d)
labels = db.labels_
# Boolean mask flagging the core samples found by DBSCAN.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise (label -1) if present.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
n_noise_ = int((labels == -1).sum())

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
df['cluster_DBSCAN'] = labels
df.to_csv("clustering_dbscan.csv", index=False)

In [None]:
# Match every DBSCAN cluster to a menu intent using the helper from `utils`.
# NOTE(review): `menus` is a Series here, while the run_cluster_analysis
# defined in this notebook indexes its `menus` argument by column name --
# confirm that the utils version expects a Series.
menus = df_menus['OracionLematizada'].copy()
tfidf = TfidfVectorizer(ngram_range=(1, 3))
total = n_clusters_
filter_zeros = False
res = utils.run_cluster_analysis(df, 'cluster_DBSCAN', 'OracionLematizada', menus, tfidf, total, filter_zeros)

In [None]:
# Precision of the DBSCAN cluster-to-intent matches at a single threshold.
threshold = 0.05
utils.run_precision(res, df_menus, threshold, show_table=False)

In [None]:
# Show the messages of one randomly chosen DBSCAN cluster (noise label -1 is
# never drawn because the lower bound is 0).
df[df['cluster_DBSCAN'] == np.random.randint(low=0, high=n_clusters_)]

---
# END OF NOTEBOOK

## STATS FOR CLUSTERING
## KMEANS

In [None]:
# Count and mean score per (K-Means cluster, menu idx) pair.
# The data is read into its own variable: rebinding the global `df` here
# would leave later cells, which still read df['OracionLematizada'], without
# that column.
# NOTE(review): the input is the CSV written by the DBSCAN cell, which also
# carries the K-Means 'cluster' column; the 'idx' and 'score' columns are
# presumably added elsewhere -- confirm that this is the intended file.
kmeans_scores = pd.read_csv("clustering_dbscan.csv")[['cluster', 'idx', 'score']]
df_stats = (
    kmeans_scores
    .groupby(['cluster', 'idx'])['score']
    .agg(count='count', mean='mean')
    .reset_index()
)
df_stats.to_csv("clustering_kmeans_stats.csv", index=False)
df_stats.head()

## DBSCAN

In [None]:
# Count and mean score per (DBSCAN cluster, menu idx) pair.
# The data is read into its own variable: rebinding the global `df` here
# would leave later cells, which still read df['OracionLematizada'], without
# that column.  The cluster ids come from the grouping key 'cluster_DBSCAN';
# the previous lookup of a 'cluster' column failed because that column is not
# part of the three columns kept here.
# NOTE(review): 'idx' and 'score' are presumably added to the CSV elsewhere.
dbscan_scores = pd.read_csv("clustering_dbscan.csv")[['cluster_DBSCAN', 'idx', 'score']]
df_stats = (
    dbscan_scores
    .groupby(['cluster_DBSCAN', 'idx'])['score']
    .agg(count='count', mean='mean')
    .reset_index()
)
df_stats.to_csv("clustering_dbscan_stats.csv", index=False)
df_stats.head()

---
# Experimento Spacy

In [None]:
# Load the spaCy pipeline named 'es_core_news_lg'; it is used below to
# compute document-to-document similarity.
import spacy
nlp = spacy.load('es_core_news_lg')

In [None]:
# For every message, find the menu option with the highest spaCy similarity.
menu_texts = df_menus['OracionLematizada'].values
doc_menus = list(nlp.pipe(menu_texts))

list_idx = []
list_msg = []
list_argmax = []
list_rec = []
list_max = []
message_docs = nlp.pipe(df['OracionLematizada'].values)
progress = tqdm(
    message_docs,
    total=len(df),
    bar_format='{bar}|{desc}{percentage:3.0f}% {r_bar}',
    leave=False,
)
for idx, doc1 in enumerate(progress):
    # Similarity of this message against every menu document.
    list2 = np.array([doc1.similarity(doc2) for doc2 in doc_menus])
    argmax_sim = np.argmax(list2)
    max_sim = list2[argmax_sim]
    list_idx.append(idx)
    list_msg.append(doc1)
    list_argmax.append(argmax_sim)
    # Position in doc_menus equals the row position in df_menus.
    list_rec.append(menu_texts[argmax_sim])
    list_max.append(max_sim)

In [None]:
# Collect the spaCy results next to the cosine/TF-IDF recommendations.
# NOTE(review): `rec_df` is not defined anywhere in the visible notebook --
# confirm where it comes from (it is expected to have 'idx' and
# 'recommendation' columns).
spacy_df = pd.DataFrame()
spacy_df['list_idx'] = list_idx
spacy_df['list_msg'] = list_msg
spacy_df['list_argmax'] = list_argmax
spacy_df['list_rec'] = list_rec
spacy_df['list_max'] = list_max
spacy_df['cos_idx'] = rec_df['idx']
spacy_df['cos_rec'] = rec_df['recommendation']

spacy_df.head()

In [None]:
# Inspect 10 random rows of the spaCy comparison table.
spacy_df.sample(10)

In [None]:
# Histogram of the best spaCy similarity found for each message.
spacy_df['list_max'].hist()

In [None]:
# Inspect 10 random rows whose best spaCy similarity is above 0.7.
spacy_df[spacy_df['list_max'] > 0.7].sample(10)