In [15]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
import textwrap
from mypackage import dir

# Lift pandas' display truncation: None means "no limit" for both options,
# so displayed frames show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [16]:
# Project configuration and data locations (plain Python values, not OS environment variables).
# NOTE(review): `dir` is the project-local `mypackage.dir` module imported above;
# it shadows the built-in `dir()` for the rest of the notebook.
project = 'belgium'
# `make_dir` returns a callable that is invoked below with a sub-folder name;
# presumably it resolves (and maybe creates) the project's data folders — TODO confirm in mypackage.
data = dir.make_dir(project) 
# Later cells combine these with the `/` operator, so they are presumably pathlib.Path-like.
processed = data('processed')
outputs = data('outputs')

In [17]:
def stratified_sample(df):
    """
    Draw a reproducible random sample from one group of a groupby.

    Meant to be passed to ``groupby(...).apply``: pandas exposes the group key
    as ``df.name``. The group whose key is 0 is sampled at 30 %, every other
    group at 10 %. The seed is fixed (42) so the same rows are drawn each run.
    """
    fraccion = 0.3 if df.name == 0 else 0.1
    return df.sample(frac=fraccion, random_state=42)

def asignar_topicos_pandas(df):
    """
    Assign each row the single highest-priority topic flagged for it.

    The topic flag columns are checked in priority order
    (``aloja_general``, ``habitacion``, ``servicio``, ``comida``, ``costo``,
    ``regreso``). A row gets topic ``k`` when the k-th column equals 1 and
    every higher-priority column equals 0. Rows that match no rule (for
    example all flags 0, or a NaN in a higher-priority column) are dropped.

    Parameters:
    - df: pd.DataFrame containing the six flag columns listed above.

    Returns:
    - pd.DataFrame: a new frame (the input is not modified) holding only the
      rows that received a topic, with an extra ``topico`` column whose values
      are the strings '1'..'6'. The original index is preserved.
    """
    prioridad = ['aloja_general', 'habitacion', 'servicio', 'comida', 'costo', 'regreso']
    sin_topico = 'h'  # sentinel for "no rule matched"; such rows are filtered out below

    # Build the conditions in priority order. `anteriores_en_cero` tracks
    # "all higher-priority flags are exactly 0" so far.
    conditions = []
    anteriores_en_cero = np.ones(len(df), dtype=bool)
    for columna in prioridad:
        valores = df[columna]
        conditions.append((valores == 1).to_numpy() & anteriores_en_cero)
        anteriores_en_cero = anteriores_en_cero & (valores == 0).to_numpy()

    # Choices are strings on purpose: mixing integer choices with the string
    # sentinel has no common dtype and makes np.select raise on recent NumPy
    # (older NumPy silently converted the integers to these same strings).
    choices = [str(numero) for numero in range(1, len(prioridad) + 1)]
    topico = np.select(conditions, choices, default=sin_topico)

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    resultado = df.assign(topico=topico)
    return resultado[resultado['topico'] != sin_topico].copy()

def agregar_saltos_linea(texto, ancho=50):
    """
    Wrap a text to a maximum line width, joining the lines with HTML breaks.

    Parameters:
    - texto: value to wrap; it is converted with ``str()`` first.
    - ancho: int - maximum number of characters per line.

    Returns:
    - str - the text with runs of whitespace collapsed to single spaces and
      the wrapped lines separated by ``<br>``.
    """
    # split()/join collapses any whitespace run (including newlines) and trims the ends.
    palabras = str(texto).split()
    texto_limpio = ' '.join(palabras)

    envoltura = textwrap.TextWrapper(width=ancho)
    return '<br>'.join(envoltura.wrap(texto_limpio))

def plot_embeddings(df: pd.DataFrame, title: str, save: bool = False):
    """
    Show an interactive Plotly scatter plot of 2-D embedding coordinates.

    Marker position comes from the ``x``/``y`` columns, marker size from
    ``importancia``, colour from ``id_topic`` and symbol from ``label``;
    ``texto`` is added to the hover box. Display names for the columns are
    read from the module-level ``labels`` dict, which must exist by the time
    this function is called.

    Args:
        df (pd.DataFrame): Frame with the columns ``x``, ``y``,
            ``importancia``, ``id_topic``, ``label`` and ``texto``.
        title (str): Subtitle rendered under the fixed "Cloud" heading. When
            ``save`` is true it is also used verbatim as the HTML file name.
        save (bool): If true, write the figure to ``outputs / f'{title}.html'``
            before showing it (``outputs`` is the module-level output location).
    """
    fig = px.scatter(df, x="x", y="y",
                     size='importancia',
                     color="id_topic",
                     symbol='label',
                     hover_data=['texto'],
                     labels=labels,
                     )

    # Trace-level styling: text position (only relevant if traces carry text)
    # and slightly transparent markers so overlapping points remain visible.
    fig.update_traces(textposition='top center')
    fig.update_traces(opacity=0.8)

    # Dark theme
    fig.update_layout(template="plotly_dark")

    # Centered two-line title, blank axis titles and hidden ticks: the PCA
    # axes are drawn without tick marks or tick labels.
    fig.update_layout(
        title_text=f'Cloud <br><sup> {title} </sup>',
        title_x=0.5,
        xaxis_title=' ',
        yaxis_title=' ',
        font=dict(family="Arial", size=12),
        xaxis_tickformat='.2f',
        yaxis_tickformat='.2f',

        xaxis=dict(
            ticks='',
            showticklabels=False
        ),
        yaxis=dict(
            ticks='',
            showticklabels=False
        )

    )

    # Hover shows the point closest to the cursor
    fig.update_layout(hovermode='closest')

    # Axis lines are switched off; width/colour only apply if they are re-enabled
    fig.update_xaxes(showline=False, linewidth=2, linecolor='gray')
    fig.update_yaxes(showline=False, linewidth=2, linecolor='gray')

    if save:
        # The f-string already yields the final name. The former trailing
        # `.format('cloud')` was a no-op that raised (or altered the name)
        # whenever `title` contained `{`/`}`.
        fig.write_html(outputs / f'{title}.html')

    # Display the figure
    fig.show()

# Human-readable display names for DataFrame columns, passed to plotly express
# as `labels=` by plot_embeddings (axis titles, legend titles, hover entries).
labels = dict(
    words='Word',
    group='Group',
    id_topic='Topic',
    label='Label',
    importancia='Importancia',
    texto='Comentario original',
    x='Dim 1',
    y='Dim 2',
)

In [18]:
# Load the processed hotel reviews and keep only the four columns used below.
df = (
    pd.read_parquet(processed / 'df_hotel_transformados.parquet.gzip')
    .loc[:, ['id', 'texto', 'label', 'importancia_tfidf']]
)
df.head()

Unnamed: 0,id,texto,label,importancia_tfidf
0,1,el mejor lugar para comer sushi. excelente lug...,2,3.833645
1,2,vista hermosa!. me sorprendió la maravillosa v...,2,4.63501
2,3,"desastroso. buenas noches, ante todo explicar...",0,7.507874
3,4,todo bien excepto que se niegan a dar vasos de...,1,3.936999
4,5,el mejor zoologico de méxico. el ambiente es m...,2,3.567779


In [19]:
# Load the embedding vectors and attach text, label and TF-IDF importance by review id.
df_embeddings = pd.read_parquet(processed/'df_embeddings.parquet.gzip')
df_embeddings = pd.merge(df_embeddings, df, how='left', on=['id'])

# Stratified subsample per label (fractions are defined in stratified_sample).
# The grouping key is the array of label *values* rather than the column name:
# pandas 2.2 deprecates letting apply() operate on a grouping column and newer
# releases exclude it from each group, which would drop `label` from the
# result. Grouping by an external array keeps the column in every group while
# apply() still exposes the group key as `.name`.
etiquetas = df_embeddings['label'].to_numpy()
df_embeddings = (
    df_embeddings
    .groupby(etiquetas, group_keys=False)
    .apply(stratified_sample)
    .reset_index(drop=True)
)

# Keep only reviews whose TF-IDF importance is at least 4.
df_embeddings = df_embeddings[df_embeddings['importancia_tfidf'] >= 4]
df_embeddings.head()





Unnamed: 0,id,embeddings,texto,label,importancia_tfidf
2,122612,"[0.02409866452217102, 0.0077154068276286125, 0...","hay cucarachas. hay curachas, el día de hoy es...",0,4.67501
5,5346,"[0.020204950124025345, -0.022348560392856598, ...","muy mal servicio. el lugar es muy bonito, buen...",0,4.799042
6,28423,"[0.0003062793693970889, -0.03732236474752426, ...",no recomendable ya que el hotel es malo malo m...,0,4.188769
9,42021,"[0.006794727873057127, -0.01060000341385603, 0...",decepción. cuando escribes habana en cualquier...,0,6.096193
13,129525,"[-0.013585399836301804, -0.04217841476202011, ...",asco y timo. el hotel carecía de casi todo con...,0,6.340737


In [20]:
# Load the manually annotated flags (evaluation, cleanliness and topic columns) per review id.
df_topicos_manual = pd.read_parquet(processed/'df_topicos_manual.parquet.gzip')
df_topicos_manual = df_topicos_manual.loc[:,['id', 'eval_pos', 'eval_neg', 'aloja_general', 'habitacion', 'servicio', 'comida',
                                             'limp_pos', 'limp_neg', 'costo', 'regreso',]]

# Row-wise sums of the positive/negative flags. assign() builds a new frame,
# so no column is ever written onto a filtered slice (avoids the
# chained-assignment warning the step-by-step version could trigger).
df_topicos_manual = df_topicos_manual.assign(
    evaluacion=df_topicos_manual[['eval_pos', 'eval_neg']].sum(axis=1),
    evalucacion_limpieza=df_topicos_manual[['limp_pos', 'limp_neg']].sum(axis=1),
)
# Keep rows where each pair of flags sums to exactly 1.
df_topicos_manual = df_topicos_manual[
    (df_topicos_manual['evaluacion'] == 1)
    & (df_topicos_manual['evalucacion_limpieza'] == 1)
]

# Attach the annotations and discard reviews with any missing evaluation flag.
df_embeddings = pd.merge(df_embeddings, df_topicos_manual, how='left', on=['id'])
df_embeddings = df_embeddings.dropna(subset=['eval_pos', 'eval_neg', 'limp_pos', 'limp_neg'])

# Binary ratings (1 = positive flag set, 0 = otherwise), inserted directly at
# column positions 5 and 6 instead of creating placeholder columns first.
df_embeddings.insert(5, 'calificacion', np.where(df_embeddings['eval_pos'] == 1, 1, 0))
df_embeddings.insert(6, 'calificacion_limpieza', np.where(df_embeddings['limp_pos'] == 1, 1, 0))

# Assign the highest-priority topic; rows without any topic are dropped.
df_embeddings = asignar_topicos_pandas(df_embeddings)

# Remove the raw flag columns now that rating and topic are derived
# (non-inplace drop: the frame returned above may be a filtered slice).
df_embeddings = df_embeddings.drop(
    columns=['eval_pos', 'eval_neg', 'limp_pos', 'limp_neg', 'aloja_general', 'habitacion', 'servicio',
             'comida', 'costo', 'regreso']
)
df_embeddings.head()

Unnamed: 0,id,embeddings,texto,label,importancia_tfidf,calificacion,calificacion_limpieza,evaluacion,evalucacion_limpieza,topico
42,43117,"[-0.017055250704288483, -0.06946675479412079, ...","mal servicio del hotel, elevador no funcionaba...",0,4.699726,0,0,1.0,1.0,1
44,88340,"[-0.014535254798829556, -0.03000936470925808, ...",penoso. nos dieron una habitación deplorable.h...,0,4.342254,1,0,1.0,1.0,1
47,28126,"[0.000245657138293609, 0.01085940282791853, -0...",instalaciones del hotel muy deterioradas. nece...,0,4.898652,0,0,1.0,1.0,1
63,37491,"[-0.014410164207220078, -0.009767151437699795,...",el peor hotel en el que me he alojado .... si ...,0,4.188763,0,0,1.0,1.0,1
69,69933,"[0.01075123529881239, -0.020700616762042046, 0...","servicio pésimo, limpieza cuestionable. mal se...",0,4.999484,0,1,1.0,1.0,2


In [21]:
# Stack the per-row embedding vectors into a single 2-D array (one row per review).
embeddings = np.vstack(df_embeddings["embeddings"].values)

# Centroid of each topic: element-wise mean of the embeddings assigned to it.
centroides = df_embeddings.groupby("topico")["embeddings"].apply(
    lambda x: np.mean(np.vstack(x.values), axis=0)
).to_dict()

# Euclidean distance from every review to the centroid of its own topic.
# Done in one vectorised step (row-aligned centroid matrix + row-wise norm)
# instead of one sklearn call per row inside an iterrows() loop.
centroides_por_fila = np.vstack([centroides[topico] for topico in df_embeddings["topico"]])
distancias = np.linalg.norm(embeddings - centroides_por_fila, axis=1)

# Add the distances as a new column; assign() returns a new frame, so no
# column is written onto a filtered slice.
df_embeddings = df_embeddings.assign(distancia_al_centroide=distancias)
df_embeddings.head()

Unnamed: 0,id,embeddings,texto,label,importancia_tfidf,calificacion,calificacion_limpieza,evaluacion,evalucacion_limpieza,topico,distancia_al_centroide
42,43117,"[-0.017055250704288483, -0.06946675479412079, ...","mal servicio del hotel, elevador no funcionaba...",0,4.699726,0,0,1.0,1.0,1,0.631439
44,88340,"[-0.014535254798829556, -0.03000936470925808, ...",penoso. nos dieron una habitación deplorable.h...,0,4.342254,1,0,1.0,1.0,1,0.568424
47,28126,"[0.000245657138293609, 0.01085940282791853, -0...",instalaciones del hotel muy deterioradas. nece...,0,4.898652,0,0,1.0,1.0,1,0.703024
63,37491,"[-0.014410164207220078, -0.009767151437699795,...",el peor hotel en el que me he alojado .... si ...,0,4.188763,0,0,1.0,1.0,1,0.550413
69,69933,"[0.01075123529881239, -0.020700616762042046, 0...","servicio pésimo, limpieza cuestionable. mal se...",0,4.999484,0,1,1.0,1.0,2,0.531468


In [22]:
# Summary statistics (count, mean, std, quartiles, ...) of the centroid distance within each topic.
descripcion_distancias = (
    df_embeddings
    .groupby("topico")["distancia_al_centroide"]
    .describe()
)
descripcion_distancias

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
topico,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,99.0,0.567288,0.122865,0.381042,0.485127,0.534871,0.606739,1.019631
2,13.0,0.580629,0.073197,0.468297,0.531468,0.599531,0.624906,0.732036
3,41.0,0.71634,0.074202,0.591582,0.667627,0.710252,0.759542,0.92934
4,6.0,0.610549,0.061042,0.537445,0.5656,0.603884,0.665338,0.679419
5,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,,0.0,0.0,0.0,0.0,0.0


In [23]:
# Topic -> centroid mapping: the element-wise mean of the embeddings in each topic.
# NOTE(review): the distance cell above already builds the same mapping.
centroides = {
    topico: np.mean(np.vstack(vectores.values), axis=0)
    for topico, vectores in df_embeddings.groupby("topico")["embeddings"]
}

In [24]:
# Number of remaining rows per value of `label` after the sampling and filtering cells above.
df_embeddings['label'].value_counts()

label
2    137
1     14
0     10
Name: count, dtype: int64

In [25]:
# 2-D array with one embedding vector per row, used as the PCA input.
embedding_matrix = np.vstack(df_embeddings['embeddings'].values)

# Pull the per-row attributes out as plain Python lists, in the same row order
# as embedding_matrix. Note: `calificacion` (not `label`) feeds `id_label`.
# NOTE(review): `id` shadows the built-in; the name is kept because the next cell reads it.
id, id_topic, id_importancia, id_label, id_texto = (
    df_embeddings[columna].values.tolist()
    for columna in ('id', 'topico', 'importancia_tfidf', 'calificacion', 'texto')
)

In [26]:
# Project the embeddings onto their first two principal components.
# random_state is fixed so the projection is reproducible even when
# scikit-learn selects a randomized SVD solver (it may for larger inputs).
pca = PCA(n_components=2, random_state=42)
fm = pd.DataFrame(pca.fit_transform(embedding_matrix), columns=['x','y'])

# Attach the per-row attributes extracted in the previous cell (same row order).
fm['id'] = id
fm['id_topic'] = id_topic
fm['importancia'] = id_importancia
fm['label'] = id_label
fm['texto'] = id_texto
# Topic as string so plotly treats it as a discrete category for colouring.
fm['id_topic'] = fm['id_topic'].astype(str)

# Capitalise each comment and wrap it with <br> breaks for the hover box.
fm['texto'] = fm['texto'].str.capitalize().apply(lambda x: agregar_saltos_linea(x, ancho=60))

# Marker size: exponentiate the importance to spread the values out, then
# double it for negative comments (label 0). The mask is computed once,
# before `label` is replaced by its text form.
es_negativo = fm['label'] == 0
fm['importancia'] = np.exp(fm['importancia']) * np.where(es_negativo, 2, 1)

fm['label'] = np.where(es_negativo, 'Negative', 'Positive')

fm.head()

Unnamed: 0,x,y,id,id_topic,importancia,label,texto
0,0.156243,0.073063,43117,1,219.834091,Negative,"Mal servicio del hotel, elevador no funcionaba..."
1,0.220769,-0.0516,88340,1,76.880665,Positive,Penoso. nos dieron una habitación deplorable.h...
2,0.283515,-0.0279,28126,1,268.217739,Negative,Instalaciones del hotel muy deterioradas. nece...
3,0.22599,-0.14512,37491,1,131.882372,Negative,El peor hotel en el que me he alojado .... si ...
4,-0.346548,0.323958,69933,2,296.673285,Negative,"Servicio pésimo, limpieza cuestionable. mal se..."


In [27]:
# Render the PCA scatter plot; with save=True the figure is also written to an HTML file named after the title.
plot_embeddings(fm, title='Embeddings Visualization and Simple Topic Clustering', save=True)

In [28]:
# Prints a fixed marker string.
# NOTE(review): looks like a leftover "notebook ran to the end" check — confirm it is still wanted.
print('ok_')

ok_
