In [25]:
from pathlib import Path
import pandas as pd

In [26]:
# Directory the notebook is executed from, resolved to an absolute path
base_dir = Path(".").resolve()

# Input CSVs live in a sibling folder of the working directory: ../franco/API-connect
path_csv_hist = base_dir.parent.joinpath(
    "franco", "API-connect", "posts_con_sentimiento_historia.csv"
)
path_csv_agrupado = base_dir.parent.joinpath(
    "franco", "API-connect", "analisis_sentimiento_diario.csv"
)

In [27]:
# Load the per-post CSV; `df` stays as the raw frame and `df_hist` is an
# independent (deep) working copy of it
df = pd.read_csv(path_csv_hist)
df_hist = df.copy(deep=True)

# Load the daily aggregate CSV
df_diario = pd.read_csv(path_csv_agrupado)

In [28]:
# Preview the first five rows of the per-post frame
df_hist.head(n=5)

Unnamed: 0,actor_handle,uri,text,created_at,likes,reposts,replies,text_length,word_count,hour,engagement,date,sentiment,sentiment_vader,interpretacion_sentimiento
0,mclem.org,at://did:plc:skf244z64dfa5yvynksq65ij/app.bsky...,The Executive Branch must cease its illegal im...,2025-04-16 12:27:53.844000+00:00,0,0,0,290,44,12,0,2025-04-16,-0.166667,-0.802,Negativo
1,mclem.org,at://did:plc:skf244z64dfa5yvynksq65ij/app.bsky...,The US President who literally campaigned on a...,2025-04-16 12:27:53.843000+00:00,0,0,1,287,41,12,1,2025-04-16,0.0,-0.8192,Negativo
2,bloomberg.com,at://did:plc:uewxgchsjy4kmtu7dcxa77us/app.bsky...,Harvard's $9 billion battle with Trump risks e...,2025-04-16 12:18:42.398000+00:00,15,4,3,124,17,12,22,2025-04-16,0.0,-0.5719,Negativo
3,theguardian.com,at://did:plc:vovinwhtulbsx4mwfw26r5ni/app.bsky...,My parents holding hands after their assisted ...,2025-04-16 12:18:41.212000+00:00,110,10,3,94,13,12,123,2025-04-16,0.25,0.0,Neutral
4,theguardian.com,at://did:plc:vovinwhtulbsx4mwfw26r5ni/app.bsky...,Doge unemployment ‘fraud’ discoveries are old ...,2025-04-16 12:06:49.417000+00:00,140,54,6,79,12,12,200,2025-04-16,0.1,-0.4404,Negativo


In [29]:
# Preview the first five rows of the daily aggregate frame
df_diario.head(n=5)

Unnamed: 0,date,sentimiento_promedio,interpretacion
0,2024-01-01,0.157135,Positivo
1,2024-01-02,0.112922,Positivo
2,2024-01-03,0.075192,Positivo
3,2024-01-04,0.112786,Positivo
4,2024-01-05,0.051773,Positivo


In [30]:
# Transformations: give both frames a plain calendar-date column named 'fecha'.
# The daily aggregate's columns are renamed so its label column stays
# distinguishable from the per-post one.
df_diario = df_diario.rename(columns={"date": "fecha", "interpretacion": "interpretacion_promedio"})

# Parse as UTC-aware timestamps; errors='coerce' turns unparseable values into NaT
# instead of raising.
df_diario['fecha'] = pd.to_datetime(df_diario['fecha'], utc=True, errors='coerce')
# Read the column from df_hist itself so the assignment does not rely on
# index alignment with another frame.
df_hist['created_at'] = pd.to_datetime(df_hist['created_at'], utc=True, errors='coerce')


# Drop the time-of-day: keep only the date part in both frames
df_diario['fecha'] = df_diario['fecha'].dt.date
df_hist['fecha'] = df_hist['created_at'].dt.date

In [31]:
# Build one row per date holding the most frequent sentiment label among that date's posts
def _etiqueta_mas_frecuente(etiquetas):
    """Return the most frequent value in `etiquetas`, or NaN if it has none.

    When several values tie, the first element of `Series.mode()` is used.
    """
    modas = etiquetas.mode()
    # mode() ignores nulls, so an all-null group yields an empty result;
    # indexing it with .iloc[0] would raise IndexError.
    return modas.iloc[0] if not modas.empty else float("nan")


df_resumen = (
    df_hist.groupby('fecha')['interpretacion_sentimiento']
      .agg(_etiqueta_mas_frecuente)
      .reset_index()
      .rename(columns={'interpretacion_sentimiento': 'interpretacion_moda'})
)

df_resumen.head()

Unnamed: 0,fecha,interpretacion_moda
0,2024-01-01,Positivo
1,2024-01-02,Negativo
2,2024-01-03,Positivo
3,2024-01-04,Negativo
4,2024-01-05,Negativo


In [32]:
# Attach the daily-average columns to the per-date modal label (left join on 'fecha'),
# then flag the dates where the two labels disagree.
# NOTE(review): with how='left', a date missing from df_diario gets NaN in
# 'interpretacion_promedio', and NaN never compares equal, so such rows are flagged
# as differing — confirm that this is the intended treatment.
df_comparado = pd.merge(df_resumen, df_diario, how='left', on='fecha')
df_comparado['difiere'] = df_comparado['interpretacion_moda'].ne(
    df_comparado['interpretacion_promedio']
)

df_comparado.head()

Unnamed: 0,fecha,interpretacion_moda,sentimiento_promedio,interpretacion_promedio,difiere
0,2024-01-01,Positivo,0.157135,Positivo,False
1,2024-01-02,Negativo,0.112922,Positivo,True
2,2024-01-03,Positivo,0.075192,Positivo,False
3,2024-01-04,Negativo,0.112786,Positivo,True
4,2024-01-05,Negativo,0.051773,Positivo,True


In [33]:
# Number of dates whose modal label differs from the daily-average label
total_diferencias = df_comparado['difiere'].sum()
# Same count expressed as a percentage of all rows in the comparison frame
porcentaje_diferencias = total_diferencias / df_comparado.shape[0] * 100

In [34]:
# Report the absolute and relative number of differing dates, one per line
print(
    f"Total de diferencias: {total_diferencias}",
    f"Porcentaje respecto al total: {porcentaje_diferencias:.2f}%",
    sep="\n",
)

Total de diferencias: 300
Porcentaje respecto al total: 63.69%
