In [1]:
# Si te faltan dependencias, descomenta y ejecuta:
# !pip install torch transformers sentencepiece tiktoken protobuf

import json
from transformers import pipeline, logging

# Silence Transformers warnings; only errors are reported.
logging.set_verbosity_error()

# Force CPU (change to 0 for GPU if your VRAM allows it).
DEVICE = -1


def _build_pipeline(task, checkpoint, **extra):
    """Create a pipeline on DEVICE whose model and tokenizer share one checkpoint.

    Args:
        task: Pipeline task name passed straight to ``pipeline``.
        checkpoint: Hub identifier used for both ``model`` and ``tokenizer``.
        **extra: Additional keyword arguments forwarded to ``pipeline``.
    """
    return pipeline(
        task,
        model=checkpoint,
        tokenizer=checkpoint,
        device=DEVICE,
        **extra,
    )


# 1) Pipeline setup: one pipeline per analysis step.
sentiment_pipe = _build_pipeline(
    "sentiment-analysis",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
)
emotion_pipe = _build_pipeline(
    "text-classification",
    "cardiffnlp/twitter-roberta-base-emotion",
    return_all_scores=True,  # keep the score of every emotion label, not just the best one
)
hate_pipe = _build_pipeline(
    "zero-shot-classification",
    "joeddav/xlm-roberta-large-xnli",
)
translation_pipe = _build_pipeline(
    "translation",
    "Helsinki-NLP/opus-mt-mul-en",
)
summarization_pipe = _build_pipeline(
    "summarization",
    "sshleifer/distilbart-cnn-12-6",
)

# 2) Función de análisis
def analyze_text(text, lang):
    """Run sentiment, emotion, hate, translation and summary analysis on one text.

    Args:
        text: Text to analyse. ``None``, empty or whitespace-only input
            short-circuits and no model is called.
        lang: Language code of ``text``. Any value other than ``"en"`` makes
            the text go through ``translation_pipe`` before summarising.

    Returns:
        dict with the keys:
            "sentiment": ``{"label", "score"}`` from ``sentiment_pipe``.
            "emotions": up to three ``{"label", "score"}`` entries from
                ``emotion_pipe``, highest score first.
            "hate": ``{"label", "score"}`` for the top-ranked label among
                "hate" / "not hate", or ``None`` if ``hate_pipe`` raised
                ``ValueError``.
            "translation": the translated text, or ``text`` itself when
                ``lang == "en"``.
            "summary": summary of the (translated) text.
        For blank input the values are ``None`` ("emotions" is ``[]``).
    """
    text = text or ""
    if not text.strip():
        return {
            "sentiment": None,
            "emotions": [],
            "hate": None,
            "translation": None,
            "summary": None,
        }

    result = {}

    # Sentiment. truncation=True asks the tokenizer to cut inputs that exceed
    # the model's maximum length instead of letting the forward pass fail.
    top_sentiment = sentiment_pipe(text, truncation=True)[0]
    result["sentiment"] = {
        "label": top_sentiment["label"],
        "score": float(top_sentiment["score"]),
    }

    # Emotions: keep the three highest-scoring labels.
    emotion_scores = emotion_pipe(text, truncation=True)[0]
    best_three = sorted(emotion_scores, key=lambda e: e["score"], reverse=True)[:3]
    result["emotions"] = [
        {"label": e["label"], "score": float(e["score"])} for e in best_three
    ]

    # Hate vs. not hate (best effort: a ValueError just leaves the field empty).
    try:
        zero_shot = hate_pipe(text, candidate_labels=["hate", "not hate"])
        result["hate"] = {
            "label": zero_shot["labels"][0],
            "score": float(zero_shot["scores"][0]),
        }
    except ValueError:
        result["hate"] = None

    # Translate to English unless the text is already English.
    if lang != "en":
        summary_input = translation_pipe(text, max_length=256, truncation=True)[0][
            "translation_text"
        ]
    else:
        summary_input = text
    result["translation"] = summary_input

    # Dynamic summary length: aim below the input's token count, clamped to
    # the range [5, 60]. A plain id list is enough to count tokens, so no
    # tensor is built.
    token_count = len(summarization_pipe.tokenizer.encode(summary_input))
    max_len = max(5, min(60, token_count - 2))
    result["summary"] = summarization_pipe(
        summary_input, max_length=max_len, min_length=5, truncation=True
    )[0]["summary_text"]

    return result

# 3) Load, process and save the JSON, printing progress for tweets and replies
input_path = "tweets_españoles_anotados_finales_id.json"
output_path = "tweets_españoles_evaluados.json"
# Language of the input texts: 'el' for Greek, 'es' for Spanish, 'en' for English.
# The input file holds Spanish tweets, so this is 'es'. analyze_text only
# distinguishes "en" from everything else, so the value decides whether the
# translation step runs.
lang = "es"

with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

total = len(data)
for idx, item in enumerate(data, start=1):
    print(f"Procesando tweet {idx}/{total}")
    tweet_text = item.get("tweet", "")
    print(" Tweet:", tweet_text)
    tweet_analysis = analyze_text(tweet_text, lang)
    print(" Analysis tweet:", json.dumps(tweet_analysis, ensure_ascii=False, indent=2))
    item["analysis"] = tweet_analysis

    # `or []` also covers an explicit null "replies" value, which would
    # otherwise make enumerate() raise TypeError.
    replies = item.get("replies") or []
    reply_count = len(replies)
    for r_idx, reply in enumerate(replies, start=1):
        print(f"  Procesando reply {r_idx}/{reply_count}")
        reply_text = reply.get("reply", "")
        print("   Reply:", reply_text)
        reply_analysis = analyze_text(reply_text, lang)
        print("   Analysis reply:", json.dumps(reply_analysis, ensure_ascii=False, indent=2))
        reply["analysis"] = reply_analysis

# The enriched data is written only after every tweet has been processed.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"\n✅ Análisis completado. Resultado guardado en {output_path}")




Procesando tweet 1/300
 Tweet: Algo importante se me escapa. 
Cuál es la clave de estos movimientos por “la Salud y la Libertad”

 Quizás la desconfianza y el escepticismo de sectores ciudadanos hacia todo lo que sea política, de la mano de los populismos ?
 Analysis tweet: {
  "sentiment": {
    "label": "neutral",
    "score": 0.8190348148345947
  },
  "emotions": [
    {
      "label": "joy",
      "score": 0.36138203740119934
    },
    {
      "label": "anger",
      "score": 0.26649489998817444
    },
    {
      "label": "optimism",
      "score": 0.2236553579568863
    }
  ],
  "hate": {
    "label": "not hate",
    "score": 0.7658279538154602
  },
  "translation": "Something important escapes me. What is the key to these movements by “health and Liberty” Maybe the distrust and scepticism of citizens’ sectors towards everything political, from the hands of the populists?",
  "summary": " What is the key to these movements by “health and Liberty” Maybe the distrust and scepticis

KeyboardInterrupt: 

In [None]:
!pip install sacremoses

In [2]:
import json
import stanza

# Fetch the Spanish models (only needed the first time)
STANZA_LANG = "es"
stanza.download(STANZA_LANG)

# Build the pipeline used by analyze_with_stanza
nlp = stanza.Pipeline(
    lang=STANZA_LANG,
    processors="tokenize,sentiment",
    use_gpu=True,
)

# Función para analizar con Stanza
def analyze_with_stanza(text):
    """Score the sentiment of ``text`` sentence by sentence with the Stanza pipeline.

    Args:
        text: Text to analyse. ``None``, empty or whitespace-only input
            short-circuits without calling the pipeline.

    Returns:
        ``{"sentiment": None}`` for blank input. Otherwise a dict with:
            "sentiment": the rounded mean of the per-sentence sentiment
                values, or ``None`` if the pipeline produced no sentences.
            "sentences": list of ``{"text", "sentiment"}``, one per sentence.
        NOTE(review): per the Stanza docs the sentiment values are presumably
        0 = negative, 1 = neutral, 2 = positive — confirm for the Spanish model.
    """
    # Treat None like empty text (as analyze_text does); a JSON null would
    # otherwise raise AttributeError on .strip().
    text = text or ""
    if not text.strip():
        return {"sentiment": None}

    doc = nlp(text)
    sentences = [{"text": s.text, "sentiment": s.sentiment} for s in doc.sentences]
    if not sentences:
        return {"sentiment": None, "sentences": sentences}

    # round() rounds halves to the nearest even integer (0.5 -> 0, 1.5 -> 2).
    mean_sentiment = sum(s["sentiment"] for s in sentences) / len(sentences)
    return {"sentiment": round(mean_sentiment), "sentences": sentences}

# Input file path
input_path = "spain/Completado/tweets_españoles_evaluados_completo.json"

# Load the data
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Analyse replies only and attach the result as "sentiment_stanza"
total = len(data)
for idx, item in enumerate(data, start=1):
    print(f"Procesando tweet {idx}/{total}")
    # `or []` / `or ""` also cover explicit null values in the JSON.
    for reply in item.get("replies") or []:
        texto = reply.get("reply") or ""
        resultado_stanza = analyze_with_stanza(texto)
        # Create the "analysis" block when a reply does not have one yet,
        # instead of failing with KeyError/TypeError.
        analysis = reply.get("analysis")
        if analysis is None:
            analysis = reply["analysis"] = {}
        analysis["sentiment_stanza"] = resultado_stanza

# Save the enriched result
output_path = "spain/Tweets_conversaciones_con_stanza.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ Análisis de sentimiento con Stanza añadido a todas las replies.")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-04-30 07:55:51 INFO: Downloaded file to /home/jupyter-lquijano/stanza_resources/resources.json
2025-04-30 07:55:51 INFO: Downloading default packages for language: es (Spanish) ...
2025-04-30 07:55:53 INFO: File exists: /home/jupyter-lquijano/stanza_resources/es/default.zip
2025-04-30 07:55:56 INFO: Finished downloading models and saved to /home/jupyter-lquijano/stanza_resources
2025-04-30 07:55:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-04-30 07:55:56 INFO: Downloaded file to /home/jupyter-lquijano/stanza_resources/resources.json
2025-04-30 07:55:57 INFO: Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| sentiment | tass2020_charlm |

2025-04-30 07:55:57 INFO: Using device: cpu
2025-04-30 07:55:57 INFO: Loading: tokenize
2025-04-30 07:55:57 INFO: Loading: mwt
2025-04-30 07:55:57 INFO: Loading: sentiment
2025-04-30 07:55:58 INFO: Done loading processors!


Procesando tweet 1/1000
Procesando tweet 2/1000
Procesando tweet 3/1000
Procesando tweet 4/1000
Procesando tweet 5/1000
Procesando tweet 6/1000
Procesando tweet 7/1000
Procesando tweet 8/1000
Procesando tweet 9/1000
Procesando tweet 10/1000
Procesando tweet 11/1000
Procesando tweet 12/1000
Procesando tweet 13/1000
Procesando tweet 14/1000
Procesando tweet 15/1000
Procesando tweet 16/1000
Procesando tweet 17/1000
Procesando tweet 18/1000
Procesando tweet 19/1000
Procesando tweet 20/1000
Procesando tweet 21/1000
Procesando tweet 22/1000
Procesando tweet 23/1000
Procesando tweet 24/1000
Procesando tweet 25/1000
Procesando tweet 26/1000
Procesando tweet 27/1000
Procesando tweet 28/1000
Procesando tweet 29/1000
Procesando tweet 30/1000
Procesando tweet 31/1000
Procesando tweet 32/1000
Procesando tweet 33/1000
Procesando tweet 34/1000
Procesando tweet 35/1000
Procesando tweet 36/1000
Procesando tweet 37/1000
Procesando tweet 38/1000
Procesando tweet 39/1000
Procesando tweet 40/1000
Procesand