In [None]:
# Si te faltan dependencias, descomenta y ejecuta:
# !pip install torch transformers sentencepiece tiktoken protobuf

import json
from transformers import pipeline, logging

# Only errors from Transformers are logged; warnings are silenced.
logging.set_verbosity_error()

# Device index shared by every pipeline below: -1 keeps inference on the CPU
# (switch to 0 to run on the first GPU if its VRAM allows).
DEVICE = -1

# 1) Pipeline setup. Each pipeline loads its model and its tokenizer from the
#    same checkpoint, so every checkpoint name is spelled out exactly once.
SENTIMENT_CKPT = "cardiffnlp/twitter-roberta-base-sentiment-latest"
EMOTION_CKPT = "cardiffnlp/twitter-roberta-base-emotion"
ZERO_SHOT_CKPT = "joeddav/xlm-roberta-large-xnli"
TRANSLATION_CKPT = "Helsinki-NLP/opus-mt-mul-en"
SUMMARIZATION_CKPT = "sshleifer/distilbart-cnn-12-6"

sentiment_pipe = pipeline(
    "sentiment-analysis", model=SENTIMENT_CKPT, tokenizer=SENTIMENT_CKPT, device=DEVICE
)
# return_all_scores=True asks for the score of every emotion label,
# not just the best one; analyze_text picks the top three itself.
emotion_pipe = pipeline(
    "text-classification",
    model=EMOTION_CKPT,
    tokenizer=EMOTION_CKPT,
    return_all_scores=True,
    device=DEVICE,
)
# Zero-shot classifier; analyze_text uses it with the labels "hate" / "not hate".
hate_pipe = pipeline(
    "zero-shot-classification", model=ZERO_SHOT_CKPT, tokenizer=ZERO_SHOT_CKPT, device=DEVICE
)
translation_pipe = pipeline(
    "translation", model=TRANSLATION_CKPT, tokenizer=TRANSLATION_CKPT, device=DEVICE
)
summarization_pipe = pipeline(
    "summarization", model=SUMMARIZATION_CKPT, tokenizer=SUMMARIZATION_CKPT, device=DEVICE
)

# 2) Función de análisis (igual que antes)
def analyze_text(text, lang):
    """Run every analysis pipeline on one text and collect the results.

    Args:
        text: Tweet or reply text. ``None``, empty and whitespace-only values
            short-circuit to an all-empty result without calling any model.
        lang: Language code of ``text``. Any value other than ``"en"`` sends
            the text through ``translation_pipe``; ``"en"`` text is used as is.

    Returns:
        dict with the keys
        ``"sentiment"``   -- ``{"label", "score"}`` from ``sentiment_pipe``;
        ``"emotions"``    -- the three highest-scoring ``{"label", "score"}``
                             entries from ``emotion_pipe``, best first;
        ``"hate"``        -- ``{"label", "score"}`` of the top zero-shot label
                             among "hate" / "not hate", or ``None`` when the
                             zero-shot pipeline raises ``ValueError``;
        ``"translation"`` -- the translated text (the input itself for "en");
        ``"summary"``     -- summary of the translation.
        For empty input every value is ``None`` (``[]`` for ``"emotions"``).
    """
    text = text or ""
    if not text.strip():
        return {"sentiment": None, "emotions": [], "hate": None, "translation": None, "summary": None}

    # Upper bound on classifier input length. Without truncation an over-long
    # token sequence makes the model call fail, and non-Latin text can expand
    # to many tokens per character.
    # NOTE(review): 512 is the usual RoBERTa-base limit -- confirm against the
    # checkpoints' configs.
    max_tokens = 512

    res = {}

    # NOTE(review): sentiment and emotion are computed on the ORIGINAL text,
    # not on the translation. The checkpoint names suggest English Twitter
    # models -- confirm this is intended for non-English input.
    # Sentiment: keep only the top prediction.
    s = sentiment_pipe(text, truncation=True, max_length=max_tokens)[0]
    res["sentiment"] = {"label": s["label"], "score": float(s["score"])}

    # Emotions: the pipeline returns all label scores; keep the best three.
    emos = emotion_pipe(text, truncation=True, max_length=max_tokens)[0]
    top3 = sorted(emos, key=lambda x: x["score"], reverse=True)[:3]
    res["emotions"] = [{"label": e["label"], "score": float(e["score"])} for e in top3]

    # Hate vs. not hate (zero-shot). Best effort: a ValueError from the
    # pipeline is recorded as None instead of aborting the whole analysis.
    try:
        z = hate_pipe(text, candidate_labels=["hate", "not hate"])
        res["hate"] = {"label": z["labels"][0], "score": float(z["scores"][0])}
    except ValueError:
        res["hate"] = None

    # Translation: the summarizer always receives the translated text.
    if lang != "en":
        summary_input = translation_pipe(text, max_length=256)[0]["translation_text"]
    else:
        summary_input = text
    res["translation"] = summary_input

    # Summarization with a dynamic length budget: the summary may use at most
    # (input tokens - 2) tokens, clamped to the range [5, 60]. A plain
    # encode() yields the same count without building a tensor.
    n_tokens = len(summarization_pipe.tokenizer.encode(summary_input))
    max_len = max(5, min(60, n_tokens - 2))
    res["summary"] = summarization_pipe(summary_input, max_length=max_len, min_length=5)[0]["summary_text"]
    return res

# 3) Load the annotated tweets.
input_path = "tweets_griegos_anotados_finales.json"
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

lang = "el"  # language code handed to analyze_text ("el" = Greek)

# 4) Process in two chunks: indices 0-499 and 500-999.
#    Each chunk is written to its own file as soon as it is finished.
ranges = [(0, 500), (500, 1000)]
for start, end in ranges:
    subset = data[start:end]
    total = len(subset)
    if total == 0:
        # The dataset has no items in this index range. Skip it rather than
        # write an empty file with a name like "..._501_500.json".
        print(f"Chunk {start+1}-{end} vacío: no hay tweets que procesar.\n")
        continue
    # 1-based index of the last tweet actually present in this chunk, used
    # consistently in the progress messages and in the file name.
    last = start + total
    output_path = f"tweets_griegos_evaluados_{start+1}_{last}.json"

    for idx, item in enumerate(subset, start=1):
        print(f"Procesando chunk {start+1}-{last}, tweet {idx}/{total}")
        # Main tweet.
        item["analysis"] = analyze_text(item.get("tweet", ""), lang)
        # Every reply; `or []` also covers an explicit null "replies" value.
        for reply in item.get("replies") or []:
            reply["analysis"] = analyze_text(reply.get("reply", ""), lang)

    # Save this chunk.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(subset, f, ensure_ascii=False, indent=2)
    print(f"Chunk {start+1}-{last} completado. Salvo en {output_path}\n")


In [None]:
# Si te faltan dependencias, descomenta y ejecuta:
# !pip install torch transformers sentencepiece tiktoken protobuf

import json
from transformers import pipeline, logging

# Only errors from Transformers are logged; warnings are silenced.
logging.set_verbosity_error()

# Device index shared by every pipeline below: 0 is the first GPU, -1 the CPU.
# Use the GPU when CUDA is actually usable and fall back to the CPU otherwise,
# so the cell still runs on a machine without a GPU.
import torch

DEVICE = 0 if torch.cuda.is_available() else -1

# 1) Pipeline setup: every pipeline loads model and tokenizer from the same
#    checkpoint and runs on DEVICE.
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest",
    device=DEVICE
)
# return_all_scores=True asks for the score of every emotion label,
# not just the best one; analyze_text picks the top three itself.
emotion_pipe = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-emotion",
    tokenizer="cardiffnlp/twitter-roberta-base-emotion",
    return_all_scores=True,
    device=DEVICE
)
# Zero-shot classifier; analyze_text uses it with the labels "hate" / "not hate".
hate_pipe = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    tokenizer="joeddav/xlm-roberta-large-xnli",
    device=DEVICE
)
translation_pipe = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-mul-en",
    tokenizer="Helsinki-NLP/opus-mt-mul-en",
    device=DEVICE
)
summarization_pipe = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    tokenizer="sshleifer/distilbart-cnn-12-6",
    device=DEVICE
)

# 2) Función de análisis (igual que antes)
def analyze_text(text, lang):
    """Analyse one text with the sentiment, emotion, hate, translation and
    summarization pipelines and return the combined result.

    Args:
        text: Tweet or reply text. ``None``, empty and whitespace-only values
            return an all-empty result without calling any model.
        lang: Language code of ``text``. Any value other than ``"en"`` sends
            the text through ``translation_pipe``; ``"en"`` text is used as is.

    Returns:
        dict with the keys
        ``"sentiment"``   -- ``{"label", "score"}`` from ``sentiment_pipe``;
        ``"emotions"``    -- the three highest-scoring ``{"label", "score"}``
                             entries from ``emotion_pipe``, best first;
        ``"hate"``        -- ``{"label", "score"}`` of the top zero-shot label
                             among "hate" / "not hate", or ``None`` when the
                             zero-shot pipeline raises ``ValueError``;
        ``"translation"`` -- the translated text (the input itself for "en");
        ``"summary"``     -- summary of the translation.
        For empty input every value is ``None`` (``[]`` for ``"emotions"``).
    """
    text = text or ""
    if not text.strip():
        return {"sentiment": None, "emotions": [], "hate": None, "translation": None, "summary": None}

    # Upper bound on classifier input length. Without truncation an over-long
    # token sequence makes the model call fail.
    # NOTE(review): 512 is the usual RoBERTa-base limit -- confirm against the
    # checkpoints' configs.
    max_tokens = 512

    res = {}

    # NOTE(review): sentiment and emotion are computed on the ORIGINAL text,
    # not on the translation. The checkpoint names suggest English Twitter
    # models -- confirm this is intended for non-English input.
    # Sentiment: keep only the top prediction.
    s = sentiment_pipe(text, truncation=True, max_length=max_tokens)[0]
    res["sentiment"] = {"label": s["label"], "score": float(s["score"])}

    # Emotions: the pipeline returns all label scores; keep the best three.
    emos = emotion_pipe(text, truncation=True, max_length=max_tokens)[0]
    top3 = sorted(emos, key=lambda x: x["score"], reverse=True)[:3]
    res["emotions"] = [{"label": e["label"], "score": float(e["score"])} for e in top3]

    # Hate vs. not hate (zero-shot). Best effort: a ValueError from the
    # pipeline is recorded as None instead of aborting the whole analysis.
    try:
        z = hate_pipe(text, candidate_labels=["hate", "not hate"])
        res["hate"] = {"label": z["labels"][0], "score": float(z["scores"][0])}
    except ValueError:
        res["hate"] = None

    # Translation: the summarizer always receives the translated text.
    if lang != "en":
        summary_input = translation_pipe(text, max_length=256)[0]["translation_text"]
    else:
        summary_input = text
    res["translation"] = summary_input

    # Summarization with a dynamic length budget: the summary may use at most
    # (input tokens - 2) tokens, clamped to the range [5, 60]. A plain
    # encode() yields the same count without building a tensor.
    n_tokens = len(summarization_pipe.tokenizer.encode(summary_input))
    max_len = max(5, min(60, n_tokens - 2))
    res["summary"] = summarization_pipe(summary_input, max_length=max_len, min_length=5)[0]["summary_text"]
    return res

# 3) Load the annotated tweets.
input_path = "tweets_españoles_anotados_finales_id.json"
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

lang = "es"  # language code handed to analyze_text ("es" = Spanish)

# 4) Process in two chunks: indices 0-499 and 500-999.
#    Each chunk is written to its own file as soon as it is finished.
ranges = [(0, 500), (500, 1000)]
for start, end in ranges:
    subset = data[start:end]
    total = len(subset)
    if total == 0:
        # The dataset has no items in this index range. Skip it rather than
        # write an empty file with a name like "..._501_500.json".
        print(f"Chunk {start+1}-{end} vacío: no hay tweets que procesar.\n")
        continue
    # 1-based index of the last tweet actually present in this chunk, used
    # consistently in the progress messages and in the file name.
    last = start + total
    output_path = f"tweets_españoles_evaluados_{start+1}_{last}.json"

    for idx, item in enumerate(subset, start=1):
        print(f"Procesando chunk {start+1}-{last}, tweet {idx}/{total}")
        # Main tweet.
        item["analysis"] = analyze_text(item.get("tweet", ""), lang)
        # Every reply; `or []` also covers an explicit null "replies" value.
        for reply in item.get("replies") or []:
            reply["analysis"] = analyze_text(reply.get("reply", ""), lang)

    # Save this chunk.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(subset, f, ensure_ascii=False, indent=2)
    print(f"Chunk {start+1}-{last} completado. Salvo en {output_path}\n")


In [2]:
# Requiere stanza: ejecuta esto solo una vez si no lo tienes
# !pip install stanza

import json
import stanza

# Language code shared by the download and the pipeline ("es" = Spanish).
STANZA_LANG = "es"

# Fetch Stanza's default models for that language (only needed the first time).
stanza.download(STANZA_LANG)

# Build the Stanza pipeline with the tokenizer and sentiment processors,
# requesting GPU execution.
nlp = stanza.Pipeline(
    STANZA_LANG,
    processors="tokenize,sentiment",
    use_gpu=True,
)

# Función para analizar sentimiento con Stanza
def analyze_with_stanza(text):
    """Score the sentiment of ``text`` sentence by sentence with the Stanza
    pipeline ``nlp``.

    Args:
        text: Tweet or reply text. ``None``, empty and whitespace-only values
            are treated as "no content" and never reach the pipeline.

    Returns:
        ``{"sentiment": None}`` when there is no content. Otherwise a dict with
        ``"sentiment"`` -- the rounded mean of the per-sentence sentiment
                           values, or ``None`` if Stanza found no sentences;
        ``"sentences"`` -- one ``{"text", "sentiment"}`` dict per sentence.
    """
    # A stored null (None) must behave like an empty string instead of
    # crashing on .strip().
    text = text or ""
    if not text.strip():
        return {"sentiment": None}

    doc = nlp(text)
    sentiments = [{"text": s.text, "sentiment": s.sentiment} for s in doc.sentences]
    if sentiments:
        mean_sentiment = sum(s["sentiment"] for s in sentiments) / len(sentiments)
        # round() rounds halves to the nearest even integer, so a mean of
        # 0.5 becomes 0 while a mean of 1.5 becomes 2.
        overall_sentiment = round(mean_sentiment)
    else:
        overall_sentiment = None
    return {"sentiment": overall_sentiment, "sentences": sentiments}

# Input file: the Spanish tweets.
input_path = "spain/Completado/tweets_españoles_evaluados_completo.json"
# Load data.
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Process in blocks: indices 0-499 and 500-999.
ranges = [(0, 500), (500, 1000)]
for start, end in ranges:
    subset = data[start:end]
    total = len(subset)
    if total == 0:
        # The dataset has no items in this index range. Skip it rather than
        # write an empty file with a name like "..._501_500.json".
        print(f"Bloque {start+1}-{end} vacío: no hay tweets que procesar.")
        continue
    # 1-based index of the last tweet actually present in this block, used
    # consistently in the progress messages and in the file name.
    last = start + total
    # The input is the Spanish dataset, so the output is named after it; the
    # previous "tweets_griegos_stanza_*" name could overwrite Greek results.
    output_path = f"tweets_españoles_stanza_{start+1}_{last}.json"

    for idx, item in enumerate(subset, start=1):
        print(f"Procesando bloque {start+1}-{last}, tweet {idx}/{total}")
        # NOTE(review): the input file name suggests items may already carry
        # an "analysis" entry from an earlier step; the assignments below
        # replace it in the saved output -- confirm that is intended.
        # Main tweet.
        item["analysis"] = analyze_with_stanza(item.get("tweet", ""))
        # Replies; `or []` also covers an explicit null "replies" value.
        for reply in item.get("replies") or []:
            reply["analysis"] = analyze_with_stanza(reply.get("reply", ""))

    # Save results.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(subset, f, ensure_ascii=False, indent=2)
    print(f"✅ Bloque {start+1}-{last} guardado como {output_path}")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-01 11:51:04 INFO: Downloaded file to /home/jupyter-lquijano/stanza_resources/resources.json
2025-05-01 11:51:04 INFO: Downloading default packages for language: es (Spanish) ...
2025-05-01 11:51:05 INFO: File exists: /home/jupyter-lquijano/stanza_resources/es/default.zip
2025-05-01 11:51:09 INFO: Finished downloading models and saved to /home/jupyter-lquijano/stanza_resources
2025-05-01 11:51:09 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-01 11:51:09 INFO: Downloaded file to /home/jupyter-lquijano/stanza_resources/resources.json
2025-05-01 11:51:09 INFO: Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| sentiment | tass2020_charlm |

2025-05-01 11:51:09 INFO: Using device: cuda
2025-05-01 11:51:09 INFO: Loading: tokenize
2025-05-01 11:51:09 INFO: Loading: mwt
2025-05-01 11:51:09 INFO: Loading: sentiment
2025-05-01 11:51:11 INFO: Done loading processors!


Procesando bloque 1-500, tweet 1/500
Procesando bloque 1-500, tweet 2/500
Procesando bloque 1-500, tweet 3/500
Procesando bloque 1-500, tweet 4/500
Procesando bloque 1-500, tweet 5/500
Procesando bloque 1-500, tweet 6/500
Procesando bloque 1-500, tweet 7/500
Procesando bloque 1-500, tweet 8/500
Procesando bloque 1-500, tweet 9/500
Procesando bloque 1-500, tweet 10/500
Procesando bloque 1-500, tweet 11/500
Procesando bloque 1-500, tweet 12/500
Procesando bloque 1-500, tweet 13/500
Procesando bloque 1-500, tweet 14/500
Procesando bloque 1-500, tweet 15/500
Procesando bloque 1-500, tweet 16/500
Procesando bloque 1-500, tweet 17/500
Procesando bloque 1-500, tweet 18/500
Procesando bloque 1-500, tweet 19/500
Procesando bloque 1-500, tweet 20/500
Procesando bloque 1-500, tweet 21/500
Procesando bloque 1-500, tweet 22/500
Procesando bloque 1-500, tweet 23/500
Procesando bloque 1-500, tweet 24/500
Procesando bloque 1-500, tweet 25/500
Procesando bloque 1-500, tweet 26/500
Procesando bloque 1-5

In [2]:
!pip install numpy==1.26.4

Defaulting to user installation because normal site-packages is not writeable
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.4
    Uninstalling numpy-2.2.4:
      Successfully uninstalled numpy-2.2.4
Successfully installed numpy-1.26.4


In [None]:
pip install numpy==1.26.4
