# NER

In [None]:
!pip install -U -q spacy datasets hf_xet
!python -m spacy download en_core_web_trf

In [None]:
import pandas as pd
import spacy
import torch
from collections import Counter
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import ast

def load_spacy_model():
    """Load the ``en_core_web_trf`` spaCy pipeline.

    When PyTorch reports a CUDA device, GPU execution is requested from spaCy
    before the model is loaded.

    Returns:
        The loaded spaCy pipeline object.
    """
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        spacy.require_gpu()
    pipeline_model = spacy.load("en_core_web_trf")
    return pipeline_model


# Shared pipeline instance used by extract_entities_from_df.
nlp = load_spacy_model()

def extract_entities_from_df(df, text_column="text", batch_size=None):
    """Annotate ``df`` with the named entities found in each text.

    Adds (or overwrites) an ``"entities"`` column on ``df`` in place. Each
    cell holds the string form of a list of ``[entity_text, entity_label]``
    pairs, e.g. ``"[['Obama', 'PERSON']]"``, so it can be parsed back with
    ``ast.literal_eval``.

    Args:
        df: DataFrame containing the texts; it is modified in place.
        text_column: Name of the column holding the text to analyse.
        batch_size: Number of texts buffered per batch by ``nlp.pipe``;
            ``None`` keeps the pipeline's own default.

    Returns:
        The same ``df`` object, with the ``"entities"`` column set.
    """
    # Missing values become empty strings (yielding "[]") instead of crashing
    # the pipeline, which only accepts text.
    texts = ["" if pd.isna(value) else str(value) for value in df[text_column]]

    # nlp.pipe processes the texts as a batched stream, which avoids paying the
    # per-call overhead of nlp(text) for every single row.
    df["entities"] = [
        str([[ent.text, ent.label_] for ent in doc.ents])
        for doc in nlp.pipe(texts, batch_size=batch_size)
    ]
    return df
def get_top_entities(df, exclude_labels=None, top_n=10):
    """Return the most frequent (entity text, entity label) pairs in ``df``.

    Side effect (kept for backward compatibility): string cells of
    ``df["entities"]`` are parsed into Python lists in place. Cells that are
    already parsed are left untouched, so the function can safely be called
    more than once on the same frame.

    Args:
        df: DataFrame with an ``"entities"`` column whose cells are lists of
            ``[text, label]`` pairs or the string representation of such lists.
        exclude_labels: Entity labels to ignore; ``None`` excludes nothing.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with columns ``count``, ``text`` and ``label``, sorted by
        ``count`` in descending order and truncated to ``top_n`` rows. It is
        empty (with the same columns) when no entity is counted.
    """
    excluded = () if exclude_labels is None else exclude_labels

    # Only string cells need parsing: ast.literal_eval raises on a value that
    # has already been converted to a list by a previous call.
    df["entities"] = df["entities"].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

    # Count every (text, label) pair whose label is not excluded.
    pair_counts = Counter(
        (ent, label)
        for pairs in df["entities"]
        for ent, label in pairs
        if label not in excluded
    )

    # Building the rows directly keeps the column layout well defined even
    # when there is nothing to count.
    rows = [(count, ent, label) for (ent, label), count in pair_counts.items()]
    df_counts = pd.DataFrame(rows, columns=["count", "text", "label"])
    df_counts = df_counts.sort_values(by="count", ascending=False)

    return df_counts.head(top_n)

def filter_entities(entities_str):
    """Remove numeric and temporal entity types from a serialized entity list.

    Args:
        entities_str: String representation of a list of ``[text, label]``
            pairs, as stored in the ``entities`` column.

    Returns:
        The string representation of the list without ORDINAL, DATE,
        CARDINAL, MONEY, TIME and PERCENT entries. ``'[]'`` is returned for
        empty input and whenever the input cannot be parsed or filtered.
    """
    excluded_labels = ("ORDINAL", "DATE", "CARDINAL", "MONEY", "TIME", "PERCENT")
    nothing = '[]'

    if not entities_str or entities_str == nothing:
        return nothing

    try:
        kept = []
        # Parse the serialized list, then keep every pair whose label is allowed.
        for pair in ast.literal_eval(entities_str):
            if pair[1] in excluded_labels:
                continue
            kept.append(pair)
        return str(kept)
    except Exception:
        # Best effort: anything unparsable is treated as "no entities".
        return nothing

In [None]:
# Base name of the CSV file to analyse (also reused for the output file name)
# and the name of the column that holds the tweet text.
dataset_name = "ds_obama"
tweets_column = "Tweet-text"
# Load the semicolon-separated CSV; rows that cannot be parsed are skipped
# instead of aborting the load.
df = pd.read_csv(
    f"/content/drive/MyDrive/big_data/{dataset_name}.csv",
    sep=";",
    engine="python",
    quotechar='"',
    encoding="utf-8",
    on_bad_lines='skip'
)
df.head()

In [None]:
# Run NER over the tweet column. The helper writes the "entities" column onto
# `df` itself and returns that same frame.
df_with_entities = extract_entities_from_df(df, text_column=tweets_column)
# Discard numeric/temporal entity types from every serialized entity list.
df_with_entities["entities"] = df_with_entities["entities"].apply(filter_entities)

In [None]:
# Preview the first rows, including the filtered "entities" column.
df_with_entities.head()

In [None]:
# Show the 30 most frequent (entity, label) pairs.
# NOTE: get_top_entities also parses the "entities" column of
# df_with_entities from strings into lists, in place.
top_entities = get_top_entities(df_with_entities,top_n=30)
print(top_entities)

# Zero-Shot Classification

In [None]:
def classify_tweets(df, text_col, entities_col, candidate_labels, use_entities=True, batch_size=32):
    """
    Assign a topic to every tweet of a DataFrame with zero-shot classification.

    The DataFrame is converted to a Hugging Face ``Dataset`` and classified
    through ``Dataset.map`` in batched mode with ``facebook/bart-large-mnli``.

    Args:
        df (pd.DataFrame): DataFrame with the data.
        text_col (str): Name of the column holding the text.
        entities_col (str): Name of the column holding the entities. A cell
            may be a list of ``(text, label)`` pairs, a list of strings, or
            the string representation of such a list.
        candidate_labels (list): Candidate categories for the classification.
        use_entities (bool): If True, the entity names are appended to the text.
        batch_size (int): Number of rows handed to each ``Dataset.map`` call.

    Returns:
        pd.DataFrame: The input rows with two extra columns, ``topic`` (best
        scoring label) and ``confidence`` (its score).
    """

    # Convert the DataFrame into a Hugging Face Dataset.
    dataset = Dataset.from_pandas(df)

    # Use the first GPU when one is available, otherwise run on the CPU (-1)
    # instead of failing on machines without CUDA.
    device = 0 if torch.cuda.is_available() else -1
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

    def entities_to_string(entities):
        """Join the entity names of one row into a space-separated string."""
        if entities is None:
            return ""
        if isinstance(entities, str):
            # The NER step stores entities as the string form of a list.
            try:
                entities = ast.literal_eval(entities)
            except (ValueError, SyntaxError):
                return ""
        if not isinstance(entities, (list, tuple)):
            return ""
        names = []
        for item in entities:
            if isinstance(item, str):
                names.append(item)
            elif isinstance(item, (list, tuple)) and item:
                # (text, label) pair: pairs come back as lists, not tuples,
                # after parsing or a Dataset round trip.
                names.append(str(item[0]))
        return " ".join(names)

    # Function mapped over the dataset (batched).
    def classify_batch(batch):
        inputs = []
        for text, entities in zip(batch[text_col], batch[entities_col]):
            ent_str = entities_to_string(entities) if use_entities else ""
            inputs.append(f"{text} - {ent_str}" if ent_str.strip() else text)

        results = classifier(inputs, candidate_labels, multi_label=False)

        # A single input yields a dict rather than a list of dicts.
        if isinstance(results, dict):
            results = [results]

        return {
            "topic": [r["labels"][0] for r in results],
            "confidence": [r["scores"][0] for r in results],
        }

    # Apply the function with map in batched mode.
    dataset = dataset.map(classify_batch, batched=True, batch_size=batch_size)

    # Convert back to a pandas DataFrame.
    return dataset.to_pandas()

In [None]:
# Topics the zero-shot classifier chooses from.
candidate_labels = ["politics", "family", "USA", "climate change", "health", "business", "finance"]
# Classify every tweet; the result carries the extra "topic" and "confidence" columns.
df_ent_topics = classify_tweets(
    df_with_entities,
    text_col=tweets_column,
    entities_col="entities",
    candidate_labels=candidate_labels,
    use_entities=True,
    batch_size=32)

# Per-topic confidence threshold (median, capped at 0.6)

In [None]:
# Per-topic threshold: the median confidence of the topic, capped at 0.6.
mediane_topic = df_ent_topics.groupby('topic')['confidence'].median()
mediane_topic = mediane_topic.clip(upper=0.6)

mediane_topic_dict = mediane_topic.to_dict()
# Look up each row's threshold in one vectorised step instead of a row-wise
# apply; rows whose topic has no threshold fall back to 1, as before.
row_thresholds = df_ent_topics['topic'].map(mediane_topic_dict).fillna(1)
df_filtrato = df_ent_topics[df_ent_topics['confidence'] >= row_thresholds]

In [None]:
# Preview the rows that passed the per-topic confidence threshold.
df_filtrato.head()

# Sentiment Analysis

In [None]:
def analyze_sentiment(df, text_col, topic_col=None, entities_col=None, use_topic=True, use_entities=True, batch_size=16):
    """
    Run sentiment analysis over a DataFrame with a Hugging Face pipeline.

    Each text can optionally be enriched with its topic and/or its entity
    names before being classified by
    ``cardiffnlp/twitter-roberta-base-sentiment``.

    Args:
        df (pd.DataFrame): DataFrame containing the tweets.
        text_col (str): Name of the column holding the text.
        topic_col (str): Name of the column holding the topic (optional).
        entities_col (str): Name of the column holding the entities
            (optional). A cell may be a list of ``(text, label)`` pairs, a
            list of strings, or the string representation of such a list.
        use_topic (bool): If True, the topic is appended to the text.
        use_entities (bool): If True, the entity names are appended to the text.
        batch_size (int): Number of rows handed to each ``Dataset.map`` call.

    Returns:
        pd.DataFrame: The input rows plus ``sentiment`` (lower-cased label)
        and ``sentiment_confidence`` (its score).
    """
    # Model and tokenizer setup.
    model_name = "cardiffnlp/twitter-roberta-base-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Use the first GPU when one is available, otherwise run on the CPU (-1)
    # instead of failing on machines without CUDA.
    device = 0 if torch.cuda.is_available() else -1
    classifier = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        truncation=True,
        padding=True,
        max_length=512,
        device=device
    )

    # Conversion into a Hugging Face Dataset.
    dataset = Dataset.from_pandas(df)

    # Readable names for the generic labels emitted by the model.
    label_map = {
        "LABEL_0": "negative",
        "LABEL_1": "neutral",
        "LABEL_2": "positive"
    }

    def entities_to_string(entities):
        """Join the entity names of one row into a space-separated string."""
        if entities is None:
            return ""
        if isinstance(entities, str):
            # The NER step stores entities as the string form of a list.
            try:
                entities = ast.literal_eval(entities)
            except (ValueError, SyntaxError):
                return ""
        if not isinstance(entities, (list, tuple)):
            return ""
        names = []
        for item in entities:
            if isinstance(item, str):
                names.append(item)
            elif isinstance(item, (list, tuple)) and item:
                # (text, label) pair: pairs come back as lists, not tuples,
                # after parsing or a Dataset round trip.
                names.append(str(item[0]))
        return " ".join(names)

    # Build the enriched input for one row.
    def build_input(text, topic=None, entities=None):
        parts = [text]
        if use_topic and topic:
            parts.append(f"[Topic: {topic}]")
        if use_entities:
            ent_str = entities_to_string(entities)
            # Skip the tag entirely when there is no entity name to show.
            if ent_str:
                parts.append(f"[Entities: {ent_str}]")
        return " ".join(parts)

    # Function mapped over the dataset (batched).
    def sentiment_batch(batch):
        texts = batch[text_col]
        # Placeholder values for optional columns that are not in the batch.
        no_values = [None] * len(texts)
        topics = batch.get(topic_col, no_values)
        entities = batch.get(entities_col, no_values)

        enriched = [
            build_input(text, topic, ents)
            for text, topic, ents in zip(texts, topics, entities)
        ]

        results = classifier(enriched)
        return {
            "sentiment": [label_map.get(r["label"], r["label"]).lower() for r in results],
            "sentiment_confidence": [r["score"] for r in results]
        }

    dataset = dataset.map(sentiment_batch, batched=True, batch_size=batch_size)
    return dataset.to_pandas()


In [None]:
# Sentiment over the thresholded tweets, enriched with entity names only.
df_final = analyze_sentiment(
    df=df_filtrato,
    text_col=tweets_column,
    topic_col="topic",
    entities_col="entities",
    use_topic=False,
    use_entities=True,
    batch_size=32
)
# The Dataset round trip carries the original row index along as
# "__index_level_0__" only when the input frame has a non-default index;
# errors="ignore" keeps this cell from failing when that column is absent.
df_final = df_final.drop(columns=["__index_level_0__"], errors="ignore")
df_final.to_csv(f"/content/drive/MyDrive/big_data/{dataset_name}_final.csv",index=False)
df_final.head()

In [None]:
# Preview the final frame (topic, confidence and sentiment columns included).
df_final.head()

# Prompting

In [None]:
!pip install -U -q llama-cpp-python faiss-cpu sentence-transformers
!mkdir -p /content/models
!wget -O /content/models/phi-2.q4.gguf https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q4_K_M.gguf

In [None]:
import pandas as pd
from llama_cpp import Llama
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch


# Reference corpus: the tweets of the known author. Rows that cannot be
# parsed are skipped instead of aborting the load.
df_author = pd.read_csv(
    "/content/drive/MyDrive/big_data/ds_obama.csv",
    sep=";",
    engine="python",
    quotechar='"',
    encoding="utf-8",
    on_bad_lines="skip",
)

# Embed every tweet of the corpus once with the sentence encoder.
encoder = SentenceTransformer("all-MiniLM-L6-v2")
corpus_embeddings = encoder.encode(
    df_author["Tweet-text"].tolist(),
    convert_to_numpy=True,
)

# Flat L2 index over the embeddings; row i of the index matches row i of df_author.
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)

def retrieve_similar_tweets(df, query, k=5):
    """Return the tweets of ``df`` that are closest to ``query``.

    Uses the module-level ``encoder`` to embed the query and the module-level
    FAISS ``index`` to search. Row positions returned by the index are looked
    up positionally in ``df``, so ``df`` is expected to be the frame the index
    was built from.

    Args:
        df: DataFrame with a ``"Tweet-text"`` column.
        query: Text to search neighbours for.
        k: Number of neighbours requested; capped at the index size.

    Returns:
        Tuple ``(texts, distances)``: the list of retrieved tweet texts and
        the array of their distances as reported by the index.
    """
    # When fewer than k vectors are indexed, FAISS pads the result with -1,
    # which iloc would silently resolve to the last row of df.
    k = min(k, index.ntotal)
    if k <= 0:
        return [], np.empty(0, dtype=np.float32)

    query_embedding = encoder.encode([query])
    distances, indices = index.search(query_embedding, k)
    return df.iloc[indices[0]]["Tweet-text"].tolist(), distances[0]

def create_prompt(df, test_tweet, n_context=3):
    """Build the YES/NO authorship prompt for ``test_tweet``.

    Args:
        df: DataFrame of the author's tweets, forwarded to ``get_best_tweets``.
        test_tweet: Tweet whose authorship is being questioned.
        n_context: Number of author tweets to include as context.

    Returns:
        The prompt text: the retrieved tweets as a quoted bullet list,
        followed by the test tweet and the question.
    """
    retrieved = get_best_tweets(df, test_tweet, n_context)
    bullet_lines = [f'- "{tweet}"' for tweet in retrieved]

    # One entry per prompt line; empty strings are the blank separator lines.
    prompt_lines = [
        "I will provide you with a list of tweets written by the same person.",
        "",
        "Author's tweets:",
        "\n".join(bullet_lines),
        "",
        "Now, consider this new tweet:",
        "",
        f'"{test_tweet}"',
        "Question: Could this tweet have been written by the same person?",
        "Answer only with YES or NO.",
    ]
    return "\n".join(prompt_lines)

def get_best_tweets(df, tweet, k=5):
    """Return the ``k`` tweets of ``df`` most similar to ``tweet``.

    The retrieved texts and their distances are also printed for inspection.

    Args:
        df: DataFrame of candidate tweets, forwarded to ``retrieve_similar_tweets``.
        tweet: Query tweet.
        k: Number of tweets to retrieve.

    Returns:
        List of the retrieved tweet texts.
    """
    retrieved_texts, distances = retrieve_similar_tweets(df, tweet, k=k)
    print(retrieved_texts, distances)
    return retrieved_texts

In [None]:
# Load the GGUF model file from /content/models with n_ctx set to 2048.
llm = Llama(model_path="/content/models/phi-2.q4.gguf", n_ctx=2048)

In [None]:
# Tweet whose authorship is being questioned.
test_tweet = """Kobe was a legend on the court and just getting started in what would have been just as meaningful a second act. To lose Gianna is even more heartbreaking to us as parents. Michelle and I send love and prayers to Vanessa and the entire Bryant family on an unthinkable day."""
# Build the prompt with the 10 most similar tweets of the author as context.
prompt = create_prompt(df_author, test_tweet, n_context=10)

# Generate at most 100 tokens; echo=False is passed so the prompt is not repeated in the output.
response = llm(prompt, max_tokens=100, echo=False)
print("Prompt:\n", prompt)
print("\nRisposta:", response['choices'][0]['text'].strip())