In [4]:
import re
import pandas as pd
import spacy
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Fetch the VADER lexicon before the analyzer below is constructed.
nltk.download("vader_lexicon")

# Config: input/output CSV paths and the spaCy model name used further down.
INPUT_CSV = "../dados/tweets_ia_personality.csv"  # CSV you already saved
OUTPUT_CSV = "../dados/tweets_ia_sentimento.csv"
SPACY_MODEL = "en_core_web_sm"

# Single shared analyzer instance; analisar_sentimento reads it as a global.
sia = SentimentIntensityAnalyzer()

# AI-related terms plus personality/perception vocabulary.
ia_terms = [
    "ai",
    "artificial intelligence",
    "inteligencia artificial",
    "inteligência artificial",
    "chatgpt",
    "gpt",
    "llm",
    "language model",
    "machine learning",
    "personality",
    "sentient",
    "consciousness",
    "emotions",
    "trust",
    "fear",
    "friendly",
    "dangerous",
]

# One case-insensitive alternation of every (escaped) term, anchored on word
# boundaries so that short terms such as "ai" do not match inside other words.
pattern_ia = re.compile(
    r"\b(?:" + "|".join(map(re.escape, ia_terms)) + r")\b",
    re.IGNORECASE,
)

# Sentiment classification
def analisar_sentimento(texto: str) -> str:
    """Classify *texto* from the analyzer's compound polarity score.

    Returns "positivo" when the compound score is at least 0.05,
    "negativo" when it is at most -0.05, and "neutro" otherwise.
    """
    polaridade = sia.polarity_scores(texto)["compound"]
    if polaridade >= 0.05:
        return "positivo"
    if polaridade <= -0.05:
        return "negativo"
    return "neutro"

def clean_text(text: str) -> str:
    """Normalize a tweet for matching and parsing.

    Non-string input yields "". Otherwise newlines become spaces, URLs
    (``http...`` and ``www. ...``) are blanked out, every character outside
    ASCII letters/digits, Latin-1 accented letters, basic punctuation and
    whitespace is replaced by a space, and whitespace runs are collapsed.
    """
    if not isinstance(text, str):
        return ""

    # Applied in order; each match is replaced by a single space.
    blanked_patterns = (
        r"http\S+",           # URLs
        r"www\.\S+",
        r"[^A-Za-zÀ-ÖØ-öø-ÿ0-9\?\!\.,;:\-\'\"\s]",
    )

    cleaned = text.replace("\n", " ").strip()
    for blanked in blanked_patterns:
        cleaned = re.sub(blanked, " ", cleaned)

    # Collapse the gaps left behind by the substitutions above.
    return re.sub(r"\s+", " ", cleaned).strip()

def mentions_ia(text: str) -> bool:
    """Return True when *text* contains at least one term from pattern_ia."""
    return pattern_ia.search(text) is not None

# Load the spaCy pipeline. NER is disabled: only POS tags, the dependency
# parse and noun chunks are read below.
nlp = spacy.load(SPACY_MODEL, disable=["ner"])
nlp.max_length = 2000000

# Dependency labels that mark a subject. spaCy's English models emit
# "nsubjpass"; "nsubj:pass" is the UD spelling and is kept so the script
# also works with UD-style pipelines.
_SUBJECT_DEPS = ("nsubj", "nsubjpass", "nsubj:pass")
_NOMINAL_POS = ("NOUN", "PROPN")

# Column order of the output CSV; also guarantees a header row when no tweet
# survives the filter.
OUTPUT_COLUMNS = [
    "tweet", "date", "likes", "retweets", "cleaned",
    "tokens_pos", "noun_chunks", "adj_noun_pairs", "sentimento",
]


def _extract_adj_noun_pairs(doc):
    """Return (noun, adjective) pairs found in a parsed spaCy ``doc``.

    Two constructions are recognized:
    * attributive: the adjective is an ``amod`` of a noun ("dangerous AI");
    * predicative: the adjective describes a noun subject ("AI is dangerous").
      In UD-style parses the subject hangs off the adjective itself; in
      spaCy's English scheme the adjective is an ``acomp`` of the copula and
      the subject is a child of that copula, so both places are searched.
    """
    pairs = []
    for tok in doc:
        if tok.pos_ != "ADJ":
            continue
        if tok.dep_ == "amod" and tok.head.pos_ in _NOMINAL_POS:
            pairs.append((tok.head.text, tok.text))
            continue
        candidates = list(tok.children)
        if tok.dep_ == "acomp":
            # Subject is attached to the copular verb, not to the adjective.
            candidates.extend(tok.head.children)
        for child in candidates:
            if child.dep_ in _SUBJECT_DEPS and child.pos_ in _NOMINAL_POS:
                pairs.append((child.text, tok.text))
    return pairs


# Read the CSV and process it
df = pd.read_csv(INPUT_CSV, encoding="utf-8")

# Missing tweets map to "" instead of being stringified to the literal "nan".
df["cleaned"] = ["" if pd.isna(t) else clean_text(str(t)) for t in df["tweet"]]
df["mentions_ia"] = df["cleaned"].apply(mentions_ia)

# Keep only tweets that mention AI
df_filtered = df[df["mentions_ia"]].reset_index(drop=True)
print(f"Total: {len(df)} — com IA: {len(df_filtered)}")

# NLP processing
rows_out = []

# nlp.pipe yields docs in input order, so they line up with iterrows().
for doc, (_, row) in zip(nlp.pipe(df_filtered["cleaned"], batch_size=50), df_filtered.iterrows()):
    # Tokens with POS tag and dependency label
    tokens_info = [(t.text, t.pos_, t.dep_) for t in doc]

    # Noun chunks (shallow parsing)
    noun_chunks = [chunk.text for chunk in doc.noun_chunks]

    # Noun-adjective pairs (dependency parsing)
    adj_noun_pairs = _extract_adj_noun_pairs(doc)

    # NOTE(review): sentiment is scored on the cleaned text, from which
    # clean_text has already stripped emoji and most symbols — confirm that
    # this is intended rather than scoring the raw tweet.
    rows_out.append({
        "tweet": row["tweet"],
        "date": row["date"],
        "likes": row["likes"],
        "retweets": row["retweets"],
        "cleaned": row["cleaned"],
        "tokens_pos": str(tokens_info),
        "noun_chunks": "; ".join(noun_chunks),
        "adj_noun_pairs": str(adj_noun_pairs),
        "sentimento": analisar_sentimento(row["cleaned"])
    })

# Save the result
df_out = pd.DataFrame(rows_out, columns=OUTPUT_COLUMNS)
df_out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"Processamento salvo em {OUTPUT_CSV}")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\itsbe\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Total: 50 — com IA: 50
Processamento salvo em ../dados/tweets_ia_sentimento.csv
