In [10]:
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import torch
import torch.nn.functional as F
import pickle
from transformers import logging
logging.set_verbosity_error()

In [2]:
def combine_text(row):
    """Join a row's content, hashtags and mentions into one string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by
            'content', 'hashtags' and 'mentions'.

    Returns:
        The three values separated by single spaces. A value for which
        ``pd.notna`` is false is replaced by an empty string, so the result
        can contain leading, trailing or doubled spaces.
    """
    pieces = []
    for column in ('content', 'hashtags', 'mentions'):
        value = row[column]
        pieces.append(value if pd.notna(value) else '')
    return "{} {} {}".format(*pieces)

In [3]:

# Load the tweet CSV; the rest of this cell reads its 'content', 'hashtags'
# and 'mentions' columns.
tweets_df = pd.read_csv("realdonaldtrump.csv")

def combine_text(row):
    """Concatenate 'content', 'hashtags' and 'mentions' of one row.

    NOTE(review): an identical definition of this function already exists in
    an earlier cell of this notebook; this one silently shadows it.

    Args:
        row: Mapping-like row indexable by the three column names.

    Returns:
        ``"<content> <hashtags> <mentions>"`` where any value that fails
        ``pd.notna`` contributes an empty string.
    """
    def _or_blank(key):
        # Missing cells (None / NaN) must not show up as "nan" in the text.
        cell = row[key]
        return cell if pd.notna(cell) else ''

    return f"{_or_blank('content')} {_or_blank('hashtags')} {_or_blank('mentions')}"

# Row-wise apply: one combined string (content + hashtags + mentions) per tweet.
tweets_df["text_full"] = tweets_df.apply(combine_text, axis=1)

def clean_tweet_for_vader(text):
    """Strip URLs and the '#' / '@' markers from a tweet, then tidy whitespace.

    Args:
        text: Tweet text as a string.

    Returns:
        The text with every ``http...`` and ``www. ...`` token removed, every
        '#' and '@' character deleted (the word after it is kept), runs of
        whitespace collapsed to a single space, and both ends stripped.
    """
    # The dot after "www" is escaped on purpose: an unescaped "www." also
    # matches elongated words such as "awwwww" and would delete them.
    text = re.sub(r"http\S+|www\.\S+", "", text)
    # Drop only the marker characters so hashtag/mention words stay in the text.
    text = re.sub(r"[#@]", "", text)
    return re.sub(r"\s+", " ", text).strip()


tweets_df["text_clean"] = tweets_df["text_full"].apply(clean_tweet_for_vader)

# VADER sentiment analyzer; polarity_scores is applied to each cleaned tweet
# and its result is stored as-is in the 'vader_scores' column.
analyzer = SentimentIntensityAnalyzer()
tweets_df["vader_scores"] = tweets_df["text_clean"].apply(analyzer.polarity_scores)

# Expand the per-row score dicts into columns with a single DataFrame
# construction; `.apply(pd.Series)` would build one Series object per row.
# Passing the original index keeps the new columns aligned for the concat.
tweets_df = pd.concat(
    [tweets_df, pd.DataFrame(tweets_df["vader_scores"].tolist(), index=tweets_df.index)],
    axis=1,
)


def classify_sentiment(score):
    """Turn a numeric score into a three-way sentiment label.

    Args:
        score: Number to classify (the caller passes the 'compound' column).

    Returns:
        "positive" if ``score >= 0.05``, "negative" if ``score <= -0.05``,
        otherwise "neutral". A NaN score fails both comparisons and is
        therefore labelled "neutral".
    """
    label = "neutral"
    if score >= 0.05:
        label = "positive"
    elif score <= -0.05:
        label = "negative"
    return label

# Label each tweet from the 'compound' column (one of the columns expanded
# from the VADER score dicts above) using classify_sentiment's thresholds.
tweets_df["sentiment_label"] = tweets_df["compound"].apply(classify_sentiment)

def sentiment_to_number(sentiment):
    """Encode a sentiment label as an integer.

    Args:
        sentiment: Label to encode.

    Returns:
        0 for 'neutral', 1 for 'positive', and 2 for anything else
        (including 'negative' and unrecognised values).
    """
    if sentiment == 'neutral':
        return 0
    return 1 if sentiment == 'positive' else 2
    
# Drop the VADER frame: every column computed above is discarded with it.
# The cells visible further down load the CSV again into their own frame.
del tweets_df

In [3]:
def preprocess_tweet_bert(text):
    """Normalise a tweet with placeholder tokens before tokenization.

    The rules run in a fixed order:
      1. every ``http...`` token becomes the literal ``http``;
      2. every ``pic.twitter.com/...`` token becomes ``<IMG>``;
      3. whitespace runs collapse to one space and the ends are stripped;
      4. every ``@...`` token becomes ``@user``.

    Because rule 1 runs first, a picture link written with an ``http``
    prefix ends up as ``http`` rather than ``<IMG>``.

    Args:
        text: Tweet text as a string.

    Returns:
        The normalised string.
    """
    rules = (
        (r"http\S+", "http"),
        (r"pic\.twitter\.com/\S+", "<IMG>"),
        (r"\s+", " "),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    text = text.strip()
    return re.sub(r"@\S+", "@user", text).strip()

In [4]:
def tokenize_and_save_stream(data, tokenizer_type, batch_size=32, file_path="tokenized_batches.pkl", max_length=32):
    """Tokenize ``data`` batch by batch and stream the batches to a pickle file.

    The file is overwritten, then one pickled dict per batch is written back
    to back, so it must be read with repeated ``pickle.load`` calls until
    ``EOFError``.

    Args:
        data: Sliceable sequence of texts (the caller passes a list of str).
        tokenizer_type: Callable invoked as
            ``tokenizer_type(batch, padding=True, truncation=True,
            max_length=max_length, return_tensors="pt")``. It must return a
            mapping whose values expose ``.tolist()``.
        batch_size: Number of texts per batch; must be positive.
        file_path: Destination file. Existing content is discarded.
        max_length: Token limit forwarded to the tokenizer together with
            ``truncation=True``. The default of 32 is the value that was
            previously hard-coded.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # Validate before opening so a bad argument cannot wipe an existing file.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Open once for the whole run instead of reopening in append mode for
    # every batch; "wb" also takes care of truncating any previous content.
    with open(file_path, "wb") as f:
        for start in range(0, len(data), batch_size):
            batch = data[start:start + batch_size]
            encoded_inputs = tokenizer_type(
                batch,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            # Store plain nested lists so each record is a dict of lists.
            pickle.dump({k: v.tolist() for k, v in encoded_inputs.items()}, f)
    # Runtime message (Polish): "Tokens saved to file <file_path>".
    print(f"Tokeny zapisane do pliku {file_path}")
    

In [8]:
def load_tokenized_batches_stream(model_, device, file_path="tokenized_batches.pkl"):
    """Run a model over every pickled batch in ``file_path``.

    The file is expected to hold consecutive pickled dicts (one per batch)
    whose values can be passed to ``torch.tensor``.

    Args:
        model_: Object with ``.eval()`` that is callable with the batch dict
            as keyword arguments and returns an object with a ``.logits``
            tensor. ``dim=1`` is treated as the class dimension.
        device: Device passed to ``Tensor.to`` for each batch tensor.
        file_path: Pickle stream to read.

    Returns:
        A list with one ``(predicted_classes, probabilities)`` tuple per
        batch, both as NumPy arrays: the argmax class ids and the softmax of
        the logits over ``dim=1``.
    """
    model_.eval()
    results = []
    with open(file_path, "rb") as f:
        while True:
            # Only the read itself may signal end-of-stream. Keeping inference
            # outside this try means an EOFError raised anywhere else
            # propagates instead of silently cutting the results short.
            try:
                # NOTE(security): pickle.load can execute arbitrary code from
                # a crafted file; only read files produced by this notebook.
                batch_dict = pickle.load(f)
            except EOFError:
                break

            batch_tensors = {k: torch.tensor(v).to(device) for k, v in batch_dict.items()}
            with torch.no_grad():
                outputs = model_(**batch_tensors)

            probs = F.softmax(outputs.logits, dim=1)
            predicted_classes = torch.argmax(probs, dim=1)

            # Move to CPU before converting; .numpy() needs host memory.
            results.append((predicted_classes.cpu().numpy(), probs.cpu().numpy()))
    return results

In [11]:
# Load the data
data_bert_df = pd.read_csv("realdonaldtrump.csv")

# Preprocessing: combine the text columns, then normalise URLs / mentions
data_bert_df['combined_tweet'] = data_bert_df.apply(combine_text, axis=1)
data_bert_df['combined_tweet_cleared'] = data_bert_df['combined_tweet'].apply(preprocess_tweet_bert)

# Load the model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Prefer Apple's MPS backend (the device this notebook was written for), but
# fall back to CUDA or CPU so the cell also runs on machines without MPS.
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# Kept as the last expression so the cell still displays the model summary.
model.to(DEVICE)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [12]:
# Tokenize the preprocessed tweets and stream the batches to disk, using the
# function's defaults for batch size and output file.
texts = data_bert_df['combined_tweet_cleared'].tolist()
tokenize_and_save_stream(texts, tokenizer)

Tokeny zapisane do pliku tokenized_batches.pkl


In [13]:
# Run inference on whichever device the model already lives on, so this cell
# cannot disagree with the cell that placed the model (no second hard-coded
# device string).
batch_results = load_tokenized_batches_stream(model, device=next(model.parameters()).device)

# Merge the per-batch results into flat lists
sentiment_classes = []
sentiment_probs = []
for batch_class, batch_prob in batch_results:
    sentiment_classes.extend(batch_class)
    sentiment_probs.extend(batch_prob.tolist())

# Put the results into a DataFrame.
# Note: 'combined_tweet' holds the preprocessed texts that were tokenized.
result_df = pd.DataFrame({
    'combined_tweet': texts,
    'sentiment_class': sentiment_classes,
    'sentiment_probabilities': sentiment_probs
})

# Map class ids to labels using the model's own id -> label table rather than
# a hand-written copy of it.
result_df['sentiment_label'] = result_df['sentiment_class'].map(model.config.id2label)

# Display the results
result_df[['combined_tweet', 'sentiment_label', 'sentiment_probabilities']]


Unnamed: 0,combined_tweet,sentiment_label,sentiment_probabilities
0,Be sure to tune in and watch Donald Trump on L...,neutral,"[0.00428333505988121, 0.6332980990409851, 0.36..."
1,Donald Trump will be appearing on The View tom...,positive,"[0.003935575485229492, 0.4739065170288086, 0.5..."
2,Donald Trump reads Top Ten Financial Tips on L...,positive,"[0.004720405209809542, 0.12900030612945557, 0...."
3,New Blog Post: Celebrity Apprentice Finale and...,neutral,"[0.004224149510264397, 0.7814801335334778, 0.2..."
4,"""My persona will never be that of a wallflower...",neutral,"[0.23554037511348724, 0.5717381834983826, 0.19..."
...,...,...,...
43347,Joe Biden was a TOTAL FAILURE in Government. H...,negative,"[0.9519144296646118, 0.04053443297743797, 0.00..."
43348,Will be interviewed on @ seanhannity tonight a...,positive,"[0.0016347389901056886, 0.07318022102117538, 0..."
43349,<IMG>,neutral,"[0.0871957391500473, 0.8049344420433044, 0.107..."
43350,<IMG>,neutral,"[0.0871957391500473, 0.8049344420433044, 0.107..."


/opt/homebrew/opt/python@3.11/bin/python3.11
