In [1]:
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
from transformers import logging
from torchinfo import summary
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from torch import nn, optim
from sklearn.metrics import classification_report, confusion_matrix
import random
# Restrict Hugging Face transformers logging to error-level messages.
logging.set_verbosity_error()

In [2]:
def combine_text(row):
    """Join a row's content, hashtags and mentions into one string.

    Args:
        row: Object indexable by the keys 'content', 'hashtags' and
            'mentions' (e.g. a DataFrame row passed by ``apply(axis=1)``).

    Returns:
        The three fields separated by single spaces. A missing value
        (as judged by ``pd.notna``) contributes an empty string, so the
        separators are still present.
    """
    pieces = []
    for column in ('content', 'hashtags', 'mentions'):
        value = row[column]
        pieces.append(value if pd.notna(value) else '')
    return "{} {} {}".format(*pieces)

In [8]:
# Read the raw tweets and add a `text_full` column that merges content,
# hashtags and mentions for every row.
tweets_df = pd.read_csv("realdonaldtrump.csv").assign(
    text_full=lambda frame: frame.apply(combine_text, axis=1)
)

def clean_tweet_for_vader(text):
    """Strip URLs and the '#' / '@' markers from a tweet before VADER scoring.

    Args:
        text: Raw tweet text.

    Returns:
        The text with ``http...`` and ``www. ...`` links removed, '#' and '@'
        characters dropped (the tag / handle words themselves are kept), and
        runs of whitespace collapsed to single spaces.
    """
    # The dot after "www" must be escaped: an unescaped "." matches any
    # character, so ordinary words such as "awww yeah" were deleted as if
    # they were links.
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"@", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


tweets_df["text_clean"] = tweets_df["text_full"].apply(clean_tweet_for_vader)

analyzer = SentimentIntensityAnalyzer()  # VADER sentiment model
tweets_df["vader_scores"] = tweets_df["text_clean"].apply(analyzer.polarity_scores)

# Expand the per-tweet score dicts into one column per key. Building the
# frame once from the list of dicts avoids constructing a pandas Series for
# every row, which is what `.apply(pd.Series)` does.
vader_columns = pd.DataFrame(tweets_df["vader_scores"].tolist(), index=tweets_df.index)
tweets_df = pd.concat([tweets_df, vader_columns], axis=1)


def classify_sentiment(score):
    """Turn a numeric sentiment score into a three-way label.

    Args:
        score: Numeric score (this notebook passes the `compound` column).

    Returns:
        "positive" when ``score >= 0.05``, "negative" when
        ``score <= -0.05``, otherwise "neutral".
    """
    threshold = 0.05
    if score >= threshold:
        return "positive"
    return "negative" if score <= -threshold else "neutral"

# Label every tweet from its `compound` score.
tweets_df["sentiment_label"] = tweets_df["compound"].map(classify_sentiment)

def sentiment_to_number(sentiment):
    """Encode a sentiment label as an integer.

    Args:
        sentiment: Label to encode.

    Returns:
        0 for 'neutral', 1 for 'positive', and 2 for any other value.
    """
    for code, label in enumerate(('neutral', 'positive')):
        if sentiment == label:
            return code
    return 2
    
# Release the VADER-scored frame; no later cell visible in this notebook reads `tweets_df`.
del tweets_df

In [3]:
def preprocess_tweet_bert(text):
    """Normalise a tweet into the placeholder form fed to the RoBERTa model.

    The substitutions run in a fixed order:
      1. anything starting with ``http`` becomes the token ``http``;
      2. ``pic.twitter.com/...`` links become ``<IMG>``;
      3. whitespace runs collapse to one space (then the text is stripped);
      4. ``@handle`` becomes ``@user`` (then the text is stripped);
      5. ``@`` followed by spaces and a word also becomes ``@user``.

    Args:
        text: Raw tweet text.

    Returns:
        The normalised text.
    """
    # (pattern, replacement, strip the result afterwards?)
    rules = (
        (r"http\S+", "http", False),
        (r"pic\.twitter\.com/\S+", "<IMG>", False),
        (r"\s+", " ", True),
        (r"@\S+", "@user", True),
        (r'@ +\w+', '@user', False),
    )
    for pattern, replacement, strip_after in rules:
        text = re.sub(pattern, replacement, text)
        if strip_after:
            text = text.strip()
    return text

In [4]:
def tokenize_and_save_stream(data, tokenizer_type, batch_size=32, file_path="tokenized_batches.pkl", max_length=32):
    """Tokenize `data` in batches and stream the batches into a pickle file.

    Each batch is tokenized, converted to plain Python lists and written as
    one pickle record, so the file holds a sequence of independently
    loadable dicts. Any existing file at `file_path` is overwritten.

    Args:
        data: Sliceable sequence of texts.
        tokenizer_type: Callable invoked as
            ``tokenizer_type(batch, padding=True, truncation=True,
            max_length=..., return_tensors="pt")``; it must return a mapping
            whose values provide ``.tolist()``.
        batch_size: Number of texts per record.
        file_path: Destination pickle file.
        max_length: Truncation length forwarded to the tokenizer.
    """
    # Open the file once for the whole run instead of truncating it and then
    # reopening it in append mode for every batch.
    with open(file_path, "wb") as f:
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]
            encoded_inputs = tokenizer_type(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            # Plain lists keep the pickle independent of the tensor library.
            batch_dict = {k: v.tolist() for k, v in encoded_inputs.items()}
            pickle.dump(batch_dict, f)
    print(f"Tokeny zapisane do pliku {file_path}")
    

In [5]:
def load_tokenized_batches_stream(model_, device, file_path="tokenized_batches.pkl"):
    """Run `model_` over every pickled batch stored in `file_path`.

    The file is read as a sequence of pickle records, each a dict that is
    turned into tensors on `device` and passed to the model as keyword
    arguments.

    Args:
        model_: Model providing ``eval()`` and returning an object with a
            ``logits`` attribute when called.
        device: Device the input tensors are moved to.
        file_path: Pickle file holding the batch records.

    Returns:
        A list with one ``(predicted_classes, probabilities)`` tuple of
        NumPy arrays per batch, in file order.
    """
    model_.eval()
    results = []
    with open(file_path, "rb") as f:
        while True:
            # Only the read itself signals end-of-file. Keeping inference
            # outside this `try` means an EOFError raised by the model is no
            # longer mistaken for "no more batches".
            try:
                # SECURITY: pickle.load can execute arbitrary code; only
                # read files produced by this notebook.
                batch_dict = pickle.load(f)
            except EOFError:
                break

            batch_tensors = {k: torch.tensor(v).to(device) for k, v in batch_dict.items()}
            with torch.no_grad():
                outputs = model_(**batch_tensors)

            probs = F.softmax(outputs.logits, dim=1)
            predicted_classes = torch.argmax(probs, dim=1)
            results.append((predicted_classes.cpu().numpy(), probs.cpu().numpy()))
    return results

In [6]:
# Load the data
data_bert_df = pd.read_csv("realdonaldtrump.csv")

# Preprocessing
data_bert_df['combined_tweet'] = data_bert_df.apply(combine_text, axis=1)
data_bert_df['combined_tweet_cleared'] = data_bert_df['combined_tweet'].apply(preprocess_tweet_bert)

# Load the model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Pick the best available backend instead of assuming Apple MPS, so the cell
# also runs on CUDA or CPU-only machines.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Dummy (1, 32) batch used only to trace layer shapes for the summary.
input_ids = torch.randint(0, 1000, (1, 32), dtype=torch.long).to(device)
attention_mask = torch.ones((1, 32), dtype=torch.long).to(device)
summary(model, input_data=(input_ids, attention_mask), device=device)

Layer (type:depth-idx)                                            Output Shape              Param #
RobertaForSequenceClassification                                  [1, 3]                    --
├─RobertaModel: 1-1                                               [1, 32, 768]              --
│    └─RobertaEmbeddings: 2-1                                     [1, 32, 768]              --
│    │    └─Embedding: 3-1                                        [1, 32, 768]              38,603,520
│    │    └─Embedding: 3-2                                        [1, 32, 768]              768
│    │    └─Embedding: 3-3                                        [1, 32, 768]              394,752
│    │    └─LayerNorm: 3-4                                        [1, 32, 768]              1,536
│    │    └─Dropout: 3-5                                          [1, 32, 768]              --
│    └─RobertaEncoder: 2-2                                        [1, 32, 768]              --
│    │    └─ModuleList: 3-6 

In [7]:
from pathlib import Path

texts = data_bert_df['combined_tweet_cleared'].tolist()

# Tokenization is slow, so its result is cached on disk. Run it only when the
# cache is missing; this keeps a fresh "Restart & Run All" working instead of
# failing later on a file that was never produced. Delete the file to force
# re-tokenization after the input data changes.
if not Path("tokenized_batches.pkl").exists():
    tokenize_and_save_stream(texts, tokenizer)

In [8]:
# Feed the batches on whatever device the model weights already live on, so
# inputs and parameters can never end up on different devices.
bert_device = next(model.parameters()).device
batch_results = load_tokenized_batches_stream(model, device=bert_device)

# Merge the per-batch results
sentiment_classes = []
sentiment_probs = []
for batch_class, batch_prob in batch_results:
    sentiment_classes.extend(batch_class)
    sentiment_probs.extend(batch_prob.tolist())

# A cached token file built from different data gives a different number of
# predictions; fail with a clear message before building the frame.
if len(sentiment_classes) != len(texts):
    raise ValueError(
        f"Got {len(sentiment_classes)} predictions for {len(texts)} texts; "
        "the cached token file does not match the current data."
    )

# Add the results to a DataFrame
bert_result_df = pd.DataFrame({
    'combined_tweet': texts,
    'sentiment_class': sentiment_classes,
    'sentiment_probabilities': sentiment_probs
})

# Map class indices to labels
bert_result_df['sentiment_label'] = bert_result_df['sentiment_class'].map({0: 'negative', 1: 'neutral', 2: 'positive'})

# Show the results
bert_result_df[['combined_tweet', 'sentiment_label', 'sentiment_probabilities']]


Unnamed: 0,combined_tweet,sentiment_label,sentiment_probabilities
0,Be sure to tune in and watch Donald Trump on L...,neutral,"[0.00428333505988121, 0.6332980990409851, 0.36..."
1,Donald Trump will be appearing on The View tom...,positive,"[0.003935575485229492, 0.4739065170288086, 0.5..."
2,Donald Trump reads Top Ten Financial Tips on L...,positive,"[0.004720405209809542, 0.12900030612945557, 0...."
3,New Blog Post: Celebrity Apprentice Finale and...,neutral,"[0.004224149510264397, 0.7814801335334778, 0.2..."
4,"""My persona will never be that of a wallflower...",neutral,"[0.23554037511348724, 0.5717381834983826, 0.19..."
...,...,...,...
43347,Joe Biden was a TOTAL FAILURE in Government. H...,negative,"[0.9519144296646118, 0.04053443297743797, 0.00..."
43348,Will be interviewed on @user tonight at 9:00 P...,positive,"[0.0016347389901056886, 0.07318022102117538, 0..."
43349,<IMG>,neutral,"[0.0871957391500473, 0.8049344420433044, 0.107..."
43350,<IMG>,neutral,"[0.0871957391500473, 0.8049344420433044, 0.107..."


In [9]:
def preprocess_simple_tokenize(text):
    """Reduce a tweet to lowercase alphanumeric words for the word-level model.

    Args:
        text: Tweet text.

    Returns:
        Lowercased text in which each ``http...`` link is replaced by the
        token ``url``, every character other than a-z, 0-9 and whitespace is
        removed, and whitespace runs are collapsed to single spaces.
    """
    text = text.lower()
    # The placeholder has to survive the character filter below. The previous
    # "<URL>" consisted only of uppercase letters and angle brackets, so it
    # was erased and links vanished instead of becoming a token.
    text = re.sub(r"http\S+", " url ", text)
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [10]:
# Training table for the word-level model: simplified tweet text, the label
# predicted by the transformer, and that label as an integer class id.
X_data = (
    bert_result_df[['combined_tweet', 'sentiment_label']]
    .rename(columns={'combined_tweet': 'tweet'})
    .assign(
        tweet=lambda frame: frame['tweet'].apply(preprocess_simple_tokenize),
        sentiment_label_numeric=lambda frame: frame['sentiment_label'].map(
            {'negative': 0, 'neutral': 1, 'positive': 2}
        ),
    )
)
X_data


Unnamed: 0,tweet,sentiment_label,sentiment_label_numeric
0,be sure to tune in and watch donald trump on l...,neutral,1
1,donald trump will be appearing on the view tom...,positive,2
2,donald trump reads top ten financial tips on l...,positive,2
3,new blog post celebrity apprentice finale and ...,neutral,1
4,my persona will never be that of a wallflower ...,neutral,1
...,...,...,...
43347,joe biden was a total failure in government he...,negative,0
43348,will be interviewed on user tonight at 900 pm ...,positive,2
43349,img,neutral,1
43350,img,neutral,1


# Model MLP
---
## Architecture



In [22]:
# Vocabulary: ids 0 and 1 are reserved for padding and unknown words; every
# other word gets the next free id the first time it is seen.
word2idx = {'<pad>': 0, '<unk>': 1}
for tweet in X_data['tweet']:
    for word in tweet.split():
        word2idx.setdefault(word, len(word2idx))

def tokenize(tweets, word2idx):
    """Convert whitespace-separated tweets into lists of vocabulary ids.

    Args:
        tweets: Iterable of strings.
        word2idx: Mapping from word to id; must contain '<unk>', whose id is
            used for words missing from the mapping.

    Returns:
        A list with one list of ids per tweet, in input order.
    """
    return [
        [word2idx.get(token, word2idx['<unk>']) for token in sentence.split()]
        for sentence in tweets
    ]


# Encode every tweet and right-pad with the <pad> id up to the longest one.
tokenized_tweets = tokenize(X_data['tweet'], word2idx)
max_len = max(map(len, tokenized_tweets))
pad_id = word2idx['<pad>']
padded_tweets = [ids + [pad_id] * (max_len - len(ids)) for ids in tokenized_tweets]


# Features and integer class labels as tensors.
X = torch.tensor(padded_tweets, dtype=torch.long)
y = torch.tensor(X_data['sentiment_label_numeric'], dtype=torch.long)

# 80/20 split with a fixed random state.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

from torch.utils.data import DataLoader, TensorDataset
train_dataset, test_dataset = TensorDataset(X_train, y_train), TensorDataset(X_test, y_test)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)


In [23]:
class MLPWithMeanPooling(nn.Module):
    """Embedding layer, masked mean pooling and an MLP classifier.

    Token ids are embedded, the embeddings of non-padding positions are
    averaged per sequence, and the pooled vector goes through a stack of
    Linear -> BatchNorm1d -> LeakyReLU -> Dropout blocks followed by a final
    Linear layer that produces the logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dims: Iterable with the width of each hidden block.
        output_dim: Number of output logits.
        padding_idx: Token id treated as padding. The default 0 is the id the
            notebook assigns to '<pad>'; storing it on the module removes the
            dependency on the notebook-global `word2idx`.
        dropout: Dropout probability used in every hidden block.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dims, output_dim,
                 padding_idx=0, dropout=0.5):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

        layers = []
        input_dim = embedding_dim  # width after mean pooling
        for hdim in hidden_dims:
            layers.append(nn.Linear(input_dim, hdim))
            layers.append(nn.BatchNorm1d(hdim))
            layers.append(nn.LeakyReLU())
            layers.append(nn.Dropout(dropout))
            input_dim = hdim

        layers.append(nn.Linear(input_dim, output_dim))
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for ids `x` of shape (batch, seq)."""
        embedded = self.embedding(x)  # (batch, seq, emb)
        mask = (x != self.padding_idx).unsqueeze(2)  # (batch, seq, 1)
        masked_emb = embedded * mask  # zero out padding positions
        # Average over real tokens only; the clamp avoids dividing by zero
        # for a sequence that consists entirely of padding.
        pooled = masked_emb.sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        return self.fc(pooled)

In [24]:
# Seed torch so that weight initialisation and DataLoader shuffling are the
# same on every run; the train/test split is already pinned via random_state.
torch.manual_seed(42)

# Prefer Apple MPS, then CUDA, then CPU.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
model = MLPWithMeanPooling(vocab_size=len(word2idx), embedding_dim=100,
                           hidden_dims=[64, 32], output_dim=3).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

In [25]:
# Train for the configured number of epochs (`num_epochs` was defined next to
# the optimizer but the loop used a separate literal 10).
for epoch in range(num_epochs):
    model.train()
    correct, total, running_loss = 0, 0, 0.0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch figure is a
        # per-sample average even when the last batch is smaller.
        running_loss += loss.item() * batch_x.size(0)
        correct += (outputs.argmax(1) == batch_y).sum().item()
        total += batch_y.size(0)

    print(f"Epoch {epoch+1}: Loss={running_loss/total:.4f}, Acc={correct/total:.4f}")

Epoch 1: Loss=0.9495, Acc=0.5434
Epoch 2: Loss=0.7657, Acc=0.6693
Epoch 3: Loss=0.6691, Acc=0.7244
Epoch 4: Loss=0.6009, Acc=0.7590
Epoch 5: Loss=0.5507, Acc=0.7822
Epoch 6: Loss=0.5065, Acc=0.8062
Epoch 7: Loss=0.4711, Acc=0.8232
Epoch 8: Loss=0.4372, Acc=0.8367
Epoch 9: Loss=0.4077, Acc=0.8507
Epoch 10: Loss=0.3844, Acc=0.8599


In [26]:
model.eval()
test_loss = 0.0
correct = 0
total = 0

all_preds = []
all_labels = []

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        # Use the device the model was placed on rather than a hard-coded
        # "mps", which fails whenever the model runs on CUDA or CPU.
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)

        # Weight the batch-mean loss by batch size to get a per-sample average.
        test_loss += loss.item() * batch_x.size(0)

        _, preds = torch.max(outputs, 1)
        correct += (preds == batch_y).sum().item()
        total += batch_y.size(0)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

test_loss /= total
accuracy = correct / total

print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}\n")

class_names = ['negative', 'neutral', 'positive']

print("Classification Report:\n")
print(classification_report(all_labels, all_preds, target_names=class_names))

print("Confusion Matrix:\n")
print(confusion_matrix(all_labels, all_preds))


# Show up to 20 random predictions; the cap keeps random.sample from raising
# when the test set holds fewer than 20 samples.
indices = random.sample(range(len(all_labels)), min(20, len(all_labels)))
for i in indices:
    print(f"Sample {i+1}: True: {class_names[all_labels[i]]}, Predicted: {class_names[all_preds[i]]}")


Test Loss: 0.6313
Test Accuracy: 0.7594

Classification Report:

              precision    recall  f1-score   support

    negative       0.73      0.81      0.77      2463
     neutral       0.66      0.61      0.63      2404
    positive       0.84      0.82      0.83      3804

    accuracy                           0.76      8671
   macro avg       0.74      0.75      0.74      8671
weighted avg       0.76      0.76      0.76      8671

Confusion Matrix:

[[2002  299  162]
 [ 518 1456  430]
 [ 230  447 3127]]
Sample 6433: True: neutral, Predicted: neutral
Sample 6656: True: positive, Predicted: positive
Sample 3956: True: negative, Predicted: negative
Sample 6452: True: neutral, Predicted: neutral
Sample 3373: True: positive, Predicted: positive
Sample 2028: True: positive, Predicted: positive
Sample 4481: True: positive, Predicted: positive
Sample 4858: True: neutral, Predicted: neutral
Sample 2003: True: positive, Predicted: positive
Sample 7597: True: positive, Predicted: posi

In [27]:
# Evaluate on the training data in eval mode (dropout off, BatchNorm using
# running statistics) for comparison with the test metrics.
model.eval()
train_loss = 0.0
correct = 0
total = 0

all_preds = []
all_labels = []

with torch.no_grad():
    for batch_x, batch_y in train_loader:
        # Use the device the model was placed on rather than a hard-coded
        # "mps", which fails whenever the model runs on CUDA or CPU.
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)

        # Weight the batch-mean loss by batch size to get a per-sample average.
        train_loss += loss.item() * batch_x.size(0)

        _, preds = torch.max(outputs, 1)
        correct += (preds == batch_y).sum().item()
        total += batch_y.size(0)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

train_loss /= total
accuracy = correct / total

print(f"\nTrain Loss: {train_loss:.4f}")
print(f"Train Accuracy: {accuracy:.4f}\n")


print("Classification Report (Train):\n")
print(classification_report(all_labels, all_preds, target_names=['negative', 'neutral', 'positive']))

print("Confusion Matrix (Train):\n")
print(confusion_matrix(all_labels, all_preds))


Train Loss: 0.2784
Train Accuracy: 0.9015

Classification Report (Train):

              precision    recall  f1-score   support

    negative       0.87      0.95      0.91     10058
     neutral       0.87      0.80      0.83      9318
    positive       0.95      0.93      0.94     15305

    accuracy                           0.90     34681
   macro avg       0.89      0.89      0.89     34681
weighted avg       0.90      0.90      0.90     34681

Confusion Matrix (Train):

[[ 9578   375   105]
 [ 1149  7482   687]
 [  318   781 14206]]


In [6]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes, so changes to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [8]:
# NOTE(review): the saved output of this cell is
# "AttributeError: 'VaderSentimentAnalyzer' object has no attribute 'get_minimal_df'".
# The class lives in vader_sentiment_analyzer, which is not visible from this
# notebook - confirm the method name there (or that the module was reloaded)
# before relying on this cell.
from vader_sentiment_analyzer import VaderSentimentAnalyzer

# NOTE(review): this rebinds `analyzer`, which an earlier cell bound to a
# SentimentIntensityAnalyzer instance.
analyzer = VaderSentimentAnalyzer("realdonaldtrump.csv")
analyzer.process()
minimal_df = analyzer.get_minimal_df()
minimal_df

AttributeError: 'VaderSentimentAnalyzer' object has no attribute 'get_minimal_df'