In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Datasets/stock_tweet_sentiment.csv')

# Check the columns and unique values in the Sentiment column
print(f"Columns: {df.columns}")
print(f"Unique Sentiment values:\n{df['Sentiment'].value_counts()}")

# Step 1: Remove rows with missing data
df = df.dropna(subset=['text', 'Sentiment'])  # Ensure both columns have valid values

# Step 2: Remove rows with empty or whitespace-only 'text'
df['text'] = df['text'].str.strip()  # Strip leading/trailing spaces
df = df[df['text'] != '']  # Drop rows where 'text' is empty after stripping spaces

# Step 3: Filter out invalid Sentiment values
valid_labels = [0, 1, 2]  # Assuming sentiment labels are 0, 1, or 2
df = df[df['Sentiment'].isin(valid_labels)]

# Debug: Check dataset shape and label distribution after cleaning
print(f"Dataset shape after cleaning: {df.shape}")
print(f"Sentiment label distribution:\n{df['Sentiment'].value_counts()}")

# Save the cleaned dataset
df.to_csv('Datasets/cleaned_stock_tweet_sentiment.csv', index=False)


Columns: Index(['Unnamed: 0', 'text', 'timestamp', 'source', 'symbols', 'company_names',
       'Sentiment'],
      dtype='object')
Unique Sentiment values:
Sentiment
 0    17330
 1     8512
-1     2598
Name: count, dtype: int64
Dataset shape after cleaning: (25842, 7)
Sentiment label distribution:
Sentiment
0    17330
1     8512
Name: count, dtype: int64


In [2]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

# Dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Return input tensors and label
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Split the dataset
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['Sentiment'].tolist(),
    test_size=0.2,
    random_state=42
)

# Create datasets
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, max_length=128)


In [3]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
)

# Define metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    logits = pred.predictions
    labels = pred.label_ids
    preds = logits.argmax(axis=1)

    # Calculate accuracy, precision, recall, and F1 score
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/5172 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [4]:
# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the model and tokenizer
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")


KeyboardInterrupt: 

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Download required NLTK resources
# nltk.download('stopwords')
# nltk.download('wordnet')

# Load the new dataset
new_data = pd.read_csv('Datasets/tesla_and_others.csv')

# Filter rows for Tesla
filtered_data = new_data[new_data['Stock Name'] == 'TSLA']

def preprocess_tweet(tweet):
    # Initialize lemmatizer and stopwords
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tweet = re.sub(r'http\S+|www.\S+|@\w+|#\w+', '', tweet)  # Remove URLs and mentions
    tweet = tweet.lower()  # Convert to lowercase
    tweet = re.sub(r'[^\w\s]', '', tweet)  # Remove punctuation
    tweet = re.sub(r'\d+', '', tweet)  # Remove numbers
    tweet = ' '.join(word for word in tweet.split() if word not in stop_words)  # Remove stopwords
    tweet = ' '.join(lemmatizer.lemmatize(word) for word in tweet.split())  # Lemmatize words

    return tweet


filtered_data['Tweet'] = filtered_data['Tweet'].apply(preprocess_tweet)

# Load the fine-tuned BERT model and tokenizer
model_name = "sentiment_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create a Dataset and DataLoader for batching
class TweetDataset(Dataset):
    def __init__(self, tweets, tokenizer, max_length):
        self.tweets = tweets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, idx):
        tweet = self.tweets[idx]
        encoding = self.tokenizer(
            tweet,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {key: val.squeeze(0) for key, val in encoding.items()}

# Initialize Dataset and DataLoader
tweet_dataset = TweetDataset(
    tweets=filtered_data['Tweet'].tolist(),
    tokenizer=tokenizer,
    max_length=128
)
batch_size = 32
data_loader = DataLoader(tweet_dataset, batch_size=batch_size)

# Make predictions in batches
all_predictions = []

model.eval()
with torch.no_grad():
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1).cpu().numpy()
        all_predictions.extend(predictions)

filtered_data['Predicted Sentiment'] = np.array(all_predictions) # Add predictions to the DataFrame
filtered_data.to_csv("tsla_sentiment_predictions.csv", index=False)

# preview of the results
print(filtered_data.head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Tweet'] = filtered_data['Tweet'].apply(preprocess_tweet)


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 10346434560 bytes.