In [1]:
# Download the dataset

!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis -p .data/ --unzip

Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
License(s): CC0-1.0
Downloading twitter-entity-sentiment-analysis.zip to .data
100%|██████████████████████████████████████| 1.99M/1.99M [00:00<00:00, 3.03MB/s]
100%|██████████████████████████████████████| 1.99M/1.99M [00:00<00:00, 2.81MB/s]


In [2]:
# Load dataset
import pandas as pd
import numpy as np

sentiment = pd.read_csv('.data/twitter_training.csv')
sentiment_validation = pd.read_csv('.data/twitter_validation.csv')

# Add column names
sentiment.columns = ['id', 'tag', 'sentiment', 'text']
sentiment_validation.columns = ['id', 'tag', 'sentiment', 'text']

sentiment

Unnamed: 0,id,tag,sentiment,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [3]:
# Filter for Positive/Negative only
sentiment = sentiment[sentiment['sentiment'].isin(['Positive', 'Negative'])]
sentiment_validation = sentiment_validation[sentiment_validation['sentiment'].isin(['Positive', 'Negative'])]

In [4]:
sentiment['sentiment'].value_counts()

sentiment
Negative    22542
Positive    20831
Name: count, dtype: int64

In [5]:
sentiment_validation['sentiment'].value_counts()

sentiment
Positive    277
Negative    266
Name: count, dtype: int64

In [6]:
# Validate that 'text' is not null or empty
sentiment = sentiment.dropna(subset=['text'])
sentiment_validation = sentiment_validation.dropna(subset=['text'])

In [7]:
# Compute an embedding, using BERT.
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

count = 0

# Function to compute the embedding for a single sentence
def get_embedding(text):
    global count
    inputs = tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=512)
    outputs = model(inputs)
    count += 1
    if count % 1000 == 0:
        print(f'{count},', end='')
    return tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy().squeeze()

# Compute embeddings for each text in both DataFrames
sentiment['embedding'] = sentiment['text'].apply(get_embedding)
sentiment_validation['embedding'] = sentiment_validation['text'].apply(get_embedding)

# Display the DataFrames with the embeddings
print("Sentiment DataFrame with BERT Embeddings")
print(sentiment.head())

print("\nSentiment Validation DataFrame with BERT Embeddings")
print(sentiment_validation.head())

2024-05-25 14:24:22.450070: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-05-25 14:24:22.450117: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-05-25 14:24:22.450129: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-05-25 14:24:22.450158: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-25 14:24:22.450177: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions

27000,

In [None]:
# Drop the 'tag' and 'id' columns
sentiment = sentiment.drop(columns=['tag', 'id'])
sentiment_validation = sentiment_validation.drop(columns=['tag', 'id'])

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Assuming the embeddings are already added to the DataFrames

# Extract features (embeddings) and labels
X = np.vstack(sentiment['embedding'].values)
y = sentiment['sentiment'].values

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Test Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')
