In [2]:
import pandas as pd
import numpy as np
from scipy.special import softmax
import json

In [8]:
# Load the tweet-only test set (the notebook later reads its `id` and `tweet` columns).
data = pd.read_csv(filepath_or_buffer="Test Data/tweet_only_Test.csv")

In [9]:
# Pick the compute device: first CUDA GPU when one is visible, otherwise the CPU.
import torch
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [10]:
# Preview the first five rows of the loaded tweets.
data.head(n=5)

Unnamed: 0,id,tweet
0,1635954265351921667,@TonyGigi2 @HellsKitchenFOX @opensea @GordonRa...
1,1641763189162483715,Please block me before 9pm. 🙏🙏🙏 \n\nFor your o...
2,1636766862414430225,@stockx @Nike 6/6\n\n#Phygital NFTs have the p...
3,1634232094766563328,@Leecanskate Thank you
4,1638952055866794007,@ggreenwald His last name is really Langley???...


In [11]:
# BERT_sentiment_calc
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import TFAutoModelForSequenceClassification
from transformers import pipeline
from scipy.special import softmax

# Hugging Face checkpoint used for every sentiment score in this notebook.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# The three artefacts are loaded independently from the same checkpoint name.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model = model.to(device)  # move the weights to the device selected earlier

def preprocess(text):
    """Mask user mentions and links in a space-separated text.

    The text is split on single spaces only. Any piece that starts with '@' and
    is longer than one character becomes '@user'; any piece that starts with
    'http' becomes 'http'. All other pieces (including empty ones produced by
    consecutive spaces) are kept unchanged, and the pieces are re-joined with
    single spaces.
    """
    def _mask(word):
        if word.startswith('@') and len(word) > 1:
            word = '@user'
        if word.startswith('http'):
            word = 'http'
        return word

    return " ".join(_mask(word) for word in text.split(" "))

def split_text_to_chunks(text, max_chunk_tokens=510):
    """Tokenize `text` and group the tokens into consecutive chunks.

    Args:
        text: the raw text; anything falsy or not a `str` produces no chunks.
        max_chunk_tokens: a chunk is closed as soon as it holds this many tokens.

    Returns:
        A list of token lists (possibly empty). Only the final chunk can be
        shorter than `max_chunk_tokens`.
    """
    if not (text and isinstance(text, str)):
        return []

    # The last element of `pieces` is always the chunk currently being filled.
    pieces = [[]]
    for tok in tokenizer.tokenize(text):
        pieces[-1].append(tok)
        if len(pieces[-1]) >= max_chunk_tokens:
            pieces.append([])

    # Drop the trailing chunk when nothing was put into it.
    if not pieces[-1]:
        pieces.pop()

    return pieces

def get_sentiment_for_chunk(chunk_tokens, max_length=512):
    """Score one chunk of tokens with the sentiment model.

    Args:
        chunk_tokens: list of token strings (as produced by `tokenizer.tokenize`).
        max_length: upper bound on the encoded sequence length, special tokens
            included. Longer inputs are truncated instead of being passed to
            the model as-is.

    Returns:
        1-D numpy array with the softmax of the model's logits for this chunk.
    """
    text = tokenizer.convert_tokens_to_string(chunk_tokens)  # tokens -> plain string
    text = preprocess(text)  # mask mentions / links in the reconstructed string

    # The string is tokenized a second time here; after decoding and
    # preprocessing the token count is not guaranteed to match the chunk that
    # was cut earlier, so cap the length explicitly.
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    ).to(model.device)  # keep inputs on the same device as the model

    # Pure inference: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        output = model(**encoded_input)

    scores = output.logits[0].cpu().numpy()
    return softmax(scores)

def get_sentiment_for_tweet(tweet):
    """Return the tweet's class scores averaged over its token chunks.

    The tweet is split with `split_text_to_chunks`, each chunk is scored with
    `get_sentiment_for_chunk`, and the scores are averaged class by class.

    Returns:
        A list with one averaged score per class, or None when the tweet
        produces no chunks.
    """
    chunk_list = split_text_to_chunks(tweet)
    if not chunk_list:
        return None

    per_chunk_scores = []
    for chunk in chunk_list:
        per_chunk_scores.append(get_sentiment_for_chunk(chunk))

    # zip(*...) regroups the per-chunk vectors into one tuple per class.
    averaged = []
    for class_scores in zip(*per_chunk_scores):
        averaged.append(sum(class_scores) / len(class_scores))
    return averaged

# def get_sentiment_for_tweet(tweet): # weighted average - weights are based on the max deviation from neutral (intensity)
#     chunks = split_text_to_chunks(tweet)
#     if not chunks:
#         return None

#     list_score = [get_sentiment_for_chunk(chunk) for chunk in chunks]

#     # Calculate intensity (distance from neutral)
#     intensities = [abs(score[0] - 0.5) + abs(score[2] - 0.5) for score in list_score]
#     total_intensity = sum(intensities)
#     if total_intensity == 0:
#         weights = [1 / len(list_score)] * len(list_score)  # fallback to average
#     else:
#         weights = [i / total_intensity for i in intensities]

#     # Weighted average
#     weighted_score = [0.0] * len(list_score[0])
#     for w, score in zip(weights, list_score):
#         for i in range(len(score)):
#             weighted_score[i] += w * score[i]

#     return weighted_score

# Adding wrapper to count and print progress
def sentiment_wrapper(tweet):
    """Score one tweet and report progress every 1000 calls.

    Relies on the module-level `processed_counter`, which must be set to a
    number before the first call; it is incremented on every call.
    """
    global processed_counter
    processed_counter = processed_counter + 1
    reached_milestone = (processed_counter % 1000 == 0)
    if reached_milestone:
        print(f"[Progress] Processed {processed_counter} rows.")
    result = get_sentiment_for_tweet(tweet)
    return result

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Reset the progress counter read by sentiment_wrapper, then score every tweet.
processed_counter = 0
tweet_column = data['tweet']
scores = tweet_column.apply(sentiment_wrapper)

[Progress] Processed 1000 rows.
[Progress] Processed 2000 rows.
[Progress] Processed 3000 rows.
[Progress] Processed 4000 rows.
[Progress] Processed 5000 rows.
[Progress] Processed 6000 rows.
[Progress] Processed 7000 rows.
[Progress] Processed 8000 rows.
[Progress] Processed 9000 rows.
[Progress] Processed 10000 rows.
[Progress] Processed 11000 rows.
[Progress] Processed 12000 rows.
[Progress] Processed 13000 rows.
[Progress] Processed 14000 rows.
[Progress] Processed 15000 rows.
[Progress] Processed 16000 rows.
[Progress] Processed 17000 rows.
[Progress] Processed 18000 rows.
[Progress] Processed 19000 rows.
[Progress] Processed 20000 rows.
[Progress] Processed 21000 rows.
[Progress] Processed 22000 rows.
[Progress] Processed 23000 rows.
[Progress] Processed 24000 rows.
[Progress] Processed 25000 rows.
[Progress] Processed 26000 rows.
[Progress] Processed 27000 rows.
[Progress] Processed 28000 rows.
[Progress] Processed 29000 rows.
[Progress] Processed 30000 rows.
[Progress] Processe

#### Extract 3-dim sentiment score and label straight from BERT model

In [13]:
# Create sentiment score and label column
SENTIMENT_LABELS = ('negative', 'neutral', 'positive')  # labels for class index 0, 1, 2


def _top_score_and_label(row):
    """Return (score, label) of the highest-scoring class for one tweet.

    `row` is the per-class score vector stored in `sentiment_BERT`. Rows that
    hold no vector (get_sentiment_for_tweet returns None for empty or
    non-string tweets) yield (NaN, None) instead of raising.
    """
    if not isinstance(row, (list, tuple, np.ndarray)):
        return np.nan, None
    top = int(np.argsort(row)[-1])  # position of the largest score
    return row[top], SENTIMENT_LABELS[top]


data['sentiment_BERT'] = scores
data['sentiment_score'] = data['sentiment_BERT'].apply(lambda row: _top_score_and_label(row)[0])
data['sentiment_label'] = data['sentiment_BERT'].apply(lambda row: _top_score_and_label(row)[1])

#### Dimensionality reduction
- Transform sentiment from 3 dimensions to 2 dimensions (negative vs. positive only), with scores in [0, 1]
- Transform sentiment from 3 dimensions to 1 dimension (a signed score: negative below 0, positive above 0), ranging over [-1, 1]

In [14]:
def _polarity(row):
    """Reduce one score vector to (label, score, signed score).

    Only the first and third entries are compared (the entries this notebook
    labels 'negative' and 'positive'); the middle entry is ignored.

    Returns:
        ('negative', row[0], -row[0]) when row[0] > row[2],
        ('positive', row[2],  row[2]) when row[0] < row[2],
        ('tie', 0, 0) when they are equal, and
        (None, NaN, NaN) when the row holds no score vector
        (get_sentiment_for_tweet returns None for empty or non-string tweets).
    """
    if not isinstance(row, (list, tuple, np.ndarray)):
        return None, np.nan, np.nan
    negative, positive = row[0], row[2]
    if negative > positive:
        return 'negative', negative, -negative
    if negative < positive:
        return 'positive', positive, positive
    return 'tie', 0, 0


data['sentiment_2dim_label'] = data['sentiment_BERT'].apply(lambda row: _polarity(row)[0])
data['sentiment_2dim_score'] = data['sentiment_BERT'].apply(lambda row: _polarity(row)[1])
data['sentiment_1dim'] = data['sentiment_BERT'].apply(lambda row: _polarity(row)[2])

#### Combine sentiment score with the complete dataframe

In [None]:
# Attach the sentiment columns to the complete tweet table (left join on `id`,
# so every row of the complete table is kept).
bigger_file = pd.read_csv("Test Data/Filtered_Tweets_Test.csv")

sentiment_columns = [
    'id',
    'sentiment_BERT',
    'sentiment_score',
    'sentiment_label',
    'sentiment_2dim_label',
    'sentiment_2dim_score',
    'sentiment_1dim',
]
df_new = pd.merge(bigger_file, data[sentiment_columns], on='id', how='left')
df_new.shape

#### Create topic_new category
- Re-categorize topics: rows with topic_score < 0.8, and topics outside the mapped groups, are assigned to "Other"

In [37]:
# Raw topic label -> coarse category; anything not listed maps to "Other".
_TOPIC_GROUPS = {
    "news_&_social_concern": "politics",
    "diaries_&_daily_life": "diaries",
    "sports": "sports",
    "film_tv_&_video": "entertainment",
    "music": "entertainment",
}


def map_topic(row, min_score=0.8):
    """Map a row's raw topic to a coarse category.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with 'topic_score' and
            'topic' entries. 'topic' is only read when the score passes the
            threshold.
        min_score: rows whose 'topic_score' is below this value are returned
            as "Other" regardless of topic. Defaults to 0.8 (the original
            notebook annotates this value as "1 standard deviation"; the basis
            for that is not shown here).

    Returns:
        One of "politics", "diaries", "sports", "entertainment" or "Other".
    """
    if row['topic_score'] < min_score:
        return "Other"
    return _TOPIC_GROUPS.get(row['topic'], "Other")

# Coarse topic category per row, plus a flag for tweets with at least one hashtag.
df_new['topic_new'] = df_new.apply(map_topic, axis='columns')
df_new['hashtag_bool'] = df_new['hashtag_count'].gt(0)

In [None]:
# # save a copy to csv
# df_new.reset_index(drop=True).to_csv("Test Data/Complete_Test_data.csv")

#### Apply filters
- filter 'follower_count'
- filter 'word_count'
- filter out replies

In [38]:
# Upper follower cut-off: the 95th percentile of follower_count.
threshold_follower_95 = df_new['follower_count'].quantile(0.95)
# threshold_likes_95 = df_new['likes'].quantile(0.95)
# threshold_retweets_95 = df_new['retweets'].quantile(0.95)
# threshold_replies_95 = df_new['replies'].quantile(0.95)
# threshold_quotes_95 = df_new['quotes'].quantile(0.95)

# Keep accounts with more than 100 followers but below the 95th percentile,
# and tweets with more than 2 and fewer than 75 words (all bounds exclusive).
followers = df_new['follower_count']
words = df_new['word_count']
followers_in_range = (followers > 100) & (followers < threshold_follower_95)
words_in_range = (words > 2) & (words < 75)
df_filtered = df_new[followers_in_range & words_in_range]

# Drop replies.
not_a_reply = df_filtered['is_reply'] == False
df_filtered = df_filtered[not_a_reply]

In [39]:

# Renumber the surviving rows from 0 and write them (index included) to CSV.
filtered_for_export = df_filtered.reset_index(drop=True)
filtered_for_export.to_csv("Test Data/Complete_Test_data_filtered.csv")

In [None]:
# df_new = pd.read_csv("English_AllTweet_Sample/complete_270k.csv", index_col=False).drop(columns=['Unnamed: 0','sentiment_1dim'])
# df_new['sentiment_BERT'] = df_new['sentiment_BERT'].apply(lambda row: json.loads(row))