In [1]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import glob

In [2]:
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. Every token that starts with ``@``
    and is longer than one character becomes ``@user``; every token that
    starts with ``http`` becomes ``http``. All other tokens are kept as they
    are, and the tokens are joined back together with single spaces.
    """
    def _mask(token):
        # Same two checks, in the same order, as a sequence of reassignments.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [3]:
# Name/path handed to every from_pretrained / save_pretrained call in this notebook.
sentiment_model_path = "cardiffnlp/twitter-roberta-base-sentiment"
# Tokenizer matching the sentiment model.
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_path)

In [4]:
# Download the label mapping: the file is parsed as tab-separated rows and the
# second field of each row is kept as a label, in file order.
sentiment_labels = []
sentiment_mapping_link = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
with urllib.request.urlopen(sentiment_mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
# The response has been read completely, so parsing can happen after the
# connection is closed.
csvreader = csv.reader(html, delimiter='\t')
# Rows with fewer than two fields (e.g. blank lines) carry no label.
sentiment_labels.extend(fields[1] for fields in csvreader if len(fields) > 1)

In [5]:
# Load the sequence-classification model identified by sentiment_model_path.
sentiment_model = AutoModelForSequenceClassification.from_pretrained(
    sentiment_model_path)
# Save the model and the tokenizer into a directory whose relative path is the
# same string as sentiment_model_path.
# NOTE(review): because that directory has the same name as the model
# identifier, later from_pretrained(sentiment_model_path) calls made from this
# working directory will presumably load the local copy -- confirm this is
# intended.
sentiment_model.save_pretrained(sentiment_model_path)
# Kept as the last expression of the cell: its return value (the paths of the
# written tokenizer files) is what the cell displays.
sentiment_tokenizer.save_pretrained(sentiment_model_path)

('cardiffnlp/twitter-roberta-base-sentiment\\tokenizer_config.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\special_tokens_map.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\vocab.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\merges.txt',
 'cardiffnlp/twitter-roberta-base-sentiment\\added_tokens.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\tokenizer.json')

In [6]:
# Constants
# Relative directories that the scoring cells glob for '*.csv' files; each CSV
# is read with pandas and is expected to provide 'username' and 'tweet' columns.
# One directory per account category.
PHARMA_PATH = '../../data/twitter/extra data/pharma companies'
GOVT_INSTITUTES_PATH = '../../data/twitter/extra data/public health agencies'
NGO_PATH = '../../data/twitter/extra data/ngo'

In [7]:
# for file in glob.glob(PHARMA_PATH+'/*.csv'):
#     user_df = pd.read_csv(file)
#     username = user_df['username'].unique()[0]
    
#     for index, row in user_df.iterrows():
#         tweet = row['tweet']
#         tweet = preprocess(tweet)

#         # Calculate sentiment
#         sentiment_encoded_input = sentiment_tokenizer(
#             tweet, return_tensors='pt')
#         sentiment_output = sentiment_model(**sentiment_encoded_input)
#         sentiment_scores = sentiment_output[0][0].detach().numpy()
#         sentiment_scores = softmax(sentiment_scores)

#         sentiment_ranking = np.argsort(sentiment_scores)
#         sentiment_ranking = sentiment_ranking[::-1]
#         # print(sentiment_labels[sentiment_ranking[0]])
#         for i in range(sentiment_scores.shape[0]):
#             sentiment_label = sentiment_labels[sentiment_ranking[i]]
#             sentiment_score = sentiment_scores[sentiment_ranking[i]]
#             # print(f'{sentiment_label} {np.round(float(sentiment_score), 4)}')
#             user_df.at[index, sentiment_label] = np.round(
#                 float(sentiment_score), 6)
#         user_df.at[index, 'sentiment'] = sentiment_labels[sentiment_ranking[0]]
    
#     user_df.to_csv(username+'.csv', index=False)

In [8]:
# Score every tweet of every CSV under GOVT_INSTITUTES_PATH and write one
# annotated CSV per input file into the current working directory, named after
# the first username found in that file.
for file in glob.glob(GOVT_INSTITUTES_PATH+'/*.csv'):
    user_df = pd.read_csv(file)
    if user_df.empty:
        # A file without rows has no username to name the output after and
        # nothing to score; skip it instead of failing on unique()[0].
        print(f'Skipping empty file: {file}')
        continue
    username = user_df['username'].unique()[0]

    for index, row in user_df.iterrows():
        tweet = row['tweet']
        if not isinstance(tweet, str):
            # pandas reads an empty cell as float NaN, which has no .split();
            # stringify non-string values so preprocess() does not raise.
            tweet = str(tweet)
        tweet = preprocess(tweet)

        # Calculate sentiment
        sentiment_encoded_input = sentiment_tokenizer(
            tweet, return_tensors='pt')
        sentiment_output = sentiment_model(**sentiment_encoded_input)
        # First output element, first (only) item of the batch -> class logits.
        sentiment_scores = sentiment_output[0][0].detach().numpy()
        sentiment_scores = softmax(sentiment_scores)

        # Write one probability column per label, always in label order, so
        # the column layout of the output does not depend on how the first
        # tweet of the file happened to be ranked.
        for label_index, sentiment_score in enumerate(sentiment_scores):
            user_df.at[index, sentiment_labels[label_index]] = np.round(
                float(sentiment_score), 6)

        # Label with the highest probability (last entry of the ascending sort).
        top_index = np.argsort(sentiment_scores)[-1]
        user_df.at[index, 'sentiment'] = sentiment_labels[top_index]

    # f-string instead of '+' so a non-string username cannot raise TypeError.
    user_df.to_csv(f'{username}.csv', index=False)

In [None]:
# for file in glob.glob(NGO_PATH+'/*.csv'):
#     user_df = pd.read_csv(file)
#     username = user_df['username'].unique()[0]
    
#     for index, row in user_df.iterrows():
#         if(isinstance(row.tweet, float)):
#             row.tweet = str(row.tweet)
#         tweet = row['tweet']
#         tweet = preprocess(tweet)

#         # Calculate sentiment
#         sentiment_encoded_input = sentiment_tokenizer(
#             tweet, return_tensors='pt')
#         sentiment_output = sentiment_model(**sentiment_encoded_input)
#         sentiment_scores = sentiment_output[0][0].detach().numpy()
#         sentiment_scores = softmax(sentiment_scores)

#         sentiment_ranking = np.argsort(sentiment_scores)
#         sentiment_ranking = sentiment_ranking[::-1]
#         # print(sentiment_labels[sentiment_ranking[0]])
#         for i in range(sentiment_scores.shape[0]):
#             sentiment_label = sentiment_labels[sentiment_ranking[i]]
#             sentiment_score = sentiment_scores[sentiment_ranking[i]]
#             # print(f'{sentiment_label} {np.round(float(sentiment_score), 4)}')
#             user_df.at[index, sentiment_label] = np.round(
#                 float(sentiment_score), 6)
#         user_df.at[index, 'sentiment'] = sentiment_labels[sentiment_ranking[0]]
    
#     user_df.to_csv(username+'.csv', index=False)