In [None]:
!pip install transformers scipy pandas plotly ekphrasis numpy torch

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import pandas as pd
import plotly
from plotly import graph_objects as go
import plotly.express as px
import ekphrasis
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from tqdm import tqdm

In [None]:
def preprocess(text):
    '''Mask user mentions and links in a piece of text with placeholder tokens.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. Every other token is kept unchanged, and the
    tokens are joined back together with single spaces.

    Parameters
    ----------
    text : str
        Unprocessed text that may contain user mentions and links.

    Returns
    -------
    str
        The text with mentions and links replaced by placeholders.
    '''
    def _mask(token):
        # A lone '@' is not a mention, so it is passed through untouched.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))
    

# You can run alternate versions of this model specialized to different tasks
# emoji, emotion, hate, irony, offensive, sentiment
# `task` is interpolated into both the model identifier and the label-mapping URL below.
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping for the selected task.
# The file is read as tab-separated text; the second field of every row that
# has more than one field is kept as a label.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# The reader iterates over the in-memory list `html`, so it can still be
# consumed after the HTTP response has been closed.
# NOTE(review): `labels` is reassigned to a hard-coded dict further down in this notebook.
labels = [row[1] for row in csvreader if len(row) > 1]

# Load the pretrained PyTorch model.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Both save calls receive the MODEL string as their target path.
# NOTE(review): this presumably creates a local directory with the same name
# as the hub identifier, which later from_pretrained(MODEL) calls may pick up
# instead of the hub copy - confirm this is intended.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

# Tweets to score; the scoring loop later reads the 'Body' column of this frame.
# The path is relative to the notebook's working directory.
df = pd.read_csv('news_tweets.csv')

def analyze(text, model, no_neutral=False, raw = False):
    '''Score a single piece of text with a sequence-classification model.

    The text is normalised with ``preprocess``, tokenised with the
    module-level ``tokenizer`` and passed through ``model``. The logits of
    the single example are converted to probabilities with a softmax.

    Parameters
    ----------
    text : str
        Raw text to score.
    model : callable
        Model invoked as ``model(**encoded_input)``; the first element of its
        output is expected to hold the logits with a leading batch dimension.
    no_neutral : bool, optional
        Accepted for backward compatibility; currently has no effect.
    raw : bool, optional
        Accepted for backward compatibility; currently has no effect.

    Returns
    -------
    numpy.ndarray
        One softmax score per class, in the model's own label order.
    '''
    # Local import: the notebook's import cell does not import torch itself.
    import torch

    # RoBERTa-base accepts at most 512 tokens; longer inputs would raise
    # inside the model, so they are truncated instead of crashing the run.
    max_tokens = 512

    processed_text = preprocess(text)
    encoded_input = tokenizer(
        processed_text,
        return_tensors='pt',
        truncation=True,
        max_length=max_tokens,
    )
    # Inference only: skip building the autograd graph to save time and memory.
    with torch.no_grad():
        output = model(**encoded_input)
    # output[0] holds the logits; [0] selects the only example in the batch.
    scores = output[0][0].detach().numpy()
    return softmax(scores)

# For Twitter, where the .csv files are already separated by categories:
# score every tweet body and collect [text, scores] pairs in `outputs`.
outputs=[]
# Class index -> readable name.
# NOTE(review): presumably matches the order of the scores returned by
# analyze() - confirm against the downloaded label mapping.
labels = {0: 'Negative', 1:'Neutral', 2:'Positive'}

# The loop used to `break` after the first row, which left `outputs` with a
# single entry; it now runs over the whole 'Body' column.
# `raw` and `no_neutral` are accepted by analyze() but currently ignored by it.
for i in tqdm(df['Body']):
    res = analyze(i, model, raw=False, no_neutral=True)
    outputs.append([i, res])
# From this, we obtained all the information we used in our study about the sentiment of tweets


In [None]:
# Preview only the first few [text, scores] pairs instead of dumping the whole list.
outputs[:5]