In [4]:
!pip install transformers scipy pandas plotly ekphrasis numpy torch

Collecting transformers
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
Collecting ekphrasis
  Using cached ekphrasis-0.5.4-py3-none-any.whl (83 kB)
Collecting filelock
  Using cached filelock-3.8.0-py3-none-any.whl (10 kB)
Collecting regex!=2019.12.17
  Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)
Collecting huggingface-hub<1.0,>=0.10.0
  Using cached huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
Collecting termcolor
  Using cached termcolor-2.1.1-py3-none-any.whl (6.2 kB)
Collecting ftfy
  Using cached ftfy-6.1.1-py3-none-any.whl (53 kB)
Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting ujson
  Using cached ujson-5.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (45 kB)
Installing collected packages: ujson, termcolor, regex, ftfy, filelock, huggingface-hub, transformers, nltk, ekphrasis
Successfully installed ekphrasis-0.5.4 filelock-3.8.0 ftfy-6.1.1 huggingface-hub-0.11.1 nlt

In [13]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import pandas as pd
import plotly
from plotly import graph_objects as go
import plotly.express as px
import ekphrasis
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from tqdm import tqdm

In [16]:
def preprocess(text):
    '''Mask user mentions and links in a piece of text.

    The text is split on single spaces. Any token that starts with '@' and is
    longer than one character becomes '@user'; any token that starts with
    'http' becomes 'http'. All other tokens are kept unchanged, and the tokens
    are re-joined with single spaces.

    Parameters
    ----------
    text : string of unprocessed text (may contain links and usernames)

    Returns
    -------
    string with mentions and links replaced by the placeholders above
    '''
    def _mask(token):
        # Mentions are checked first; a lone '@' is not treated as a mention.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))
    

# Setup: pick the task, load the tokenizer and model, fetch the label mapping,
# and read the tweets that will be scored.
#
# You can run alternate versions of this model specialized to different tasks
# emoji, emotion, hate, irony, offensive, sentiment
task='sentiment'
# Model identifier built from the task name; used for both loading and saving below.
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
# The mapping file is read as tab-separated rows; the second column of every
# row that has at least two columns is kept as the label name.
# NOTE(review): `labels` is reassigned to a hard-coded dict later in this
# notebook, so the downloaded mapping looks unused — confirm before removing.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# The reader is consumed after the `with` block; this works because `html` is
# already a fully-read list of lines, not the open response.
labels = [row[1] for row in csvreader if len(row) > 1]

# Load pretrained Pytorch model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Both saves write to a local path equal to the MODEL string.
# NOTE(review): presumably so later runs can load from disk instead of
# downloading again — confirm this is intended.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

# Tweets to score; the scoring loop further down reads the 'Body' column.
df = pd.read_csv('news_tweets.csv')

def analyze(text, model, no_neutral=False, raw=False, max_length=512):
    '''Score one piece of text with a sequence-classification model.

    The text is passed through preprocess(), tokenized with the module-level
    `tokenizer`, and run through `model`. The first row of the model's first
    output (the logits for Hugging Face sequence classifiers) is converted to
    probabilities with softmax.

    Parameters
    ----------
    text : string to score
    model : model invoked as model(**encoded_input)
    no_neutral : accepted for backward compatibility; currently ignored
    raw : accepted for backward compatibility; currently ignored
    max_length : maximum number of tokens passed to the model; longer inputs
        are truncated instead of failing inside the model. The default of 512
        is chosen for RoBERTa-base checkpoints — NOTE(review): adjust when
        using a model with a different input limit.

    Returns
    -------
    1-D numpy array of softmax scores, one per class
    '''
    processed_text = preprocess(text)
    # max_length is passed explicitly rather than relying on the tokenizer's
    # own configured limit, which may not be set for every checkpoint.
    encoded_input = tokenizer(
        processed_text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_input)
    # detach() drops the autograd graph so the tensor can be converted to numpy.
    scores = output[0][0].detach().numpy()
    return softmax(scores)

# For Twitter, where the .csv files are already separated by categories:
# Index -> name for the three scores; presumably matches the order of the
# array returned by analyze() — verify against the model's label mapping.
labels = {0: 'Negative', 1:'Neutral', 2:'Positive'}

# How many tweets to score. 1 keeps this cell a quick smoke test (the
# behaviour of the earlier unconditional `break`); set to None to score
# every row of df['Body'].
MAX_TWEETS = 1

tweets = df['Body'] if MAX_TWEETS is None else df['Body'].head(MAX_TWEETS)

# Each entry is [tweet text, softmax score array].
outputs = []
for tweet in tqdm(tweets):
    res = analyze(tweet, model, raw=False, no_neutral=True)
    outputs.append([tweet, res])
# Running this over every row (MAX_TWEETS = None) produced the information
# we used in our study about the sentiment of tweets.


  0%|          | 0/1099 [00:00<?, ?it/s]


In [17]:
# Preview only the first few results rather than dumping the whole list.
outputs[:5]

[['#Ethiopia Peace Process Undermined as #Eritrea Forces Continue Attacking Civilians - Bloomberg https://t.co/4vIVOxw8Om',
  array([0.7592903 , 0.231484  , 0.00922581], dtype=float32)]]