In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
import urllib
import os
from argparse import Namespace
from urllib.request import urlopen
import time
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import nltk
import torch

In [2]:
# Load the pipe-delimited horoscope dump, keep only rows that have a comment,
# and re-key the frame with a fresh 0..n-1 index named 'ID' (the ID column
# read from the file is discarded).
df = (
    pd.read_csv(
        'horoscopes.csv',
        sep='|',
        on_bad_lines='skip',  # malformed rows are skipped rather than raising
        names=["ID", "COMMENT", "DATE", "SIGN"],
    )
    .dropna(subset=['COMMENT'])
    .reset_index(drop=True)
    .drop(columns=['ID'])
    .rename_axis('ID')
)
print(df.shape)
print(df.head(3))

(12946, 3)
                                              COMMENT        DATE   SIGN
ID                                                                      
0   You’re not the sort to play safe and even if y...  12-01-2013  aries
1   There is no such thing as something for nothin...  12-02-2013  aries
2   As the new moon falls in one of the more adven...  12-03-2013  aries


In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [4]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# VADER scores are computed from the 'vader_lexicon' NLTK resource; make sure
# it is available so this cell also works on a fresh environment. The call is
# a no-op when the resource is already installed and up to date.
nltk.download('vader_lexicon', quiet=True)

# Shared VADER analyzer, reused for every comment scored below.
sia = SentimentIntensityAnalyzer()

In [5]:
# Checkpoint identifier passed to both `from_pretrained` calls so the tokenizer
# and the classifier always come from the same model.
# (Plain string: the original f-string had no placeholders.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [15]:
def polarity_scores_roberta(text, max_length=512):
    """Score one text with the RoBERTa classifier and return a compound value.

    The text is tokenized, run through the module-level ``model``, and the
    three class logits are turned into probabilities with ``softmax``. The
    result is ``(p[2] - p[0]) / (p[0] + p[1] + p[2])``.

    NOTE(review): this assumes the checkpoint's label order is
    (negative, neutral, positive), which would make the score range from
    -1 (negative) to +1 (positive) -- confirm against the model card.

    Parameters
    ----------
    text : str
        The text to score.
    max_length : int, default 512
        Maximum number of tokens fed to the model; longer inputs are
        truncated. Without truncation, inputs longer than the model's
        position-embedding table make the forward pass raise.
        NOTE(review): 512 is the usual RoBERTa-base limit -- confirm for
        this checkpoint.

    Returns
    -------
    dict
        ``{'roberta': score}`` where ``score`` is a NumPy floating scalar.
    """
    # An explicit max_length is passed because `truncation=True` alone relies on
    # the tokenizer's configured model_max_length, which may not be set.
    encoded_text = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        output = model(**encoded_text)

    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].numpy())

    scores_dict = {
        'roberta': (scores[2] - scores[0]) / (scores[0] + scores[1] + scores[2])
    }
    return scores_dict

In [16]:
# Per-comment sentiment results, keyed by the DataFrame index value (ID).
res_vaders = {}   # ID -> dict returned by sia.polarity_scores
res_roberta = {}  # ID -> dict returned by polarity_scores_roberta

# Iterate (ID, comment) pairs directly instead of the chained lookup
# df['COMMENT'][i], and show a progress bar: the RoBERTa forward pass makes
# this loop slow over the full frame.
for myid, comment in tqdm(df['COMMENT'].items(), total=len(df), desc='Scoring'):
    res_vaders[myid] = sia.polarity_scores(comment)
    res_roberta[myid] = polarity_scores_roberta(comment)

KeyboardInterrupt: 

In [11]:
# Turn the {ID: VADER scores} mapping into one row per ID, keep only the
# compound score (renamed to 'vaders'), and attach DATE / SIGN from `df`.
# Both frames carry an index named 'ID', which is what the merge keys on.
vaders = (
    pd.DataFrame(res_vaders).T
    .rename_axis('ID')
    .drop(columns=['neg', 'neu', 'pos'])
    .rename(columns={'compound': 'vaders'})
    .merge(df, how='inner', on=['ID'], validate='one_to_one')
    .drop(columns=['COMMENT'])
)
print(vaders.head(3))

    vaders        DATE   SIGN
ID                           
0   0.7102  12-01-2013  aries
1   0.3400  12-02-2013  aries
2   0.4939  12-03-2013  aries


In [12]:
# Turn the {ID: RoBERTa score dict} mapping into one row per ID and attach
# DATE / SIGN from `df`. The score column is named after the key returned by
# polarity_scores_roberta; both frames carry an index named 'ID', which is
# what the merge keys on.
roberta = (
    pd.DataFrame(res_roberta).T
    .rename_axis('ID')
    .merge(df, how='inner', on=['ID'], validate='one_to_one')
    .drop(columns=['COMMENT'])
)
print(roberta.head(3))

    roberta_com        DATE   SIGN
ID                                
0     -0.224260  12-01-2013  aries
1     -0.250313  12-02-2013  aries
2      0.716438  12-03-2013  aries


In [14]:
# Put the RoBERTa and VADER scores side by side, matching rows on the
# (DATE, SIGN) pair. validate='one_to_one' makes pandas raise if that pair is
# not unique in either frame.
merged = roberta.merge(
    vaders,
    how='inner',
    on=['DATE', 'SIGN'],
    validate='one_to_one',
)
merged.head(3)

Unnamed: 0,roberta_com,DATE,SIGN,vaders
0,-0.22426,12-01-2013,aries,0.7102
1,-0.250313,12-02-2013,aries,0.34
2,0.716438,12-03-2013,aries,0.4939
