In [18]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
import urllib
import os
from argparse import Namespace
from urllib.request import urlopen
import time
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's built-in "ggplot" style sheet to figures drawn after this point.
plt.style.use('ggplot')

import nltk
import torch
from nltk.tokenize import sent_tokenize,word_tokenize
# Fetch the Punkt tokenizer data for NLTK. The call returns True and only
# reports "already up-to-date" when the package is installed locally.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\m1273747\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# Load the pipe-delimited horoscope file (no header row is assumed: column
# names are supplied explicitly), discard rows without text, renumber the
# remaining rows from 0 under an index called 'ID', and keep the first 50.
df = (
    pd.read_csv(
        'horoscopes.csv',
        sep='|',
        on_bad_lines='skip',  # silently skip rows with too many fields
        names=["ID", "COMMENT", "DATE", "SIGN"],
    )
    .dropna(subset=['COMMENT'])
    .reset_index(drop=True)   # fresh 0..n-1 index; the old one is discarded
    .drop(columns=['ID'])     # the file's own ID column is replaced by the new index
    .rename_axis('ID')
    .head(50)
)
print(df.shape)
print(df.head(3))

(50, 3)
                                              COMMENT        DATE   SIGN
ID                                                                      
0   You’re not the sort to play safe and even if y...  12-01-2013  aries
1   There is no such thing as something for nothin...  12-02-2013  aries
2   As the new moon falls in one of the more adven...  12-03-2013  aries


In [20]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [21]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# VADER loads its word list from the "vader_lexicon" NLTK resource; without it
# the constructor below raises LookupError. Only "punkt" is downloaded earlier
# in this notebook, so fetch the lexicon here (a no-op when already installed).
nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

In [22]:
# Hugging Face Hub identifier of the pretrained sentiment classifier.
# (Plain string: the original f-string prefix had no placeholders to format.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# Tokenizer and classification model are loaded from the same checkpoint so
# that the vocabulary matches the model weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [23]:
def polarity_scores_roberta(text, max_length=512):
    """Score ``text`` with the RoBERTa sentiment model as a single polarity value.

    The text is tokenized with the module-level ``tokenizer``, run through the
    module-level ``model``, and the class logits are turned into probabilities
    with softmax. The polarity is ``(p[2] - p[0]) / (p[0] + p[1] + p[2])``.
    Index 0 is presumably the negative class and index 2 the positive class
    (verify against the model card), which would put the result in [-1, 1].

    Parameters
    ----------
    text : str
        The text to score.
    max_length : int, default 512
        Maximum number of tokens fed to the model. Longer inputs are truncated
        rather than crashing the forward pass when they exceed the model's
        position-embedding limit.

    Returns
    -------
    dict
        ``{'roberta': polarity}`` with the polarity as a NumPy float scalar.
    """
    # Explicit max_length: truncation=True alone is not enough when the
    # tokenizer config does not declare a model maximum length.
    encoded_text = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        output = model(**encoded_text)
    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].numpy())
    negative, neutral, positive = scores[0], scores[1], scores[2]
    return {
        'roberta': (positive - negative) / (negative + neutral + positive)
    }

In [24]:
# Per-comment sentiment results, keyed by the DataFrame's ID index.
res_vaders = {}   # ID -> dict returned by sia.polarity_scores
res_roberta = {}  # ID -> dict returned by polarity_scores_roberta

# Walk the COMMENT column once, scoring each text with both analysers.
for myid, comment in df['COMMENT'].items():
    res_vaders[myid] = sia.polarity_scores(comment)
    res_roberta[myid] = polarity_scores_roberta(comment)

In [25]:
# One row per comment ID: keep only VADER's compound score (renamed 'vaders')
# and join the original horoscope columns back on through the shared ID index.
vaders = (
    pd.DataFrame(res_vaders).T            # outer dict keys (IDs) become the row index
    .rename_axis('ID')
    .drop(columns=['neg', 'neu', 'pos'])
    .rename(columns={'compound': 'vaders'})
    .merge(df, how='inner', on=['ID'], validate='one_to_one')
)
#vaders = vaders.drop(columns=['COMMENT'])
vaders

    vaders                                            COMMENT        DATE  \
ID                                                                          
0   0.7102  You’re not the sort to play safe and even if y...  12-01-2013   
1   0.3400  There is no such thing as something for nothin...  12-02-2013   
2   0.4939  As the new moon falls in one of the more adven...  12-03-2013   
3   0.9581  You will hear something amazing today but can ...  12-04-2013   
4   0.8268  A friend or colleague you have not seen for a ...  12-05-2013   
5   0.1154  You may be a nice guy by nature but most likel...  12-06-2013   
6  -0.2500  Someone will give you a tough time this weeken...  12-07-2013   
7   0.3071  You will be restless this coming week, without...  12-08-2013   
8   0.9153  You may be open and honest by nature but is le...  12-09-2013   
9  -0.4767  You must avoid wishful thinking today. Cosmic ...  12-10-2013   
10  0.8225  If you have not made as much progress as you w...  12-11-2013   

In [33]:
# Spot-check the VADER score next to the full text for the last three rows.
for row_id in (47, 48, 49):
    print(vaders['vaders'][row_id], " : ", vaders['COMMENT'][row_id])

0.8591  :  Whatever you happen to be working on now you will get it done quicker and better if you get other people involved. But they must be involved as equals. It may be your vision, your baby, but don’t let your ego get in the way of success.
0.8805  :  Even if you prefer to walk alone it will pay you to get more involved in group activities over the next few days. Friends and social contacts will bring you luck and make life more enjoyable, so get out there and have some fun.
0.4579  :  You may not agree with what you hear today but you must let others have their say. Free speech is worth fighting for, so if someone tries to shout down a dissenting opinion make it your business to defend their right to be different.


In [34]:
# One row per comment ID holding the RoBERTa polarity, joined with the
# original horoscope columns through the shared ID index.
roberta = (
    pd.DataFrame(res_roberta).T           # outer dict keys (IDs) become the row index
    .rename_axis('ID')
    .merge(df, how='inner', on=['ID'], validate='one_to_one')
)
#roberta = roberta.drop(columns=['COMMENT'])

# Spot-check the RoBERTa score next to the full text for the last three rows.
for row_id in (47, 48, 49):
    print(roberta['roberta'][row_id], " : ", roberta['COMMENT'][row_id])

0.7539331475272775  :  Whatever you happen to be working on now you will get it done quicker and better if you get other people involved. But they must be involved as equals. It may be your vision, your baby, but don’t let your ego get in the way of success.
0.9516182392835617  :  Even if you prefer to walk alone it will pay you to get more involved in group activities over the next few days. Friends and social contacts will bring you luck and make life more enjoyable, so get out there and have some fun.
-0.08593526482582092  :  You may not agree with what you hear today but you must let others have their say. Free speech is worth fighting for, so if someone tries to shout down a dissenting opinion make it your business to defend their right to be different.


In [14]:
# Put both sentiment scores side by side, matching rows on (DATE, SIGN);
# validate='one_to_one' makes pandas raise if either side repeats a key pair.
# NOTE(review): if `roberta` and `vaders` both still carry a COMMENT column at
# this point, pandas will emit COMMENT_x / COMMENT_y. The stored output of this
# cell shows neither (and a `roberta_com` column), so it looks stale; re-run
# after deciding whether COMMENT should be dropped upstream.
merged = roberta.merge(
    vaders,
    how='inner',
    on=['DATE', 'SIGN'],
    validate='one_to_one',
)
merged.head(3)

Unnamed: 0,roberta_com,DATE,SIGN,vaders
0,-0.22426,12-01-2013,aries,0.7102
1,-0.250313,12-02-2013,aries,0.34
2,0.716438,12-03-2013,aries,0.4939
