In [52]:
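# NewsAPI key: do not store it in the notebook. Read it at runtime from an environment variable (e.g. os.environ["NEWS_API_KEY"]) and rotate any key that was committed here.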

In [53]:
import pandas as pd
import numpy as np
from transformers import pipeline
from tqdm import tqdm
from collections import defaultdict
from datetime import timedelta


## load the data

In [54]:
# Read the raw articles. The cell relies on three columns being present:
# publish_date, headline_text and description.
df = pd.read_csv('testData.csv')

# Parse the publish timestamps into a datetime dtype.
df['publish_date'] = pd.to_datetime(df['publish_date'])

# Normalise the text columns to str. Missing values are replaced with '' first:
# a bare astype(str) would turn NaN into the literal string 'nan', which would
# then be fed to the sentiment model as if it were real text.
df['headline_text'] = df['headline_text'].fillna('').astype(str)
df['description'] = df['description'].fillna('').astype(str)

# Working copy with one integer sentiment column per text field.
# Both start at 0; the scoring loop overwrites them with 1 (POSITIVE)
# or -1 (NEGATIVE).
headlines_df = df.copy()
headlines_df['headline_sentiment'] = 0
headlines_df['description_sentiment'] = 0

# Row count, used as the progress-bar total by the scoring loop.
total_headlines = len(headlines_df)

print(headlines_df.dtypes)
# head() must be the last expression of the cell, otherwise Jupyter
# evaluates it and silently discards the result.
headlines_df.head()

publish_date             datetime64[ns, UTC]
headline_text                         object
description                           object
headline_sentiment                     int64
description_sentiment                  int64
dtype: object


## load the model

In [55]:
# Checkpoint identifier handed to the Hugging Face `pipeline` factory.
SENTIMENT_MODEL_ID = "siebert/sentiment-roberta-large-english"

# NOTE: no `device` argument is passed, so the library chooses its default
# device. Pass `device=0` explicitly to run on the first GPU.
specific_model = pipeline(model=SENTIMENT_MODEL_ID)


def _new_label_counter():
    """Create an empty per-label tally whose counts start at 0."""
    return defaultdict(int)


# Two-level tally: sentiments[outer_key][label] -> count. Both levels are
# created on first access, with counts starting at 0.
sentiments = defaultdict(_new_label_counter)

In [56]:
# Numeric encoding written to the *_sentiment columns for each model label.
LABEL_TO_SCORE = {'POSITIVE': 1, 'NEGATIVE': -1}


def _score_text(text):
    """Classify one piece of text and return its numeric sentiment.

    Args:
        text: The headline or description to classify.

    Returns:
        1 if the label of the model's first prediction is 'POSITIVE',
        -1 if it is 'NEGATIVE', and 0 for blank text (the model is not
        called) or for any other label.
    """
    text = str(text)
    if not text.strip():
        # Nothing to classify; leave the row at the neutral value.
        return 0
    first_prediction = specific_model(text)[0]
    return LABEL_TO_SCORE.get(first_prediction['label'], 0)


# Scores are collected in row order and written back as whole columns.
# The previous version read rows positionally (.iloc[idx]) but wrote them by
# label (.at[idx, ...]), which only lines up when the frame has the default
# 0..n-1 index; on any other index it would update the wrong rows or append
# new ones.
# The per-month POSITIVE/NEGATIVE tallies in `sentiments` are not updated here.
headline_scores = []
description_scores = []

with tqdm(total=len(headlines_df), desc="Analysing Sentiments", unit="headline", dynamic_ncols=True) as pbar:
    for headline, description in zip(headlines_df['headline_text'], headlines_df['description']):
        headline_scores.append(_score_text(headline))
        description_scores.append(_score_text(description))
        pbar.update(1)

headlines_df['headline_sentiment'] = headline_scores
headlines_df['description_sentiment'] = description_scores

Analysing Sentiments: 100%|██████████| 4/4 [00:01<00:00,  2.61headline/s]


In [58]:
# Preview the first five rows of the frame, including the sentiment columns.
headlines_df.head(n=5)

Unnamed: 0,publish_date,headline_text,description,headline_sentiment,description_sentiment
0,2024-01-28 04:50:27+00:00,Elon Musk joins Trump Republicans to slam rumo...,Elon Musk joined Donald Trump and Republican c...,-1,-1
1,2024-01-28 00:10:36+00:00,Tesla battery explodes in Cary home after bein...,Tesla battery explodes in Cary home after bein...,-1,-1
2,2024-01-27 21:01:05+00:00,Here's why Biden's multi-billion-dollar EV cha...,By Will Kessler Daily Caller News Foundation T...,-1,-1
3,2024-01-27 17:30:00+00:00,Who wants to be a trillionaire': The game show...,A trillion dollars can purchase shares of all...,1,1


In [57]:
# DISABLED CELL: everything below is commented out and does not run.
# It flattens the `sentiments` tallies (outer key -> POSITIVE/NEGATIVE counts)
# into a DataFrame, converts the key to a quarterly period, and sums the
# counts per quarter. It only produces rows if `sentiments` has been filled,
# and the code that fills it in the scoring loop is itself commented out.

# combined_data = {
#     'interval': [],
#     'publish_date': [],
#     'Positive': [],
#     'Negative': []
# }

# for sentiment_month, sentiment_counts in sentiments.items():
#     combined_data['interval'].append(sentiment_month)
#     combined_data['publish_date'].append(sentiment_month)
#     combined_data['Positive'].append(sentiment_counts['POSITIVE'])
#     combined_data['Negative'].append(sentiment_counts['NEGATIVE'])

# combined_data = pd.DataFrame(combined_data)
# combined_data['interval'] = pd.to_datetime(combined_data['interval'])
# combined_data['interval'] = combined_data['interval'].dt.to_period('Q')

# # Group and aggregate by 3-month intervals
# grouped_data = combined_data.groupby('interval', as_index=False).agg({
#     'publish_date': 'min',
#     'Positive': 'sum',
#     'Negative': 'sum'
# })

# grouped_data.head()