In [1]:
import re
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from dateutil import parser
from collections import defaultdict
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

2024-02-21 03:11:33.679763: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
df = pd.read_csv('yahoo.csv')

In [84]:
df.head()

Unnamed: 0,Date,ticker,category,title,content,Open,High,Low,Close,Adj Close,Volume
1970-01-01 00:00:00.000015974,2012-07-23,AAPL,opinion,Trade Apple After Earnings,It may look like a spider web but the mishmas...,21.228571,21.639286,20.989643,21.565357,18.413187,487975600
1970-01-01 00:00:00.000015973,2012-07-23,AAPL,opinion,Apple Earnings Preview Quarterly Dip On Deck,Last quarter Apple AAPL reported the 2nd bes...,21.228571,21.639286,20.989643,21.565357,18.413187,487975600
1970-01-01 00:00:00.000015972,2012-07-23,AAPL,opinion,Summer Heat Scorches Europe And U S,Europe flares as summer heat continues Summer...,21.228571,21.639286,20.989643,21.565357,18.413187,487975600
1970-01-01 00:00:00.000015967,2012-07-24,AAPL,opinion,Market Bait And Switch,That is the sound we are going to hear soon fr...,21.692142,21.774286,21.375357,21.46143,18.324455,565132400
1970-01-01 00:00:00.000015968,2012-07-27,AAPL,opinion,Will AAPL Fall From The Tree,Apple s AAPL sales for the third quarter mis...,20.536072,20.922501,20.413929,20.898571,17.84387,403936400


In [5]:
df['Date'] = pd.to_datetime(df['Date'])

In [16]:
features = df[['content','Date']]

In [None]:
features.head()

In [18]:
features['Date'] = pd.to_datetime(features['Date'])

train_df = features[features['Date'] < pd.to_datetime('2015-01-01')]
test_df = features[features['Date'] >= pd.to_datetime('2015-01-01')]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['Date'] = pd.to_datetime(features['Date'])


In [20]:
pipe = pipeline("text-classification", model="ProsusAI/finbert")

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [21]:
def analyze_sentiment(dataframe, datelabel, textlabel):
    sentiments = defaultdict(lambda: {'pos': [], 'neg': [], 'neu': []})

    for index, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
        text = [row[textlabel]]

        inputs = tokenizer(text, return_tensors="pt")
        outputs = model(**inputs)
        probs = outputs.logits.softmax(dim=-1)

        positive = probs[:, 0].tolist()
        negative = probs[:, 1].tolist()
        neutral = probs[:, 2].tolist()

        sentiments[row[datelabel]]['pos'].extend(positive)
        sentiments[row[datelabel]]['neg'].extend(negative)
        sentiments[row[datelabel]]['neu'].extend(neutral)

    return sentiments


In [36]:
def compute_average_sentiments(sentiments):
    accumulated_sentiments = {}
    
    for data, scores in sentiments.items():
        accumulated_scores = {'pos': 0.0, 'neg': 0.0, 'neu': 0.0, 'count': 0}
        
        for i in range(len(scores['pos'])):
            pos_score, neg_score, neu_score = scores['pos'][i], scores['neg'][i], scores['neu'][i]
            accumulated_scores['pos'] += pos_score
            accumulated_scores['neg'] += neg_score
            accumulated_scores['neu'] += neu_score
            accumulated_scores['count'] += 1
            
        averaged_scores = {
            'pos': accumulated_scores['pos'] / accumulated_scores['count'] if accumulated_scores['count'] > 0 else accumulated_scores['pos'],
            'neg': accumulated_scores['neg'] / accumulated_scores['count'] if accumulated_scores['count'] > 0 else accumulated_scores['neg'],
            'neu': accumulated_scores['neu'] / accumulated_scores['count'] if accumulated_scores['count'] > 0 else accumulated_scores['neu'],
        }
        
        accumulated_sentiments[data] = averaged_scores
        
        
    return accumulated_sentiments


In [37]:
sentiments = analyze_sentiment(features, 'Date', 'title')
sentiments_dict = compute_average_sentiments(sentiments)

100%|█████████████████████████████████████| 15975/15975 [10:29<00:00, 25.38it/s]


In [38]:
sentiments_dict

{Timestamp('2020-01-27 00:00:00'): {'pos': 0.44727897364646196,
  'neg': 0.16598930780310184,
  'neu': 0.38673172635026276},
 Timestamp('2020-01-24 00:00:00'): {'pos': 0.29445303068496287,
  'neg': 0.04595968429930508,
  'neu': 0.6595872915349901},
 Timestamp('2020-01-23 00:00:00'): {'pos': 0.39081306258837384,
  'neg': 0.11763703343846525,
  'neu': 0.4915499060104291},
 Timestamp('2020-01-22 00:00:00'): {'pos': 0.23135214699432255,
  'neg': 0.24329587298755845,
  'neu': 0.5253519743680954},
 Timestamp('2020-01-21 00:00:00'): {'pos': 0.32245114277447423,
  'neg': 0.1912914031155167,
  'neu': 0.48625744741049504},
 Timestamp('2020-01-17 00:00:00'): {'pos': 0.21195709180425515,
  'neg': 0.13527941864661194,
  'neu': 0.6527634706686843},
 Timestamp('2020-01-16 00:00:00'): {'pos': 0.200909467227757,
  'neg': 0.1320876222724716,
  'neu': 0.6670029036700725},
 Timestamp('2020-01-15 00:00:00'): {'pos': 0.4042472666595131,
  'neg': 0.08157592068891972,
  'neu': 0.5141768073663116},
 Timestamp(

In [39]:
def visualize_sentiment_data(sentiments_dict):

    visualization_data = pd.DataFrame(list(sentiments_dict.items()), columns=['Date', 'sentiment'])
    visualization_data[['pos', 'neg', 'neu']] = pd.DataFrame(visualization_data['sentiment'].tolist(), index=visualization_data.index)
    visualization_data = visualization_data.drop(columns=['sentiment'])
    
    return visualization_data



Generating a dataframe and saving it to a pickle file for later use.

In [40]:

visualization_df = visualize_sentiment_data(sentiments_dict)

visualization_df

Unnamed: 0,Date,pos,neg,neu
0,2020-01-27,0.447279,0.165989,0.386732
1,2020-01-24,0.294453,0.045960,0.659587
2,2020-01-23,0.390813,0.117637,0.491550
3,2020-01-22,0.231352,0.243296,0.525352
4,2020-01-21,0.322451,0.191291,0.486257
...,...,...,...,...
1649,2012-07-24,0.031984,0.038037,0.929979
1650,2012-07-27,0.040444,0.418283,0.541273
1651,2012-07-30,0.198772,0.141263,0.659965
1652,2012-07-31,0.366279,0.026320,0.607401


In [44]:
visualization_df.to_csv('Extracted_features.csv', index=False)

In [62]:
df_news = concatenated_df

In [66]:
df_news['Date'] = pd.to_datetime(df_news['Date'])

In [69]:
df = df_news

In [76]:
df['Date'] = pd.to_datetime(df['Date'])

df.sort_values(by='Date', ascending=True, inplace=True)
dates_list = df['Date'].tolist()
print(dates_list[:20]) 

[Timestamp('2012-07-23 00:00:00'), Timestamp('2012-07-23 00:00:00'), Timestamp('2012-07-23 00:00:00'), Timestamp('2012-07-24 00:00:00'), Timestamp('2012-07-27 00:00:00'), Timestamp('2012-07-30 00:00:00'), Timestamp('2012-07-31 00:00:00'), Timestamp('2012-07-31 00:00:00'), Timestamp('2012-08-10 00:00:00'), Timestamp('2012-08-14 00:00:00'), Timestamp('2012-08-14 00:00:00'), Timestamp('2012-08-15 00:00:00'), Timestamp('2012-08-16 00:00:00'), Timestamp('2012-08-17 00:00:00'), Timestamp('2012-08-21 00:00:00'), Timestamp('2012-08-21 00:00:00'), Timestamp('2012-08-21 00:00:00'), Timestamp('2012-08-22 00:00:00'), Timestamp('2012-08-22 00:00:00'), Timestamp('2012-08-23 00:00:00')]


In [77]:
dates_list = concatenated_df['Date'].tolist()
print(dates_list[:20]) 

[Timestamp('2012-07-23 00:00:00'), Timestamp('2012-07-23 00:00:00'), Timestamp('2012-07-23 00:00:00'), Timestamp('2012-07-24 00:00:00'), Timestamp('2012-07-27 00:00:00'), Timestamp('2012-07-30 00:00:00'), Timestamp('2012-07-31 00:00:00'), Timestamp('2012-07-31 00:00:00'), Timestamp('2012-08-10 00:00:00'), Timestamp('2012-08-14 00:00:00'), Timestamp('2012-08-14 00:00:00'), Timestamp('2012-08-15 00:00:00'), Timestamp('2012-08-16 00:00:00'), Timestamp('2012-08-17 00:00:00'), Timestamp('2012-08-21 00:00:00'), Timestamp('2012-08-21 00:00:00'), Timestamp('2012-08-21 00:00:00'), Timestamp('2012-08-22 00:00:00'), Timestamp('2012-08-22 00:00:00'), Timestamp('2012-08-23 00:00:00')]


In [86]:
df_news

Unnamed: 0,Date,ticker,category,title,content,Open,High,Low,Close,Adj Close,Volume
1970-01-01 00:00:00.000015974,2012-07-23,AAPL,opinion,Trade Apple After Earnings,It may look like a spider web but the mishmas...,21.228571,21.639286,20.989643,21.565357,18.413187,487975600
1970-01-01 00:00:00.000015973,2012-07-23,AAPL,opinion,Apple Earnings Preview Quarterly Dip On Deck,Last quarter Apple AAPL reported the 2nd bes...,21.228571,21.639286,20.989643,21.565357,18.413187,487975600
1970-01-01 00:00:00.000015972,2012-07-23,AAPL,opinion,Summer Heat Scorches Europe And U S,Europe flares as summer heat continues Summer...,21.228571,21.639286,20.989643,21.565357,18.413187,487975600
1970-01-01 00:00:00.000015967,2012-07-24,AAPL,opinion,Market Bait And Switch,That is the sound we are going to hear soon fr...,21.692142,21.774286,21.375357,21.461430,18.324455,565132400
1970-01-01 00:00:00.000015968,2012-07-27,AAPL,opinion,Will AAPL Fall From The Tree,Apple s AAPL sales for the third quarter mis...,20.536072,20.922501,20.413929,20.898571,17.843870,403936400
...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 00:00:00.000000005,2020-01-27,AAPL,opinion,Top Stock Analyst Reports For Merck Broadcom ...,Monday January 27 2020The Zacks Research Dai...,77.514999,77.942497,76.220001,77.237503,75.793358,161940000
1970-01-01 00:00:00.000000004,2020-01-27,AAPL,opinion,Buy Surging Apple Microsoft Stock Before Qua...,On today s episode of Full Court Finance here ...,77.514999,77.942497,76.220001,77.237503,75.793358,161940000
1970-01-01 00:00:00.000000003,2020-01-27,AAPL,opinion,Apple Earnings Preview 5G Launch Expanding S...,Reports Q1 2020 results on Tuesday Jan 28 ...,77.514999,77.942497,76.220001,77.237503,75.793358,161940000
1970-01-01 00:00:00.000000002,2020-01-27,AAPL,opinion,7 Monster Stock Market Predictions For The Wee...,S P 500 SPY \nThis week will be packed with e...,77.514999,77.942497,76.220001,77.237503,75.793358,161940000


In [81]:
visualization_df

Unnamed: 0,Date,pos,neg,neu
0,2020-01-27,0.447279,0.165989,0.386732
1,2020-01-24,0.294453,0.045960,0.659587
2,2020-01-23,0.390813,0.117637,0.491550
3,2020-01-22,0.231352,0.243296,0.525352
4,2020-01-21,0.322451,0.191291,0.486257
...,...,...,...,...
1649,2012-07-24,0.031984,0.038037,0.929979
1650,2012-07-27,0.040444,0.418283,0.541273
1651,2012-07-30,0.198772,0.141263,0.659965
1652,2012-07-31,0.366279,0.026320,0.607401


In [101]:
main_df = pd.read_csv('data_with_finbert.csv')

In [125]:
concatenated_df_safe= df

In [129]:
final_df = pd.merge(concatenated_df_safe, visualization_df_safe, how='outer', on='Date')
#
            