In [28]:
from newsapi import NewsApiClient
import os
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd

In [29]:
local_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(local_dir)
api_env_filepath = os.path.join(parent_dir,'api.env')
load_dotenv(Path(api_env_filepath))

newsapi = NewsApiClient(api_key=os.getenv('NEWSAPI_API_KEY'))

In [30]:
top_us_headlines = newsapi.get_top_headlines(
    country='us',
    category='business',
    language='en',
    page_size=100
)

top_titles = []
for i in top_us_headlines['articles']:
    top_titles.append(i['title'])

In [31]:
top_titles[0:10]

["Elon Musk warns his companies would ban Apple devices after OpenAI deal: 'Unacceptable security violation' - New York Post ",
 "krispy kreme stock pops on truist upgrade for mcdonald's deal - Quartz",
 'Dozens Arrested as Activists Target Citigroup for Bankrolling Climate Breakdown - Common Dreams',
 'This is when you can expect mortgage rates to go down — and why you should buy before a recession hits - New York Post ',
 "World's largest Buc-ee's opens in Texas Hill Country - KHOU.com",
 'Elon Musk pay package: Lawyers who blocked it seek $5.6 billion - Fortune',
 "Paramount For Sale, Who's Buying? Edgar Bronfman Jr Latest To Eye Deal As Skydance Talks Continue - Deadline",
 'Chrysler recalls more than 211000 SUVs and pickup trucks due to software malfunction - The Associated Press',
 'UAW President Fain under investigation by federal court watchdog - Detroit News',
 'OpenAI hires new CFO and product chief, announces Apple deal to integrate ChatGPT - CNBC']

In [5]:
df = pd.read_csv('../Data/sp500.csv')

In [152]:
df_copy = df.copy()

In [153]:
df_copy.head()

Unnamed: 0,Symbol,Name,Sector
0,MMM,3M,Industrials
1,AOS,A.O. Smith,Industrials
2,ABT,Abbott Labs,Health Care
3,ABT,Abbott,Health Care
4,ABBV,AbbVie,Health Care


In [154]:
stock_alias_set = set()

for index,row in df_copy.iterrows():
    stock_alias_set.add(row['Name'].lower())

In [159]:
text = 'Many Americans are still shying away from EVs despite Bidens push, an AP-NORC/EPIC poll finds'
kw_extractor = KeywordExtractor()

keywords = kw_extractor.extract_keywords(text)

print(keywords)
print('\n')

found_keyword = False

for k in keywords:
    cur_keyword = k[0].lower()
    if cur_keyword in stock_alias_set:
        found_keyword = True
        print(cur_keyword)

if not found_keyword:
    print('Overall Economy')

[('EPIC poll finds', 0.004390086440759893), ('Bidens push', 0.023458380875189744), ('EPIC poll', 0.026233073037508336), ('EVs despite Bidens', 0.04498862876540802), ('poll finds', 0.04940384002065631), ('EPIC', 0.08596317751626563), ('Americans', 0.1447773057422032), ('Bidens', 0.1447773057422032), ('push', 0.15831692877998726), ('AP-NORC', 0.15831692877998726), ('finds', 0.15831692877998726), ('shying', 0.29736558256021506), ('EVs', 0.29736558256021506), ('poll', 0.29736558256021506)]


Overall Economy


In [6]:
# pip install transformers
# pip install fast_ml
# pip install scipy
# pip install matplotlib
# pip install seaborn
# pip install scikit-learn
# pip install tensorflow
# pip install torch
# pip install datasets
import numpy as np
import pandas as pd
from fast_ml.model_development import train_valid_test_split
from transformers import Trainer, TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import nn
from torch.nn.functional import softmax
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import datasets

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device Available: {DEVICE}')

Device Available: cuda


In [3]:
sentiment_data = pd.read_csv('../Data/headline_sentiment_data.csv',encoding="ISO-8859-1")
sentiment_data.head()

Unnamed: 0,Sentiment,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [5]:
print(sentiment_data['Sentiment'].value_counts().get('neutral',0))
print(sentiment_data['Sentiment'].value_counts().get('positive',0))
print(sentiment_data['Sentiment'].value_counts().get('negative',0))

2879
1363
604


In [7]:
# Because of an unbalance dataset lets upsample the negative and positive
df_neutral = sentiment_data[sentiment_data['Sentiment'] == 'neutral']
df_positive = sentiment_data[sentiment_data['Sentiment'] == 'positive']
df_negative = sentiment_data[sentiment_data['Sentiment'] == 'negative' ]

df_positive_upsampled = resample(df_positive,
                                 replace=True,
                                 n_samples=2800,
                                 random_state=42)

df_negative_upsampled = resample(df_negative,
                                 replace=True,
                                 n_samples=2800,
                                 random_state=42)

df_balanced = pd.concat([df_neutral, df_positive_upsampled, df_negative_upsampled])

In [9]:
print(df_balanced['Sentiment'].value_counts().get('neutral',0))
print(df_balanced['Sentiment'].value_counts().get('positive',0))
print(df_balanced['Sentiment'].value_counts().get('negative',0))

2879
2800
2800


In [10]:
sentiment_data = df_balanced.copy()

In [12]:
encoding_map = {'negative':0,'neutral':1,'positive':2}
sentiment_data['Sentiment'] = sentiment_data['Sentiment'].map(encoding_map)
sentiment_data.head()

Unnamed: 0,Sentiment,Text
0,1,"According to Gran , the company has no plans t..."
1,1,Technopolis plans to develop in stages an area...
58,1,At the request of Finnish media company Alma M...
59,1,"In Sweden , Gallerix accumulated SEK denominat..."
60,1,The company supports its global customers in d...


In [14]:
(train_texts, train_labels, val_texts, val_labels, test_texts, test_labels) = train_valid_test_split(sentiment_data, target='Sentiment', train_size=0.8, valid_size=0.1, test_size=0.1)

train_texts = train_texts['Text'].to_list()
train_labels = train_labels.to_list()
val_texts = val_texts['Text'].to_list()
val_labels = val_labels.to_list()
test_texts = test_texts['Text'].to_list()
test_labels = test_labels.to_list()

In [15]:
class DataLoader(torch.utils.data.Dataset):
    def __init__(self, sentences=None, labels=None):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

        if bool(sentences):
            self.encodings = self.tokenizer(self.sentences,truncation=True,padding=True)

    def __getitem__(self,idx):
        item = {key: torch.tensor(val[idx]) for key,val in self.encodings.items()}
        
        if self.labels == None:
            item['labels'] = None
        else:
            item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.sentences)

    def encode(self,x):
        return self.tokenizer(x, return_tensors='pt').to(DEVICE)

In [16]:
train_dataset = DataLoader(train_texts,train_labels)
val_dataset = DataLoader(val_texts,val_labels)
test_dataset = DataLoader(test_texts,test_labels)

print(train_dataset.__getitem__(0))

{'input_ids': tensor([ 101, 1996, 6023, 2415, 2097, 2031, 1037, 3539, 3295, 2157, 2279, 2000,
        1996, 2026, 9215, 5311, 2080, 6005, 2276, 1012,  102,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      

In [17]:
f1 = datasets.load_metric('f1')
accuracy = datasets.load_metric('accuracy')
precision = datasets.load_metric('precision')
recall = datasets.load_metric('recall')

def compute_metrics(eval_pred):
    metrics_dict = {}
    predictions, labels = eval_pred
    predictions = np.argmax(predictions,axis=1)
    
    metrics_dict.update(f1.compute(predictions = predictions, references = labels, average = 'macro'))
    metrics_dict.update(accuracy.compute(predictions = predictions, references = labels))
    metrics_dict.update(precision.compute(predictions = predictions, references = labels, average = 'macro'))
    metrics_dict.update(recall.compute(predictions = predictions, references = labels, average = 'macro'))
    return metrics_dict

In [18]:
id2label_mapping = {0:'negative',1:'neutral',2:'positive'}
label2id_mapping = {'negative':0,'neutral':1,'positive':2}

config = AutoConfig.from_pretrained('distilbert/distilbert-base-uncased',
                                    num_labels=3,
                                    id2label=id2label_mapping,
                                    label2id=label2id_mapping)

model = AutoModelForSequenceClassification.from_config(config)

In [19]:
training_args = TrainingArguments(
    output_dir='../Data',
    num_train_epochs=25,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.05,
    report_to='none',
    evaluation_strategy='steps',
    logging_dir='../Data',
    logging_steps=500)

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [21]:
trainer.train()

 19%|█▉        | 1000/5300 [05:36<24:48,  2.89it/s]

{'loss': 0.4117, 'grad_norm': 0.2548915147781372, 'learning_rate': 4.4791666666666673e-05, 'epoch': 9.43}


                                                   
 19%|█▉        | 1000/5300 [05:38<24:48,  2.89it/s]

{'eval_loss': 0.4058915376663208, 'eval_f1': 0.9116654084687351, 'eval_accuracy': 0.9127358490566038, 'eval_precision': 0.9146710327170114, 'eval_recall': 0.9136509766281032, 'eval_runtime': 2.0899, 'eval_samples_per_second': 405.759, 'eval_steps_per_second': 6.699, 'epoch': 9.43}


 38%|███▊      | 2000/5300 [11:18<18:39,  2.95it/s]  

{'loss': 0.0244, 'grad_norm': 3.962609052658081, 'learning_rate': 3.4375e-05, 'epoch': 18.87}


                                                   
 38%|███▊      | 2000/5300 [11:20<18:39,  2.95it/s]

{'eval_loss': 0.4130553603172302, 'eval_f1': 0.9290539060554682, 'eval_accuracy': 0.9292452830188679, 'eval_precision': 0.929125816993464, 'eval_recall': 0.9297223444601759, 'eval_runtime': 2.0513, 'eval_samples_per_second': 413.399, 'eval_steps_per_second': 6.825, 'epoch': 18.87}


 57%|█████▋    | 3000/5300 [16:56<12:47,  3.00it/s]  

{'loss': 0.0072, 'grad_norm': 0.004137137439101934, 'learning_rate': 2.3958333333333334e-05, 'epoch': 28.3}


                                                   
 57%|█████▋    | 3000/5300 [16:58<12:47,  3.00it/s]

{'eval_loss': 0.5794388651847839, 'eval_f1': 0.9143182464119913, 'eval_accuracy': 0.9150943396226415, 'eval_precision': 0.9161480481001889, 'eval_recall': 0.9158911199972789, 'eval_runtime': 1.9987, 'eval_samples_per_second': 424.269, 'eval_steps_per_second': 7.004, 'epoch': 28.3}


 75%|███████▌  | 4000/5300 [22:34<07:12,  3.00it/s]

{'loss': 0.0025, 'grad_norm': 0.003968705888837576, 'learning_rate': 1.3541666666666666e-05, 'epoch': 37.74}


                                                   
 75%|███████▌  | 4000/5300 [22:36<07:12,  3.00it/s]

{'eval_loss': 0.6357601881027222, 'eval_f1': 0.9227875088920543, 'eval_accuracy': 0.9233490566037735, 'eval_precision': 0.9243681889123464, 'eval_recall': 0.9240676432947699, 'eval_runtime': 2.0003, 'eval_samples_per_second': 423.94, 'eval_steps_per_second': 6.999, 'epoch': 37.74}


 94%|█████████▍| 5000/5300 [28:12<01:39,  3.00it/s]

{'loss': 0.0006, 'grad_norm': 0.001157345250248909, 'learning_rate': 3.125e-06, 'epoch': 47.17}


                                                   
 94%|█████████▍| 5000/5300 [28:14<01:39,  3.00it/s]

{'eval_loss': 0.6455320715904236, 'eval_f1': 0.9144477733704672, 'eval_accuracy': 0.9150943396226415, 'eval_precision': 0.9160650511660027, 'eval_recall': 0.9158537842744593, 'eval_runtime': 2.0004, 'eval_samples_per_second': 423.918, 'eval_steps_per_second': 6.999, 'epoch': 47.17}


100%|██████████| 5300/5300 [29:55<00:00,  2.95it/s]

{'train_runtime': 1795.3614, 'train_samples_per_second': 188.903, 'train_steps_per_second': 2.952, 'train_loss': 0.08423925535331639, 'epoch': 50.0}





TrainOutput(global_step=5300, training_loss=0.08423925535331639, metrics={'train_runtime': 1795.3614, 'train_samples_per_second': 188.903, 'train_steps_per_second': 2.952, 'total_flos': 1.0529793621396e+16, 'train_loss': 0.08423925535331639, 'epoch': 50.0})

In [22]:
eval_results = trainer.predict(test_dataset)

100%|██████████| 14/14 [00:00<00:00, 14.53it/s]


In [23]:
print(eval_results.metrics)

{'test_loss': 0.7060199975967407, 'test_f1': 0.9152165828679655, 'test_accuracy': 0.9150943396226415, 'test_precision': 0.915450819140749, 'test_recall': 0.9161262559013298, 'test_runtime': 1.0508, 'test_samples_per_second': 807.019, 'test_steps_per_second': 13.323}


In [24]:
trainer.save_model('../Models/sentiment_model')

In [25]:
model_path = '../Models/sentiment_model'

In [26]:
class SentimentModel():
    def __init__(self, model_path):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(DEVICE)

        args = TrainingArguments(output_dir='../Data/results', per_device_eval_batch_size=64)
        self.batch_model = Trainer(model = self.model, args=args)
        self.single_dataloader = DataLoader()

    def batch_predict_proba(self,x):
        predictions = self.batch_model.predict(DataLoader(x))
        logits = torch.from_numpy(predictions.predictions)

        if DEVICE == 'cpu':
            proba = torch.nn.functional.softmax(logits,dim=1).detach().numpy()
        else:
            proba = torch.nn.functional.softmax(logits,dim=1).to('cpu').detach().numpy()

        return proba

    def predict_proba(self,x):
        x = self.single_dataloader.encode(x).to(DEVICE)
        predictions = self.model(**x)
        logits = predictions.logits

        if DEVICE == 'cpu':
            proba = torch.nn.functional.softmax(logits,dim=1).detach().numpy()
        else:
            proba = torch.nn.functional.softmax(logits,dim=1).to('cpu').detach().numpy()

        return proba


In [32]:
top_titles[0:5]

["Elon Musk warns his companies would ban Apple devices after OpenAI deal: 'Unacceptable security violation' - New York Post ",
 "krispy kreme stock pops on truist upgrade for mcdonald's deal - Quartz",
 'Dozens Arrested as Activists Target Citigroup for Bankrolling Climate Breakdown - Common Dreams',
 'This is when you can expect mortgage rates to go down — and why you should buy before a recession hits - New York Post ',
 "World's largest Buc-ee's opens in Texas Hill Country - KHOU.com"]

In [33]:
new_headlines = pd.DataFrame(top_titles, columns=['headlines'])
new_headlines.head()
len(new_headlines)

60

In [34]:
batch_sentences = new_headlines.sample(n=len(new_headlines),random_state=1)['headlines'].to_list()
single_sentence = new_headlines.sample(n=1,random_state=1)['headlines'].to_list()[0]

In [35]:
sentiment_model = SentimentModel('../Models/sentiment_model')

In [36]:
import re

In [37]:
single_sentence = re.sub(r'[^A-Za-z]+',' ',single_sentence)

In [38]:
single_sentence

'National chain files for bankruptcy after missteps closing multiple Ohio restaurants NOW'

In [39]:
single_sentence_probas = sentiment_model.predict_proba(single_sentence)
id2label = sentiment_model.model.config.id2label
predicted_class_label = id2label[np.argmax(single_sentence_probas)]

print(predicted_class_label)

neutral


In [47]:
batch_sentence_probas = sentiment_model.batch_predict_proba(batch_sentences)
predicted_class_labels = [id2label[i] for i in np.argmax(batch_sentence_probas, axis = -1)]

100%|██████████| 1/1 [00:00<00:00, 1023.00it/s]


In [48]:
for i,j in zip(batch_sentences,predicted_class_labels):
    print(f'sentiment: {j} - {i}')

sentiment: neutral - National chain files for bankruptcy after missteps; closing multiple Ohio restaurants - 614NOW
sentiment: negative - Steve Jobs screamed advice at Starbucks CEO, who wishes he'd listened - AppleInsider
sentiment: neutral - Dozens Arrested as Activists Target Citigroup for Bankrolling Climate Breakdown - Common Dreams
sentiment: neutral - Frontier hackers threaten to release private data for at least 750000 customers - The Verge
sentiment: neutral - Economist Harry Dent predicts stock market crash worse than 2008 crisis: The ‘bubble of all bubbles’ - Fox Business
sentiment: neutral - 2 Artificial Intelligence (AI) Semiconductor Stocks That Could Join Nvidia in the $1 Trillion Club - The Motley Fool
sentiment: neutral - Why is it so hard to get a table at Carbone in NYC? - Time Out
sentiment: neutral - Walmart condemns New York State push to require panic buttons in stores, report says - CNYcentral.com
sentiment: neutral - Volvo suddenly shifts EX30, EX90 production 