In [1]:
from newsapi import NewsApiClient
import os
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd

In [2]:
local_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(local_dir)
api_env_filepath = os.path.join(parent_dir,'api.env')
load_dotenv(Path(api_env_filepath))

newsapi = NewsApiClient(api_key=os.getenv('NEWSAPI_API_KEY'))

In [3]:
top_us_headlines = newsapi.get_top_headlines(
    country='us',
    category='business',
    language='en',
    page_size=100
)

top_titles = []
for i in top_us_headlines['articles']:
    top_titles.append(i['title'])

In [4]:
top_titles[0:10]

["GM's Cruise names former Amazon, Microsoft Xbox executive as new CEO - CNBC",
 'VW will invest $5 billion in Rivian as part of new EV joint venture - The Verge',
 'Bitcoin price sees oversold bounce to $62K but will it hold? - Cointelegraph',
 "What's Going On With Walmart And Target Corp Stock On Tuesday? - Yahoo Finance",
 'Ford F-150 and Tesla Cybertruck models recalled - The Washington Post',
 'Tesla recalls thousands of Cybertrucks over windshield wiper, trunk bed trim issues - Fox Business',
 "Here's how much profit McDonald’s makes on its new $5 meal - MarketWatch",
 "Delta's new JFK luxury lounge offers fine dining, massages and showers - New York Post ",
 "A look at Perkins' rebranding efforts - Nation's Restaurant News",
 'Hooters store closings: List of 41 locations closed in 2024 - WTHR']

In [5]:
df = pd.read_csv('../Data/sp500.csv')

In [152]:
df_copy = df.copy()

In [153]:
df_copy.head()

Unnamed: 0,Symbol,Name,Sector
0,MMM,3M,Industrials
1,AOS,A.O. Smith,Industrials
2,ABT,Abbott Labs,Health Care
3,ABT,Abbott,Health Care
4,ABBV,AbbVie,Health Care


In [154]:
stock_alias_set = set()

for index,row in df_copy.iterrows():
    stock_alias_set.add(row['Name'].lower())

In [159]:
text = 'Many Americans are still shying away from EVs despite Bidens push, an AP-NORC/EPIC poll finds'
kw_extractor = KeywordExtractor()

keywords = kw_extractor.extract_keywords(text)

print(keywords)
print('\n')

found_keyword = False

for k in keywords:
    cur_keyword = k[0].lower()
    if cur_keyword in stock_alias_set:
        found_keyword = True
        print(cur_keyword)

if not found_keyword:
    print('Overall Economy')

[('EPIC poll finds', 0.004390086440759893), ('Bidens push', 0.023458380875189744), ('EPIC poll', 0.026233073037508336), ('EVs despite Bidens', 0.04498862876540802), ('poll finds', 0.04940384002065631), ('EPIC', 0.08596317751626563), ('Americans', 0.1447773057422032), ('Bidens', 0.1447773057422032), ('push', 0.15831692877998726), ('AP-NORC', 0.15831692877998726), ('finds', 0.15831692877998726), ('shying', 0.29736558256021506), ('EVs', 0.29736558256021506), ('poll', 0.29736558256021506)]


Overall Economy


In [24]:
# pip install transformers
# pip install fast_ml
# pip install scipy
# pip install matplotlib
# pip install seaborn
# pip install scikit-learn
# pip install tensorflow
# pip install torch
# pip install datasets
import numpy as np
import pandas as pd
from fast_ml.model_development import train_valid_test_split
from transformers import Trainer, TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import nn
from torch.nn.functional import softmax
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import datasets
import re
import pickle

In [6]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device Available: {DEVICE}')

Device Available: cuda


In [33]:
#sentiment_data = pd.read_csv('../Data/headline_sentiment_data.csv',encoding="ISO-8859-1")
sentiment_data = pd.read_csv('../Data/SEN_en_AMT_nooutlier.csv',encoding="ISO-8859-1",index_col=0)
sentiment_data.drop(labels=['entity'],axis=1,inplace=True)
sentiment_data.rename(columns={'headline':'text',
                               'majority_label':'Sentiment'},
                               inplace=True)
sentiment_data['Sentiment'] = sentiment_data['Sentiment'].str.lower()
sentiment_data.head()

Unnamed: 0,text,Sentiment
0,Russia and Poland Feud Over Putin Remarks on W...,negative
1,Trump Moves to Lift Visa Restrictions on Polis...,neutral
2,Trump-Backed U.S.-British Trade Deal Faces Hur...,neutral
3,Trumpâs Arrival in London Brings Controversy...,negative
4,Trump Steers Clear of War Footing Toward Iran,neutral


In [28]:
def clean_headline(headline):
    # Remove non-alphabetic characters
    headline = re.sub(r'[^A-Za-z\s]', '', headline)

    # Convert to lowercase
    headline = headline.lower()

    # Remove extra whitespace
    headline = re.sub(r'\s+', ' ', headline).strip()

    return headline

In [34]:
sentiment_data['text'] = sentiment_data['text'].apply(clean_headline)
sentiment_data.head()

Unnamed: 0,text,Sentiment
0,russia and poland feud over putin remarks on w...,negative
1,trump moves to lift visa restrictions on polis...,neutral
2,trumpbacked usbritish trade deal faces hurdles,neutral
3,trumps arrival in london brings controversy bu...,negative
4,trump steers clear of war footing toward iran,neutral


In [35]:
print(sentiment_data['Sentiment'].value_counts().get('neutral',0))
print(sentiment_data['Sentiment'].value_counts().get('positive',0))
print(sentiment_data['Sentiment'].value_counts().get('negative',0))

509
267
568


### Either sample down or sample up
- Sampling up didn't seem to work as well as intented
- The model metrics came out great but the how it scored the new data wasn't the best

In [40]:
# Because of an unbalance dataset lets upsample the positive
df_neutral = sentiment_data[sentiment_data['Sentiment'] == 'neutral']
df_positive = sentiment_data[sentiment_data['Sentiment'] == 'positive']
df_negative = sentiment_data[sentiment_data['Sentiment'] == 'negative' ]

df_neutral_upsampled = resample(df_neutral,
                                replace=True,
                                n_samples=sentiment_data['Sentiment'].value_counts().get('negative',0),
                                random_state=42)

df_positive_upsampled = resample(df_positive,
                                 replace=True,
                                 n_samples=sentiment_data['Sentiment'].value_counts().get('negative',0),
                                 random_state=42)

df_balanced = pd.concat([df_neutral_upsampled, df_positive_upsampled, df_negative])

In [7]:
# Because of an unbalance dataset lets upsample the negative and positive
df_neutral = sentiment_data[sentiment_data['Sentiment'] == 'neutral']
df_positive = sentiment_data[sentiment_data['Sentiment'] == 'positive']
df_negative = sentiment_data[sentiment_data['Sentiment'] == 'negative' ]

df_positive_upsampled = resample(df_positive,
                                 replace=True,
                                 n_samples=2800,
                                 random_state=42)

df_negative_upsampled = resample(df_negative,
                                 replace=True,
                                 n_samples=2800,
                                 random_state=42)

df_balanced = pd.concat([df_neutral, df_positive_upsampled, df_negative_upsampled])

In [41]:
print(df_balanced['Sentiment'].value_counts().get('neutral',0))
print(df_balanced['Sentiment'].value_counts().get('positive',0))
print(df_balanced['Sentiment'].value_counts().get('negative',0))

568
568
568


In [42]:
sentiment_data = df_balanced.copy()

In [43]:
encoding_map = {'negative':0,'neutral':1,'positive':2}
sentiment_data['Sentiment'] = sentiment_data['Sentiment'].map(encoding_map)
sentiment_data.head()

Unnamed: 0,text,Sentiment
238,iowa democrats release partial caucus results ...,1
1084,mitt romney and susan collins sound reluctant ...,1
873,the finance trump claimed in the state of the ...,1
651,sanders and buttigieg in tight race for delegates,1
244,us judge denied trump adviser stones request f...,1


In [45]:
(train_texts, train_labels, val_texts, val_labels, test_texts, test_labels) = train_valid_test_split(sentiment_data, target='Sentiment', train_size=0.8, valid_size=0.1, test_size=0.1)

train_texts = train_texts['text'].to_list()
train_labels = train_labels.to_list()
val_texts = val_texts['text'].to_list()
val_labels = val_labels.to_list()
test_texts = test_texts['text'].to_list()
test_labels = test_labels.to_list()

In [25]:
class DataLoader(torch.utils.data.Dataset):
    def __init__(self, sentences=None, labels=None):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

        if bool(sentences):
            self.encodings = self.tokenizer(self.sentences,truncation=True,padding=True)

    def __getitem__(self,idx):
        item = {key: torch.tensor(val[idx]) for key,val in self.encodings.items()}
        
        if self.labels == None:
            item['labels'] = None
        else:
            item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.sentences)

    def encode(self,x):
        return self.tokenizer(x, return_tensors='pt').to(DEVICE)

In [47]:
train_dataset = DataLoader(train_texts,train_labels)
val_dataset = DataLoader(val_texts,val_labels)
test_dataset = DataLoader(test_texts,test_labels)

print(train_dataset.__getitem__(0))

{'input_ids': tensor([  101,  2004,  8398, 16889,  2015,  4518,  3006,  8037,  2391,  2000,
         8084,  3667,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor(2)}


In [48]:
f1 = datasets.load_metric('f1')
accuracy = datasets.load_metric('accuracy')
precision = datasets.load_metric('precision')
recall = datasets.load_metric('recall')

def compute_metrics(eval_pred):
    metrics_dict = {}
    predictions, labels = eval_pred
    predictions = np.argmax(predictions,axis=1)
    
    metrics_dict.update(f1.compute(predictions = predictions, references = labels, average = 'macro'))
    metrics_dict.update(accuracy.compute(predictions = predictions, references = labels))
    metrics_dict.update(precision.compute(predictions = predictions, references = labels, average = 'macro'))
    metrics_dict.update(recall.compute(predictions = predictions, references = labels, average = 'macro'))
    return metrics_dict

In [49]:
id2label_mapping = {0:'negative',1:'neutral',2:'positive'}
label2id_mapping = {'negative':0,'neutral':1,'positive':2}

config = AutoConfig.from_pretrained('distilbert/distilbert-base-uncased',
                                    num_labels=3,
                                    id2label=id2label_mapping,
                                    label2id=label2id_mapping)

model = AutoModelForSequenceClassification.from_config(config)

In [51]:
training_args = TrainingArguments(
    output_dir='../Data',
    num_train_epochs=50,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    report_to='none',
    evaluation_strategy='steps',
    logging_dir='../Data',
    logging_steps=500)

In [52]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [53]:
trainer.train()

 45%|████▌     | 500/1100 [00:54<01:06,  9.05it/s]

{'loss': 0.5227, 'grad_norm': 3.5233986377716064, 'learning_rate': 5e-05, 'epoch': 22.73}


                                                  
 45%|████▌     | 500/1100 [00:55<01:06,  9.05it/s]

{'eval_loss': 1.5102787017822266, 'eval_f1': 0.6641928837650762, 'eval_accuracy': 0.6529411764705882, 'eval_precision': 0.6657148221158359, 'eval_recall': 0.6716273849607183, 'eval_runtime': 0.1824, 'eval_samples_per_second': 932.228, 'eval_steps_per_second': 16.451, 'epoch': 22.73}


 91%|█████████ | 1000/1100 [01:50<00:11,  9.03it/s]

{'loss': 0.0766, 'grad_norm': 0.13429714739322662, 'learning_rate': 8.333333333333334e-06, 'epoch': 45.45}


                                                   
 91%|█████████ | 1000/1100 [01:50<00:11,  9.03it/s]

{'eval_loss': 1.701771855354309, 'eval_f1': 0.7011595818185756, 'eval_accuracy': 0.6941176470588235, 'eval_precision': 0.7010613207547168, 'eval_recall': 0.709719416386083, 'eval_runtime': 0.1227, 'eval_samples_per_second': 1385.553, 'eval_steps_per_second': 24.451, 'epoch': 45.45}


100%|██████████| 1100/1100 [02:02<00:00,  8.98it/s]

{'train_runtime': 122.5452, 'train_samples_per_second': 556.121, 'train_steps_per_second': 8.976, 'train_loss': 0.27634824536063457, 'epoch': 50.0}





TrainOutput(global_step=1100, training_loss=0.27634824536063457, metrics={'train_runtime': 122.5452, 'train_samples_per_second': 556.121, 'train_steps_per_second': 8.976, 'total_flos': 599503287634200.0, 'train_loss': 0.27634824536063457, 'epoch': 50.0})

In [54]:
eval_results = trainer.predict(test_dataset)

100%|██████████| 3/3 [00:00<00:00, 33.25it/s]


In [55]:
print(eval_results.metrics)

{'test_loss': 1.2834534645080566, 'test_f1': 0.7801165979383802, 'test_accuracy': 0.7894736842105263, 'test_precision': 0.7801904027467937, 'test_recall': 0.7808167141500474, 'test_runtime': 0.1252, 'test_samples_per_second': 1365.814, 'test_steps_per_second': 23.962}


In [56]:
trainer.save_model('../Models/sentiment_model')

In [57]:
model_path = '../Models/sentiment_model'

In [58]:
class SentimentModel():
    def __init__(self, model_path):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(DEVICE)

        args = TrainingArguments(output_dir='../Data/results', per_device_eval_batch_size=64)
        self.batch_model = Trainer(model = self.model, args=args)
        self.single_dataloader = DataLoader()

    def batch_predict_proba(self,x):
        predictions = self.batch_model.predict(DataLoader(x))
        logits = torch.from_numpy(predictions.predictions)

        if DEVICE == 'cpu':
            proba = torch.nn.functional.softmax(logits,dim=1).detach().numpy()
        else:
            proba = torch.nn.functional.softmax(logits,dim=1).to('cpu').detach().numpy()

        return proba

    def predict_proba(self,x):
        x = self.single_dataloader.encode(x).to(DEVICE)
        predictions = self.model(**x)
        logits = predictions.logits

        if DEVICE == 'cpu':
            proba = torch.nn.functional.softmax(logits,dim=1).detach().numpy()
        else:
            proba = torch.nn.functional.softmax(logits,dim=1).to('cpu').detach().numpy()

        return proba

    def save_model(self, file_path):
        with open(file_path, 'wb') as file:
            pickle.dump(self, file)

    @staticmethod
    def load_model(file_path):
        with open(file_path, 'rb') as file:
            return pickle.load(file)

In [59]:
top_titles[0:5]

["GM's Cruise names former Amazon, Microsoft Xbox executive as new CEO - CNBC",
 'VW will invest $5 billion in Rivian as part of new EV joint venture - The Verge',
 'Bitcoin price sees oversold bounce to $62K but will it hold? - Cointelegraph',
 "What's Going On With Walmart And Target Corp Stock On Tuesday? - Yahoo Finance",
 'Ford F-150 and Tesla Cybertruck models recalled - The Washington Post']

In [60]:
new_headlines = pd.DataFrame(top_titles, columns=['headlines'])
new_headlines.head()
len(new_headlines)

45

In [61]:
batch_sentences = new_headlines.sample(n=len(new_headlines),random_state=1)['headlines'].to_list()
single_sentence = new_headlines.sample(n=1,random_state=1)['headlines'].to_list()[0]

In [62]:
sentiment_model = SentimentModel('../Models/sentiment_model')

In [63]:
single_sentence = clean_headline(single_sentence)
single_sentence

'whats going on with walmart and target corp stock on tuesday yahoo finance'

In [64]:
single_sentence_probas = sentiment_model.predict_proba(single_sentence)
id2label = sentiment_model.model.config.id2label
predicted_class_label = id2label[np.argmax(single_sentence_probas)]

print(predicted_class_label)

neutral


In [65]:
batch_sentence_probas = sentiment_model.batch_predict_proba(batch_sentences)
predicted_class_labels = [id2label[i] for i in np.argmax(batch_sentence_probas, axis = -1)]

100%|██████████| 1/1 [00:00<00:00, 663.76it/s]


In [66]:
for i,j in zip(batch_sentences,predicted_class_labels):
    print(f'sentiment: {j} - {i}')

sentiment: neutral - What's Going On With Walmart And Target Corp Stock On Tuesday? - Yahoo Finance
sentiment: negative - Bitcoin price sees oversold bounce to $62K but will it hold? - Cointelegraph
sentiment: negative - China's premier slams trade tensions as EV exports are hit by tariffs - The Associated Press
sentiment: negative - Elon Musk welcomes third child with Neuralink executive - Yahoo! Voices
sentiment: negative - NJ Transit, Amtrak experience delays at Penn Station New York due to disabled train - WABC-TV
sentiment: neutral - Nvidia Is No Longer the Most Valuable Company in the World. Here's What Investors Need to Know. - The Motley Fool
sentiment: negative - UAB Health System Authority to acquire Ascension St. Vincent's - University of Alabama at Birmingham
sentiment: negative - Dow closes at a one-month high; Nvidia drops - Yahoo Finance
sentiment: negative - Treasury Secretary Janet Yellen announces $100M affordable housing fund - Star Tribune
sentiment: neutral - Chipo