In [1]:
from newsapi import NewsApiClient
import os
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd

In [2]:
local_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(local_dir)
api_env_filepath = os.path.join(parent_dir,'api.env')
load_dotenv(Path(api_env_filepath))

newsapi = NewsApiClient(api_key=os.getenv('NEWSAPI_API_KEY'))

In [9]:
top_us_headlines = newsapi.get_top_headlines(
    country='us',
    category='business',
    language='en',
    page_size=100
)

top_titles = []
for i in top_us_headlines['articles']:
    top_titles.append(i['title'])

TypeError: expected string or bytes-like object, got 'NoneType'

In [4]:
top_titles[0:10]

NameError: name 'top_titles' is not defined

In [5]:
df = pd.read_csv('../Data/sp500.csv')

In [152]:
df_copy = df.copy()

In [153]:
df_copy.head()

Unnamed: 0,Symbol,Name,Sector
0,MMM,3M,Industrials
1,AOS,A.O. Smith,Industrials
2,ABT,Abbott Labs,Health Care
3,ABT,Abbott,Health Care
4,ABBV,AbbVie,Health Care


In [154]:
stock_alias_set = set()

for index,row in df_copy.iterrows():
    stock_alias_set.add(row['Name'].lower())

In [159]:
text = 'Many Americans are still shying away from EVs despite Bidens push, an AP-NORC/EPIC poll finds'
kw_extractor = KeywordExtractor()

keywords = kw_extractor.extract_keywords(text)

print(keywords)
print('\n')

found_keyword = False

for k in keywords:
    cur_keyword = k[0].lower()
    if cur_keyword in stock_alias_set:
        found_keyword = True
        print(cur_keyword)

if not found_keyword:
    print('Overall Economy')

[('EPIC poll finds', 0.004390086440759893), ('Bidens push', 0.023458380875189744), ('EPIC poll', 0.026233073037508336), ('EVs despite Bidens', 0.04498862876540802), ('poll finds', 0.04940384002065631), ('EPIC', 0.08596317751626563), ('Americans', 0.1447773057422032), ('Bidens', 0.1447773057422032), ('push', 0.15831692877998726), ('AP-NORC', 0.15831692877998726), ('finds', 0.15831692877998726), ('shying', 0.29736558256021506), ('EVs', 0.29736558256021506), ('poll', 0.29736558256021506)]


Overall Economy


In [3]:
# pip install transformers
# pip install fast_ml
# pip install scipy
# pip install matplotlib
# pip install seaborn
# pip install scikit-learn
# pip install tensorflow
# pip install torch
# pip install datasets
import numpy as np
import pandas as pd
from fast_ml.model_development import train_valid_test_split
from transformers import Trainer, TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import nn
from torch.nn.functional import softmax
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import datasets

In [10]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device Available: {DEVICE}')

Device Available: cuda


In [11]:
sentiment_data = pd.read_csv('../Data/headline_sentiment_data.csv',encoding="ISO-8859-1")
sentiment_data.head()

Unnamed: 0,Sentiment,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [12]:
encoding_map = {'negative':0,'neutral':1,'positive':2}
sentiment_data['Sentiment'] = sentiment_data['Sentiment'].map(encoding_map)
sentiment_data.head()

Unnamed: 0,Sentiment,Text
0,1,"According to Gran , the company has no plans t..."
1,1,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...


In [13]:
(train_texts, train_labels, val_texts, val_labels, test_texts, test_labels) = train_valid_test_split(sentiment_data, target='Sentiment', train_size=0.8, valid_size=0.1, test_size=0.1)

train_texts = train_texts['Text'].to_list()
train_labels = train_labels.to_list()
val_texts = val_texts['Text'].to_list()
val_labels = val_labels.to_list()
test_texts = test_texts['Text'].to_list()
test_labels = test_labels.to_list()

In [14]:
class DataLoader(torch.utils.data.Dataset):
    def __init__(self, sentences=None, labels=None):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

        if bool(sentences):
            self.encodings = self.tokenizer(self.sentences,truncation=True,padding=True)

    def __getitem__(self,idx):
        item = {key: torch.tensor(val[idx]) for key,val in self.encodings.items()}
        
        if self.labels == None:
            item['labels'] = None
        else:
            item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.sentences)

    def encode(self,x):
        return self.tokenizer(x, return_tensors='pt').to(DEVICE)

In [15]:
train_dataset = DataLoader(train_texts,train_labels)
val_dataset = DataLoader(val_texts,val_labels)
test_dataset = DataLoader(test_texts,test_labels)

print(train_dataset.__getitem__(0))

{'input_ids': tensor([  101,  1996,  5096,  3976,  2001,  2025, 21362,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [16]:
f1 = datasets.load_metric('f1')
accuracy = datasets.load_metric('accuracy')
precision = datasets.load_metric('precision')
recall = datasets.load_metric('recall')

def compute_metrics(eval_pred):
    metrics_dict = {}
    predictions, labels = eval_pred
    predictions = np.argmax(predictions,axis=1)
    
    metrics_dict.update(f1.compute(predictions = predictions, references = labels, average = 'macro'))
    metrics_dict.update(accuracy.compute(predictions = predictions, references = labels))
    metrics_dict.update(precision.compute(predictions = predictions, references = labels, average = 'macro'))
    metrics_dict.update(recall.compute(predictions = predictions, references = labels, average = 'macro'))
    return metrics_dict

In [17]:
id2label_mapping = {0:'negative',1:'neutral',2:'positive'}
label2id_mapping = {'negative':0,'neutral':1,'positive':2}

config = AutoConfig.from_pretrained('distilbert/distilbert-base-uncased',
                                    num_labels=3,
                                    id2label=id2label_mapping,
                                    label2id=label2id_mapping)

model = AutoModelForSequenceClassification.from_config(config)

In [18]:
training_args = TrainingArguments(
    output_dir='../Data',
    num_train_epochs=20,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.05,
    report_to='none',
    evaluation_strategy='steps',
    logging_dir='../Data')

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [20]:
trainer.train()

  4%|▍         | 50/1220 [00:21<08:02,  2.42it/s]

{'loss': 1.0003, 'grad_norm': 1.7670480012893677, 'learning_rate': 5e-06, 'epoch': 0.82}


                                                 
  4%|▍         | 50/1220 [00:21<08:02,  2.42it/s]

{'eval_loss': 0.9308741688728333, 'eval_f1': 0.24345549738219896, 'eval_accuracy': 0.5752577319587628, 'eval_precision': 0.1917525773195876, 'eval_recall': 0.3333333333333333, 'eval_runtime': 0.613, 'eval_samples_per_second': 791.211, 'eval_steps_per_second': 13.051, 'epoch': 0.82}


  8%|▊         | 100/1220 [00:42<07:44,  2.41it/s]

{'loss': 0.9174, 'grad_norm': 1.5276681184768677, 'learning_rate': 1e-05, 'epoch': 1.64}


                                                  
  8%|▊         | 100/1220 [00:42<07:44,  2.41it/s]

{'eval_loss': 0.8993675112724304, 'eval_f1': 0.24345549738219896, 'eval_accuracy': 0.5752577319587628, 'eval_precision': 0.1917525773195876, 'eval_recall': 0.3333333333333333, 'eval_runtime': 0.5956, 'eval_samples_per_second': 814.252, 'eval_steps_per_second': 13.431, 'epoch': 1.64}


 12%|█▏        | 150/1220 [01:03<07:26,  2.40it/s]

{'loss': 0.8719, 'grad_norm': 5.961867332458496, 'learning_rate': 1.5e-05, 'epoch': 2.46}


                                                  
 12%|█▏        | 150/1220 [01:04<07:26,  2.40it/s]

{'eval_loss': 0.8299526572227478, 'eval_f1': 0.3966144200626959, 'eval_accuracy': 0.6247422680412371, 'eval_precision': 0.38092290459978484, 'eval_recall': 0.42246598189442963, 'eval_runtime': 0.6045, 'eval_samples_per_second': 802.267, 'eval_steps_per_second': 13.233, 'epoch': 2.46}


 16%|█▋        | 200/1220 [01:24<07:05,  2.40it/s]

{'loss': 0.8074, 'grad_norm': 2.9171369075775146, 'learning_rate': 2e-05, 'epoch': 3.28}


                                                  
 16%|█▋        | 200/1220 [01:25<07:05,  2.40it/s]

{'eval_loss': 0.8442608118057251, 'eval_f1': 0.49809288329212426, 'eval_accuracy': 0.6494845360824743, 'eval_precision': 0.5991497421018698, 'eval_recall': 0.502248320037543, 'eval_runtime': 0.6019, 'eval_samples_per_second': 805.832, 'eval_steps_per_second': 13.292, 'epoch': 3.28}


 20%|██        | 250/1220 [01:46<06:41,  2.41it/s]

{'loss': 0.726, 'grad_norm': 2.9626550674438477, 'learning_rate': 2.5e-05, 'epoch': 4.1}


                                                  
 20%|██        | 250/1220 [01:46<06:41,  2.41it/s]

{'eval_loss': 0.7470545172691345, 'eval_f1': 0.45193737614803825, 'eval_accuracy': 0.6783505154639176, 'eval_precision': 0.42696034787545645, 'eval_recall': 0.4806315299928636, 'eval_runtime': 0.6071, 'eval_samples_per_second': 798.875, 'eval_steps_per_second': 13.177, 'epoch': 4.1}


 24%|██▎       | 288/1220 [02:02<06:31,  2.38it/s]

KeyboardInterrupt: 

In [22]:
eval_results = trainer.predict(test_dataset)

100%|██████████| 8/8 [00:24<00:00,  3.07s/it]


In [23]:
print(eval_results.metrics)

{'test_loss': 1.1363548040390015, 'test_f1': 0.6646779487159359, 'test_accuracy': 0.7402061855670103, 'test_precision': 0.6903979488660985, 'test_recall': 0.6463129796463131, 'test_runtime': 28.4618, 'test_samples_per_second': 17.04, 'test_steps_per_second': 0.281}


In [59]:
trainer.save_model('../Models/sentiment_model')

In [60]:
model_path = '../Models/sentiment_model'

In [34]:
class SentimentModel():
    def __init__(self, model_path):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(DEVICE)

        args = TrainingArguments(output_dir='../Data/results', per_device_eval_batch_size=64)
        self.batch_model = Trainer(model = self.model, args=args)
        self.single_dataloader = DataLoader()

    def batch_predict_proba(self,x):
        predictions = self.batch_model.predict(DataLoader(x))
        logits = torch.from_numpy(predictions.predictions)

        if DEVICE == 'cpu':
            proba = torch.nn.functional.softmax(logits,dim=1).detach().numpy()
        else:
            proba = torch.nn.functional.softmax(logits,dim='1').to('cpu').detach().numpy()

        return proba

    def predict_proba(self,x):
        x = self.single_dataloader.encode(x).to(DEVICE)
        predictions = self.model(**x)
        logits = predictions.logits

        if DEVICE == 'cpu':
            proba = torch.nn.functional.softmax(logits,dim=1).detach().numpy()
        else:
            proba = torch.nn.functional.softmax(logits,dim='1').to('cpu').detach().numpy()

        return proba


In [8]:
top_titles[0:5]

['Why Is Shari Redstone so Upset? - Hollywood Reporter',
 "Doctors couldn't help, so they turned to unregulated home health tests - The Washington Post - The Washington Post",
 "The No. 1 benefit that keeps people happy at work, says exec in Finland: ‘It's not about the hours—it's about the results’ - CNBC",
 'Warren Buffett says this public speaking class changed his life—4 tips from the course - CNBC',
 "Super commuter touts 4-state-long commute to NYC: I'm making 'Manhattan money' with low living costs - Fox Business"]

In [14]:
new_headlines = pd.DataFrame(top_titles, columns=['headlines'])
new_headlines.head()
len(new_headlines)

63

In [16]:
batch_sentences = new_headlines.sample(n=63,random_state=1)['headlines'].to_list()
single_sentence = new_headlines.sample(n=1,random_state=1)['headlines'].to_list()[0]

In [27]:
sentiment_model = SentimentModel('../Models/sentiment_model')

In [33]:
single_sentence_probas = sentiment_model.predict_proba(single_sentence)
id2label = sentiment_model.model.config.id2label
predicted_class_label = id2label[np.argmax(single_sentence_probas)]

print(predicted_class_label)

RuntimeError: Placeholder storage has not been allocated on MPS device!