In [2]:
from newsapi import NewsApiClient
import os
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd


In [3]:
local_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(local_dir)
api_env_filepath = os.path.join(parent_dir,'api.env')
load_dotenv(Path(api_env_filepath))

newsapi = NewsApiClient(api_key=os.getenv('NEWSAPI_API_KEY'))

In [4]:
top_us_headlines = newsapi.get_top_headlines(
    country='us',
    category='business',
    language='en',
    page_size=100
)

top_titles = []
for i in top_us_headlines['articles']:
    top_titles.append(i['title'])

In [5]:
top_titles[0:10]

['Why Is Shari Redstone so Upset? - Hollywood Reporter',
 "Doctors couldn't help, so they turned to unregulated home health tests - The Washington Post - The Washington Post",
 "The No. 1 benefit that keeps people happy at work, says exec in Finland: ‘It's not about the hours—it's about the results’ - CNBC",
 'Warren Buffett says this public speaking class changed his life—4 tips from the course - CNBC',
 "Super commuter touts 4-state-long commute to NYC: I'm making 'Manhattan money' with low living costs - Fox Business",
 'Forget Nvidia: 2 Artificial Intelligence (AI) Stocks to Buy Now - Yahoo Finance',
 'NVIDIA Computex 2024 Recap : Rubin GPU Architecture Unveiled Along With Huge RTX AI PC Developments - Wccftech',
 'This Week on Crypto Twitter: Roaring Kitty Shows His Face, Gets Rekt - Decrypt',
 'How to break bad money habits and achieve your goals, from a financial psychologist: First, understand how brains are wired - CNBC',
 "Amazon's 50 Hottest Summer Fashion Arrivals Include D

In [151]:
df = pd.read_csv('../Data/sp500.csv')

In [152]:
df_copy = df.copy()

In [153]:
df_copy.head()

Unnamed: 0,Symbol,Name,Sector
0,MMM,3M,Industrials
1,AOS,A.O. Smith,Industrials
2,ABT,Abbott Labs,Health Care
3,ABT,Abbott,Health Care
4,ABBV,AbbVie,Health Care


In [154]:
stock_alias_set = set()

for index,row in df_copy.iterrows():
    stock_alias_set.add(row['Name'].lower())

In [159]:
text = 'Many Americans are still shying away from EVs despite Bidens push, an AP-NORC/EPIC poll finds'
kw_extractor = KeywordExtractor()

keywords = kw_extractor.extract_keywords(text)

print(keywords)
print('\n')

found_keyword = False

for k in keywords:
    cur_keyword = k[0].lower()
    if cur_keyword in stock_alias_set:
        found_keyword = True
        print(cur_keyword)

if not found_keyword:
    print('Overall Economy')

[('EPIC poll finds', 0.004390086440759893), ('Bidens push', 0.023458380875189744), ('EPIC poll', 0.026233073037508336), ('EVs despite Bidens', 0.04498862876540802), ('poll finds', 0.04940384002065631), ('EPIC', 0.08596317751626563), ('Americans', 0.1447773057422032), ('Bidens', 0.1447773057422032), ('push', 0.15831692877998726), ('AP-NORC', 0.15831692877998726), ('finds', 0.15831692877998726), ('shying', 0.29736558256021506), ('EVs', 0.29736558256021506), ('poll', 0.29736558256021506)]


Overall Economy


In [162]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re

In [173]:
sentiment_data = pd.read_csv('../Data/headline_sentiment_data.csv',encoding="ISO-8859-1")

In [203]:
sentiment_data.head()

Unnamed: 0,Sentiment,Headline
0,0,according to gran the company has no plans to ...
1,0,technopolis plans to develop in stages an area...
2,-1,the international electronic industry company ...
3,1,with the new production plant the company woul...
4,1,according to the company s updated strategy fo...


In [175]:
def preprocess_text(text):
    text = re.sub(r'\W',' ',text) # Remove non-word chars
    text = re.sub(r'\s+',' ',text) # Remove extra spaces
    text = text.lower()
    return text

In [177]:
sentiment_data['Headline'] = sentiment_data['Headline'].apply(preprocess_text)

# Convert sentiment labels to numerical values
sentiment_mapping = {'neutral':0,'positive':1,'negative':-1}
sentiment_data['Sentiment'] = sentiment_data['Sentiment'].map(sentiment_mapping)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    sentiment_data['Headline'], sentiment_data['Sentiment'], test_size=0.2, random_state=42
)

# Feature Extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words=stopwords.words('english'))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [179]:
# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions on test set
y_pred = model.predict(X_test_tfidf)

In [180]:
# Evaluate Model
print(f'Accuracy: {accuracy_score(y_test,y_pred)}')
print(classification_report(y_test,y_pred, target_names=['negative','neutral','positive']))

Accuracy: 0.743298969072165
              precision    recall  f1-score   support

    negative       0.82      0.43      0.56       110
     neutral       0.73      0.95      0.82       571
    positive       0.78      0.46      0.58       289

    accuracy                           0.74       970
   macro avg       0.78      0.61      0.66       970
weighted avg       0.75      0.74      0.72       970



In [199]:
def predict_sentiment(new_headline):
    # Preprocess headline
    preprocessed_headline = preprocess_text(new_headline)
    
    # Transform the sentece using the fitted TfidfVectorizer
    headline_tfidf = vectorizer.transform([preprocessed_headline])

    # Predict sentiment
    sentiment = model.predict(headline_tfidf)[0]

    # Map numerical output
    sentiment_mapping = {0:'neutral',1:'positive',-1:'negative'}

    return sentiment_mapping[sentiment]


In [1]:
headline = 'Apple stock price increases 50% overnight.'
predicted_sentiment = predict_sentiment(headline)
print(headline)
print(f'Predicted Sentiment: {predicted_sentiment}')

NameError: name 'predict_sentiment' is not defined

In [21]:
# pip install transformers
# pip install fast_ml
# pip install scipy
# pip install matplotlib
# pip install seaborn
# pip install scikit-learn
# pip install tensorflow
import numpy as np
import pandas as pd
from fast_ml.model_development import train_valid_test_split
from transformers import Trainer, TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import nn
from torch.nn.functional import softmax
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import datasets

2024-06-10 09:41:25.151287: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device Available: {DEVICE}')

Device Available: cpu


In [38]:
sentiment_data = pd.read_csv('../Data/headline_sentiment_data.csv',encoding="ISO-8859-1")
sentiment_data.head()

Unnamed: 0,Sentiment,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [39]:
encoding_map = {'negative':0,'neutral':1,'positive':2}
sentiment_data['Sentiment'] = sentiment_data['Sentiment'].map(encoding_map)
sentiment_data.head()

Unnamed: 0,Sentiment,Text
0,1,"According to Gran , the company has no plans t..."
1,1,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...


In [40]:
(train_texts, train_labels, val_texts, val_labels, test_texts, test_labels) = train_valid_test_split(sentiment_data, target='Sentiment', train_size=0.8, valid_size=0.1, test_size=0.1)

train_texts = train_texts['Text'].to_list()
train_labels = train_labels.to_list()
val_texts = val_texts['Text'].to_list()
val_labels = val_labels.to_list()
test_texts = test_texts['Text'].to_list()
test_labels = test_labels.to_list()

In [41]:
class DataLoader(torch.utils.data.Dataset):
    def __init__(self, sentences=None, labels=None):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

        if bool(sentences):
            self.encodings = self.tokenizer(self.sentences,truncation=True,padding=True)

    def __getitem__(self,idx):
        item = {key: torch.tensor(val[idx]) for key,val in self.encodings.items()}
        
        if self.labels == None:
            item['labels'] = None
        else:
            item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.sentences)

    def encode(self,x):
        return self.tokenizer(x, return_tensors='pt').to(DEVICE)

In [42]:
train_dataset = DataLoader(train_texts,train_labels)
val_dataset = DataLoader(val_texts,val_labels)
test_dataset = DataLoader(test_texts,test_labels)

print(train_dataset.__getitem__(0))

{'input_ids': tensor([  101,  1999,  3607,  1010, 15547, 20763,  1005,  1055,  2833,  2407,
         1005,  1055,  2188,  3006, 14082,  2035,  1996,  2126,  2000, 19163,
        20984, 16033,  2243,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [43]:
f1 = datasets.load_metric('f1')
accuracy = datasets.load_metric('accuracy')
precision = datasets.load_metric('precision')
recall = datasets.load_metric('recall')

def compute_metrics(eval_pred):
    metrics_dict = {}
    predictions, labels = eval_pred
    predictions = np.argmax(predictions,axis=1)
    
    metrics_dict.update(f1.compute(predictions = predictions, references = labels, average = 'macro'))
    metrics_dict.update(accuracy.compute(predictions = predictions, references = labels))
    metrics_dict.update(precision.compute(predictions = predictions, references = labels, average = 'macro'))
    metrics_dict.update(recall.compute(predictions = predictions, references = labels, average = 'macro'))
    return metrics_dict

In [44]:
id2label_mapping = {0:'negative',1:'neutral',2:'positive'}
label2id_mapping = {'negative':0,'neutral':1,'positive':2}

config = AutoConfig.from_pretrained('distilbert/distilbert-base-uncased',
                                    num_labels=3,
                                    id2label=id2label_mapping,
                                    label2id=label2id_mapping)

model = AutoModelForSequenceClassification.from_config(config)

In [53]:
training_args = TrainingArguments(
    output_dir='../Data',
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.05,
    report_to='none',
    evaluation_strategy='steps',
    logging_dir='../Data',
    logging_steps=50)

In [54]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [55]:
trainer.train()

  0%|          | 0/610 [00:27<?, ?it/s]
  0%|          | 0/610 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 5.11 GB, other allocations: 1.59 GB, max allowed: 6.77 GB). Tried to allocate 112.50 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [57]:
eval_results = trainer.predict(test_dataset)

100%|██████████| 8/8 [00:05<00:00,  1.59it/s]


In [58]:
print(eval_results.metrics)

{'test_loss': 1.0112673044204712, 'test_f1': 0.609729830968769, 'test_accuracy': 0.6701030927835051, 'test_precision': 0.6633630267634162, 'test_recall': 0.5947935224753224, 'test_runtime': 6.4273, 'test_samples_per_second': 75.46, 'test_steps_per_second': 1.245}


In [59]:
trainer.save_model('../Models/sentiment_model')

In [60]:
model_path = '../Models/sentiment_model'

In [34]:
class SentimentModel():
    def __init__(self, model_path):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(DEVICE)

        args = TrainingArguments(output_dir='../Data/results', per_device_eval_batch_size=64)
        self.batch_model = Trainer(model = self.model, args=args)
        self.single_dataloader = DataLoader()

    def batch_predict_proba(self,x):
        predictions = self.batch_model.predict(DataLoader(x))
        logits = torch.from_numpy(predictions.predictions)

        if DEVICE == 'cpu':
            proba = torch.nn.functional.softmax(logits,dim=1).detach().numpy()
        else:
            proba = torch.nn.functional.softmax(logits,dim='1').to('cpu').detach().numpy()

        return proba

    def predict_proba(self,x):
        x = self.single_dataloader.encode(x).to(DEVICE)
        predictions = self.model(**x)
        logits = predictions.logits

        if DEVICE == 'cpu':
            proba = torch.nn.functional.softmax(logits,dim=1).detach().numpy()
        else:
            proba = torch.nn.functional.softmax(logits,dim='1').to('cpu').detach().numpy()

        return proba


In [8]:
top_titles[0:5]

['Why Is Shari Redstone so Upset? - Hollywood Reporter',
 "Doctors couldn't help, so they turned to unregulated home health tests - The Washington Post - The Washington Post",
 "The No. 1 benefit that keeps people happy at work, says exec in Finland: ‘It's not about the hours—it's about the results’ - CNBC",
 'Warren Buffett says this public speaking class changed his life—4 tips from the course - CNBC',
 "Super commuter touts 4-state-long commute to NYC: I'm making 'Manhattan money' with low living costs - Fox Business"]

In [14]:
new_headlines = pd.DataFrame(top_titles, columns=['headlines'])
new_headlines.head()
len(new_headlines)

63

In [16]:
batch_sentences = new_headlines.sample(n=63,random_state=1)['headlines'].to_list()
single_sentence = new_headlines.sample(n=1,random_state=1)['headlines'].to_list()[0]

In [27]:
sentiment_model = SentimentModel('../Models/sentiment_model')

In [33]:
single_sentence_probas = sentiment_model.predict_proba(single_sentence)
id2label = sentiment_model.model.config.id2label
predicted_class_label = id2label[np.argmax(single_sentence_probas)]

print(predicted_class_label)

RuntimeError: Placeholder storage has not been allocated on MPS device!