---

### Import your Necessary Libraries

---

In [14]:
# Do I even need to say anything?
import pandas as pd
import numpy as np

# Plots for days
import plotly.express as px
import plotly.graph_objects as go

# Useful datetime functions
from datetime import date
from datetime import timedelta
from datetime import datetime

# Sentiment polarity analysis
from textblob import TextBlob 

# Save your models, reuse them
import pickle

# Fit, transform, all you want
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# The main powerhouse of our project
from openbb_terminal.sdk import openbb

---

### Let's get the Sentiment for your company

---

In [9]:
# Choose the ticker(s) to review.
# List several tickers here to build a larger dataframe.
terms = ["AAPL"]

if not terms:
    raise ValueError("`terms` must contain at least one ticker symbol")

# Collect one frame of headlines per ticker. The frames are concatenated in a
# single call after the loop instead of growing a DataFrame inside it, which
# avoids repeated copying and concatenating onto an empty starting frame.
news_frames = []
for term in terms:
    # openbb.news() returns recent articles for the ticker
    # (about 100 according to the original author's note; not verified here).
    news = openbb.news(term=term, sort="published")
    df = pd.DataFrame({'Term': term, 'Title': news['title'], 'Date': news['published'], 'Link': news['link']})
    # Reduce the publication timestamp to its calendar day ('YYYY-MM-DD' string for now).
    df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')
    news_frames.append(df)

# One row per article, re-indexed 0..n-1 across all tickers.
combined_df = pd.concat(news_frames, ignore_index=True)

# Cast the day strings back to datetime64 so later cells can do date arithmetic.
combined_df['Date'] = combined_df['Date'].astype('datetime64[ns]')

# This will give us the closing price for each line from the news pull
def get_closing_price(date, term):
    """Return the latest available close for ``term`` around ``date``.

    Prices are requested from ``openbb.stocks.load`` for a window that starts
    four days before ``date`` and ends one day after it. The extra lookback is
    there so a date without its own row (the original note mentions weekends)
    still resolves to a price. The extra day at the end is meant to include
    ``date`` itself; whether the loader treats the end date as inclusive is not
    verified here.

    Args:
        date: Reference date; must support +/- ``pd.DateOffset``.
        term: Ticker symbol passed to the loader as ``symbol``.

    Returns:
        The ``'Close'`` value of the last row in the loaded frame, or ``None``
        when the loader returns an empty frame.
    """
    lookback_start = date - pd.DateOffset(days=4)
    window_end = date + pd.DateOffset(days=1)
    prices = openbb.stocks.load(symbol=term, start_date=lookback_start, end_date=window_end)
    if prices.empty:
        return None
    return prices['Close'].iloc[-1]
    
# Add the 'Closing_Price' column to combined_df.
# Articles frequently share the same (Date, Term) pair, and every lookup is a
# separate price load, so each distinct pair is priced once and the result is
# mapped back onto its rows.
date_term_pairs = list(zip(combined_df['Date'], combined_df['Term']))
closing_by_pair = {
    pair: get_closing_price(*pair)
    for pair in dict.fromkeys(date_term_pairs)  # distinct pairs, first-seen order
}
# float64 turns a missing price (None) into NaN so later arithmetic stays numeric.
combined_df['Closing_Price'] = pd.Series(
    [closing_by_pair[pair] for pair in date_term_pairs],
    index=combined_df.index,
    dtype='float64',
)

# get_closing_price 7 days later function
def get_closing_price_7d(date, term):
    """Return the close for ``term`` roughly one week after ``date``.

    Prices are requested from ``openbb.stocks.load`` for the window starting
    at ``date`` and ending eight days later, and the ``'Close'`` of the last
    returned row is used.

    NOTE(review): for dates less than a week old the last row is presumably
    just the most recent close, not a price seven days out. Confirm whether
    that is intended.

    Args:
        date: Reference date; must support + ``pd.DateOffset``.
        term: Ticker symbol passed to the loader as ``symbol``.

    Returns:
        The ``'Close'`` value of the last row in the loaded frame, or ``None``
        when the loader returns an empty frame.
    """
    window_start = date
    window_end = date + pd.DateOffset(days=8)
    prices = openbb.stocks.load(symbol=term, start_date=window_start, end_date=window_end)
    if prices.empty:
        return None
    return prices['Close'].iloc[-1]
    
# Add the 'Closing_Price_7D' column to combined_df.
# Articles frequently share the same (Date, Term) pair, and every lookup is a
# separate price load, so each distinct pair is priced once and the result is
# mapped back onto its rows.
date_term_pairs_7d = list(zip(combined_df['Date'], combined_df['Term']))
closing_7d_by_pair = {
    pair: get_closing_price_7d(*pair)
    for pair in dict.fromkeys(date_term_pairs_7d)  # distinct pairs, first-seen order
}
# float64 turns a missing price (None) into NaN so later arithmetic stays numeric.
combined_df['Closing_Price_7D'] = pd.Series(
    [closing_7d_by_pair[pair] for pair in date_term_pairs_7d],
    index=combined_df.index,
    dtype='float64',
)

# Label every headline with a sentiment class derived from TextBlob polarity.
def _polarity_to_label(polarity):
    """Map a polarity score to 'positive', 'negative' or 'neutral'."""
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'


# One label per row of combined_df, in row order.
sentiment_labels = [
    _polarity_to_label(TextBlob(title).sentiment.polarity)
    for title in combined_df['Title']
]
combined_df['Sentiment'] = sentiment_labels

# Price change between the day of the headline and the 7-day-later close.
combined_df['Difference'] = combined_df['Closing_Price_7D'].sub(combined_df['Closing_Price'])
combined_df

Unnamed: 0,Term,Title,Date,Link,Closing_Price,Closing_Price_7D,Sentiment,Difference
0,AAPL,Unusual Call Option Trade in Apple (AAPL) Wort...,2023-04-18,https://news.google.com/rss/articles/CBMiV2h0d...,166.470001,166.470001,positive,0.000000
1,AAPL,Recent SellOff of Apple AAPL Stock by Congress...,2023-04-18,https://news.google.com/rss/articles/CBMiRWh0d...,166.470001,166.470001,positive,0.000000
2,AAPL,Apple's Next Growth Chapter Is Here - The Motl...,2023-04-18,https://news.google.com/rss/articles/CBMiTWh0d...,166.470001,166.470001,positive,0.000000
3,AAPL,Analysts are Upbeat About Apple’s (NASDAQ:AAPL...,2023-04-18,https://news.google.com/rss/articles/CBMiYmh0d...,166.470001,166.470001,neutral,0.000000
4,AAPL,AAPL Stock Alert: How to Earn 4.15% With Apple...,2023-04-18,https://news.google.com/rss/articles/CBMiYWh0d...,166.470001,166.470001,neutral,0.000000
...,...,...,...,...,...,...,...,...
497,TSCO,Tractor Supply leans on higher pricing to offs...,2022-07-21,https://news.google.com/rss/articles/CBMiZmh0d...,198.419998,190.809998,positive,-7.610001
498,TSCO,Tractor Supply Stock: Strong Growth Momentum (...,2022-06-20,https://news.google.com/rss/articles/CBMiTmh0d...,188.729996,202.509995,positive,13.779999
499,TSCO,Tractor Supply: Making Of Successful Retail Co...,2022-04-18,https://news.google.com/rss/articles/CBMiZGh0d...,217.529999,209.960007,positive,-7.569992
500,TSCO,Tractor Supply: Great Company But Overvalued (...,2022-04-03,https://news.google.com/rss/articles/CBMiVWh0d...,226.710007,236.380005,positive,9.669998


---

### Single out the most frequent instance of 'Sentiment' by 'Date'

---

In [10]:
# Count how many headlines fall into each (Date, Term, Sentiment) combination.
sentiment_counts = (
    combined_df
    .groupby(['Date', 'Term', 'Sentiment'])
    .size()
    .reset_index(name='Count')
)

# Row label of the most frequent sentiment within each (Date, Term) group.
idx = sentiment_counts.groupby(['Date', 'Term'])['Count'].idxmax()

# Keep only those winning rows, ordered chronologically.
sentiment_max_count = (
    sentiment_counts
    .loc[idx, ['Date', 'Term', 'Sentiment']]
    .sort_values(by='Date', ascending=True)
)

In [11]:
# This will give us the closing price for each line from the news pull
def get_closing_price(date, term):
    """Return the latest available close for ``term`` around ``date``.

    This redefines the helper of the same name from the earlier cell, using a
    three-day lookback where the earlier one uses four. Prices are requested
    from ``openbb.stocks.load`` for a window from three days before ``date``
    to one day after it.

    Args:
        date: Reference date; must support +/- ``pd.DateOffset``.
        term: Ticker symbol passed to the loader as ``symbol``.

    Returns:
        The ``'Close'`` value of the last row in the loaded frame, or ``None``
        when the loader returns an empty frame.
    """
    lookback_start = date - pd.DateOffset(days=3)
    window_end = date + pd.DateOffset(days=1)
    prices = openbb.stocks.load(symbol=term, start_date=lookback_start, end_date=window_end)
    if prices.empty:
        return None
    return prices['Close'].iloc[-1]
    
# Add the 'Closing_Price' column to sentiment_max_count (one lookup per row).
closing_prices = sentiment_max_count.apply(
    lambda record: get_closing_price(record['Date'], record['Term']),
    axis=1,
)
sentiment_max_count['Closing_Price'] = closing_prices

# get_closing_price 7 days later function
def get_closing_price_7d(date, term):
    """Return the close for ``term`` roughly one week after ``date``.

    This redefines the helper of the same name from the earlier cell. Here the
    window starts three days before ``date`` where the earlier one starts at
    ``date`` itself, and it ends eight days after ``date``. The ``'Close'`` of
    the last row returned by ``openbb.stocks.load`` is used.

    NOTE(review): for dates less than a week old the last row is presumably
    just the most recent close, not a price seven days out. Confirm whether
    that is intended.

    Args:
        date: Reference date; must support +/- ``pd.DateOffset``.
        term: Ticker symbol passed to the loader as ``symbol``.

    Returns:
        The ``'Close'`` value of the last row in the loaded frame, or ``None``
        when the loader returns an empty frame.
    """
    window_start = date - pd.DateOffset(days=3)
    window_end = date + pd.DateOffset(days=8)
    prices = openbb.stocks.load(symbol=term, start_date=window_start, end_date=window_end)
    if prices.empty:
        return None
    return prices['Close'].iloc[-1]
    
# Add the 'Closing_Price_7D' column to sentiment_max_count (one lookup per row).
closing_prices_7d = sentiment_max_count.apply(
    lambda record: get_closing_price_7d(record['Date'], record['Term']),
    axis=1,
)
sentiment_max_count['Closing_Price_7D'] = closing_prices_7d

# Price change per date: 'Closing_Price_7D' minus 'Closing_Price'.
sentiment_max_count['Difference'] = sentiment_max_count['Closing_Price_7D'].sub(
    sentiment_max_count['Closing_Price']
)
sentiment_max_count

Unnamed: 0,Date,Term,Sentiment,Closing_Price,Closing_Price_7D,Difference
0,2021-01-16,TSCO,neutral,157.479996,155.619995,-1.860001
1,2022-04-03,TSCO,positive,226.710007,236.380005,9.669998
2,2022-04-18,TSCO,positive,217.529999,209.960007,-7.569992
3,2022-06-20,TSCO,positive,188.729996,202.509995,13.779999
4,2022-07-21,TSCO,positive,198.419998,190.809998,-7.610001
...,...,...,...,...,...,...
326,2023-04-18,GOOG,neutral,105.120003,105.120003,0.000000
329,2023-04-18,MSFT,neutral,288.369995,288.369995,0.000000
322,2023-04-18,AAPL,positive,166.470001,166.470001,0.000000
323,2023-04-18,AMZN,neutral,102.300003,102.300003,0.000000


In [17]:
# Build the cut-off as pandas Timestamps: tomorrow minus 31 days, i.e. 30 days before today.
end_date = pd.Timestamp(date.today()) + pd.DateOffset(days=1)
start_date = end_date - pd.DateOffset(days=31)

# Keep only the rows dated on or after the cut-off.
is_recent = sentiment_max_count['Date'] >= start_date
last_30_days_df = sentiment_max_count.loc[is_recent]
last_30_days_df

Unnamed: 0,Date,Term,Sentiment,Closing_Price,Closing_Price_7D,Difference
138,2023-03-19,GOOG,positive,102.459999,106.059998,3.599998
144,2023-03-20,MSFT,neutral,272.230011,276.380005,4.149994
142,2023-03-20,GOOG,negative,101.930000,103.059998,1.129997
146,2023-03-20,TSCO,neutral,232.190002,229.770004,-2.419998
139,2023-03-20,AAPL,neutral,157.399994,158.279999,0.880005
...,...,...,...,...,...,...
326,2023-04-18,GOOG,neutral,105.120003,105.120003,0.000000
329,2023-04-18,MSFT,neutral,288.369995,288.369995,0.000000
322,2023-04-18,AAPL,positive,166.470001,166.470001,0.000000
323,2023-04-18,AMZN,neutral,102.300003,102.300003,0.000000


In [18]:
# One line per ticker showing the price difference for each date in the last-30-days frame.
fig = px.line(last_30_days_df, x='Date', y='Difference', color='Term', labels={'Difference': 'Difference'})

# Set plot title (typo 'Differene' corrected)
fig.update_layout(title='Difference in Price by Stock Ticker')

fig.show()

---

### Let's Build a Model, Why Not?

---

In [12]:
# Hold out 30% of the headlines for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    combined_df['Title'],
    combined_df['Sentiment'],
    test_size=0.3,
    random_state=50,
)

# Turn headlines into word-count features; the vocabulary comes from the training split only.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit a logistic regression classifier on those features.
clf = LogisticRegression()
clf.fit(X_train_vec, y_train)

# Persist the fitted model and vectorizer for later reuse.
for artifact_path, artifact in (('model.pkl', clf), ('vectorizer.pkl', vectorizer)):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# Score the held-out split; the per-class metrics are macro-averaged.
y_pred = clf.predict(X_test_vec)

macro_avg = {'average': 'macro'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **macro_avg)
recall = recall_score(y_test, y_pred, **macro_avg)
f1 = f1_score(y_test, y_pred, **macro_avg)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_name}: {metric_value}")

Accuracy: 0.7350993377483444
Precision: 0.5813953488372093
Recall: 0.5308857808857809
F1-score: 0.5398391812865497


In [13]:
# Tally the predicted labels on the test split and report the most common one.
# Note: this rebinds `sentiment_counts`, which an earlier cell used for the
# per-(Date, Term, Sentiment) counts frame.
sentiment_counts = pd.Series(y_pred).value_counts()
general_sentiment = sentiment_counts.idxmax()
print(f"The general sentiment for {terms} is:", general_sentiment)
print(sentiment_counts)

There general sentiment for ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'TSCO'] is: neutral
neutral     104
positive     43
negative      4
dtype: int64
