# 0. Setups

In [1]:
# Colab helpers: `drive` exposes Google Drive, `userdata` exposes the notebook secrets
import os
from google.colab import drive, userdata

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Switch into the project directory, whose path is stored in the CURRENT_DIR secret
project_dir = userdata.get('CURRENT_DIR')
os.chdir(project_dir)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
# FAANG tickers, listed by hand; the order here is the order in which the
# download loops below process them
ticker_list = [
    "META",
    "AAPL",
    "AMZN",
    "NFLX",
    "GOOG",
]

# 1. News dataset collection w/ Polygon API

## 1.1. Directly through Polygon API

In [None]:
# Install (or upgrade to the latest release of) the Polygon REST client,
# which is imported below as `polygon`
!pip install -U polygon-api-client

Collecting polygon-api-client
  Downloading polygon_api_client-1.14.4-py3-none-any.whl.metadata (952 bytes)
Collecting websockets<15.0,>=10.3 (from polygon-api-client)
  Downloading websockets-14.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading polygon_api_client-1.14.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading websockets-14.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (169 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.9/169.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: websockets, polygon-api-client
  Attempting uninstall: websockets
    Found existing installation: websockets 15.0.1
    Uninstalling websockets-15.0.1:
      Successfully uninstalled websockets-15.0.1
Succe

In [None]:
# Read the Polygon API key from the Colab secrets (`userdata`) so that it is
# never hard-coded in the notebook
polygon_api = userdata.get('POLYGON_API')

In [None]:
# Create a Polygon REST client authenticated with the API key; it is used
# below to query the news of each ticker
from polygon import RESTClient
client = RESTClient(api_key=polygon_api)

In [None]:
import time

len_ticker_list = len(ticker_list)

# Every news piece retrieved for every ticker, in retrieval order
news_pieces = []

# Rate-limit bookkeeping: number of API calls counted in the current window
# and the time at which that window started
api_calls = 0
ticker_loop_start = time.time()
api_first_called_at = time.time()

for ticker in ticker_list:
    news_loop_start = time.time()
    rest_time = 0 # Total seconds spent resting while processing this ticker

    num_news_of_ticker = 0
    api_calls += 1 # A new call for each ticker

    # 1 API call == 1000 news retrieved
    for news_piece in client.list_ticker_news(
        ticker=ticker,
        limit="1000",
        order="desc",
        sort="published_utc",
        published_utc_lte="2025-03-28", # Before or equal to this date
    ):
        news_pieces.append(news_piece)
        num_news_of_ticker += 1

        # Increment api_calls every 1000 news pieces
        # NOTE: this also fires at exactly 5000 pieces, right before the break
        # below, although no further page is requested; the counter therefore
        # over-counts by one per full ticker, which only makes the loop rest
        # slightly earlier than strictly necessary
        if (num_news_of_ticker % 1000 == 0):
            api_calls += 1

        # Retrieve a maximum of 5000 news pieces per ticker
        if num_news_of_ticker == 5000:
            break

        # When the number of API calls reached 5, let the loop rest until 1 minute
        if (api_calls == 5):
            api_calls = 0
            # Clamp at 0: if more than a minute has already passed since the
            # window started, there is nothing left to wait for, and
            # time.sleep() raises ValueError on a negative duration
            pause = max(0.0, 60 - (time.time() - api_first_called_at))
            print(f"API starts resting for {pause:.3f} seconds")
            time.sleep(pause)
            # Accumulate, so that several rests within one ticker are all
            # excluded from the per-ticker timing printed below
            rest_time += pause
            api_first_called_at = time.time()


    # Print ticker statistics (time spent resting is excluded)
    print(f"Ticker {ticker} done in {time.time() - news_loop_start - rest_time:.3f} seconds")

# The entire loop should take at most a bit more than 7 min to finish
print(f"Total time elapsed: {time.time() - ticker_loop_start:.3f} seconds")

API starts resting for 58.085 seconds
Ticker META done in 2.384 seconds
API starts resting for 57.760 seconds
Ticker AAPL done in 2.749 seconds
API starts resting for 57.952 seconds
Ticker AMZN done in 2.438 seconds
API starts resting for 58.165 seconds
Ticker NFLX done in 2.010 seconds
API starts resting for 57.594 seconds
Ticker GOOG done in 3.106 seconds
Total time elapsed: 302.247 seconds


In [None]:
# Build a DataFrame from the collected news pieces and preview its first rows
news_df = pd.DataFrame(news_pieces)
news_df.head()

Unnamed: 0,amp_url,article_url,author,description,id,image_url,insights,keywords,published_utc,publisher,tickers,title
0,,https://www.fool.com/investing/2025/03/27/3-re...,Prosper Junior Bakiny,Meta Platforms (META) looks attractive to buy ...,ea9f02fdd9e9cd36a97e6360d7bd93b376c930b12db8ba...,https://g.foolcdn.com/editorial/images/812053/...,"[{'sentiment': 'positive', 'sentiment_reasonin...","[Meta Platforms, AI, social media, valuation]",2025-03-27T10:30:00Z,{'favicon_url': 'https://s3.polygon.io/public/...,"[META, AMZN]",3 Reasons to Buy Meta Platforms Stock Hand Ove...
1,,https://www.globenewswire.com/news-release/202...,,Meta is emerging as a significant player in AI...,d45060b66f112ef1fc0c3cec93c58ddc8f85560d1092b5...,https://www.globenewswire.com/news-release/202...,"[{'sentiment': 'positive', 'sentiment_reasonin...","[Machine Learning, AI, Generative AI, FAANG, I...",2025-03-26T21:47:00Z,{'favicon_url': 'https://s3.polygon.io/public/...,[META],FAANG Machine Learning Engineer Course 2025 - ...
2,,https://www.fool.com/investing/2025/03/26/2-ma...,Neil Rozenbaum,The article discusses Meta Platforms and Alpha...,1c17009459b9172f431e7defbe382f4697756576bd7a90...,https://g.foolcdn.com/editorial/images/812555/...,"[{'sentiment': 'neutral', 'sentiment_reasoning...","[Meta Platforms, Alphabet, financial services,...",2025-03-26T15:28:18Z,{'favicon_url': 'https://s3.polygon.io/public/...,"[META, GOOG, GOOGL]",2 Magnificent Growth Stocks I'm Still Buying i...
3,,https://www.fool.com/investing/2025/03/26/my-t...,Manali Pradhan,The article discusses how the stock market dow...,75b24043f7acf4a5cd440e8cd1414bcfd3f0b5c4fddeb4...,https://g.foolcdn.com/editorial/images/812002/...,"[{'sentiment': 'positive', 'sentiment_reasonin...","[stock market, AI, Nvidia, Meta Platforms, Ora...",2025-03-26T14:15:00Z,{'favicon_url': 'https://s3.polygon.io/public/...,"[NVDA, META, ORCL]",My Top 3 Bargain AI Stocks to Buy after the St...
4,,https://www.fool.com/investing/2025/03/24/why-...,Johnny Rice,Meta Platforms is in talks with Reliance Indus...,cc1a9bf45d9ebe2bf4524bd287dea42c0cdf9d68b8e1fb...,https://g.foolcdn.com/editorial/images/808130/...,"[{'sentiment': 'positive', 'sentiment_reasonin...","[Meta Platforms, Reliance Industries, AI, India]",2025-03-24T21:40:16Z,{'favicon_url': 'https://s3.polygon.io/public/...,[META],Why Meta Platforms Stock Surged Today


From manually inspecting the data, I found that no news piece published before 2024-07-02 has annotated sentiments. Sentiment annotation might be a newer feature of the Polygon API.

In [None]:
# Render the cutoff date in the same "YYYY-MM-DDTHH:MM:SSZ" string format that
# `published_utc` uses, so that the two can be compared as plain strings
pd.to_datetime('2024-07-02').strftime("%Y-%m-%dT%H:%M:%SZ")

'2024-07-02T00:00:00Z'

In [None]:
# Count the news pieces published before 2024-07-02 whose "insights" are not null.
# `published_utc` holds strings in the same format as the cutoff, so the
# comparison is a plain string comparison.
cutoff_utc = pd.to_datetime('2024-07-02').strftime("%Y-%m-%dT%H:%M:%SZ")
published_before_cutoff = news_df['published_utc'] < cutoff_utc
news_df.loc[published_before_cutoff, 'insights'].notna().sum()

np.int64(0)

Therefore, the news data retrieved by the API is effectively limited to 2024-07-02 onwards

In [None]:
# Keep only the news pieces published on or after 2024-07-02 and renumber the
# rows from 0, then preview the result
sentiment_cutoff_utc = pd.to_datetime('2024-07-02').strftime("%Y-%m-%dT%H:%M:%SZ")
published_from_cutoff = news_df['published_utc'] >= sentiment_cutoff_utc
news_df = news_df.loc[published_from_cutoff].reset_index(drop=True)
news_df.head()

Unnamed: 0,amp_url,article_url,author,description,id,image_url,insights,keywords,published_utc,publisher,tickers,title
0,,https://www.fool.com/investing/2025/03/27/3-re...,Prosper Junior Bakiny,Meta Platforms (META) looks attractive to buy ...,ea9f02fdd9e9cd36a97e6360d7bd93b376c930b12db8ba...,https://g.foolcdn.com/editorial/images/812053/...,"[{'sentiment': 'positive', 'sentiment_reasonin...","[Meta Platforms, AI, social media, valuation]",2025-03-27T10:30:00Z,{'favicon_url': 'https://s3.polygon.io/public/...,"[META, AMZN]",3 Reasons to Buy Meta Platforms Stock Hand Ove...
1,,https://www.globenewswire.com/news-release/202...,,Meta is emerging as a significant player in AI...,d45060b66f112ef1fc0c3cec93c58ddc8f85560d1092b5...,https://www.globenewswire.com/news-release/202...,"[{'sentiment': 'positive', 'sentiment_reasonin...","[Machine Learning, AI, Generative AI, FAANG, I...",2025-03-26T21:47:00Z,{'favicon_url': 'https://s3.polygon.io/public/...,[META],FAANG Machine Learning Engineer Course 2025 - ...
2,,https://www.fool.com/investing/2025/03/26/2-ma...,Neil Rozenbaum,The article discusses Meta Platforms and Alpha...,1c17009459b9172f431e7defbe382f4697756576bd7a90...,https://g.foolcdn.com/editorial/images/812555/...,"[{'sentiment': 'neutral', 'sentiment_reasoning...","[Meta Platforms, Alphabet, financial services,...",2025-03-26T15:28:18Z,{'favicon_url': 'https://s3.polygon.io/public/...,"[META, GOOG, GOOGL]",2 Magnificent Growth Stocks I'm Still Buying i...
3,,https://www.fool.com/investing/2025/03/26/my-t...,Manali Pradhan,The article discusses how the stock market dow...,75b24043f7acf4a5cd440e8cd1414bcfd3f0b5c4fddeb4...,https://g.foolcdn.com/editorial/images/812002/...,"[{'sentiment': 'positive', 'sentiment_reasonin...","[stock market, AI, Nvidia, Meta Platforms, Ora...",2025-03-26T14:15:00Z,{'favicon_url': 'https://s3.polygon.io/public/...,"[NVDA, META, ORCL]",My Top 3 Bargain AI Stocks to Buy after the St...
4,,https://www.fool.com/investing/2025/03/24/why-...,Johnny Rice,Meta Platforms is in talks with Reliance Indus...,cc1a9bf45d9ebe2bf4524bd287dea42c0cdf9d68b8e1fb...,https://g.foolcdn.com/editorial/images/808130/...,"[{'sentiment': 'positive', 'sentiment_reasonin...","[Meta Platforms, Reliance Industries, AI, India]",2025-03-24T21:40:16Z,{'favicon_url': 'https://s3.polygon.io/public/...,[META],Why Meta Platforms Stock Surged Today


In [None]:
# Summarise the filtered dataset: number of rows, non-null count and dtype per column
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5278 entries, 0 to 5277
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   amp_url        468 non-null    object
 1   article_url    5278 non-null   object
 2   author         5278 non-null   object
 3   description    5278 non-null   object
 4   id             5278 non-null   object
 5   image_url      5278 non-null   object
 6   insights       5244 non-null   object
 7   keywords       5265 non-null   object
 8   published_utc  5278 non-null   object
 9   publisher      5278 non-null   object
 10  tickers        5278 non-null   object
 11  title          5278 non-null   object
dtypes: object(12)
memory usage: 494.9+ KB


Note that there are still some entries with null `insights` in the new dataset, but this constitutes a minority. I'll remove the null rows for the sentiment classification task, but keep the null rows for the stock prediction task

In [None]:
# Save the raw retrieved dataset without the row index; the path is relative
# to the working directory selected in the setup cell
news_df.to_csv("data/raw/polygon_news_2024_raw.csv", index=False)

## 1.2. Through Polygon benchmark dataset for 2023

Other than the Polygon API, the team at Polygon also provided a benchmark news dataset for the year 2023 in one of their papers, available on Kaggle at: [__Financial News with Ticker-Level Sentiment__](https://www.kaggle.com/datasets/rdolphin/financial-news-with-ticker-level-sentiment)

This dataset can be used to train the sentiment classification model, but of course it cannot be used for the stock prediction task due to the huge time gap between Dec 2023 and Jul 2024

In [None]:
# Authenticate with Kaggle through the interactive login prompt
# (Kaggle API credentials must be obtained beforehand)
import kagglehub as kg
kg.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

In [None]:
# Download the dataset file from Kaggle and load it into a DataFrame
from kagglehub import KaggleDatasetAdapter

news_df_2023 = kg.dataset_load(
    KaggleDatasetAdapter.PANDAS,  # adapter selecting pandas as the output format
    "rdolphin/financial-news-with-ticker-level-sentiment",  # Kaggle dataset handle
    "polygon_news_sample.json",  # file of the dataset to load
)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rdolphin/financial-news-with-ticker-level-sentiment?dataset_version_number=1&file_name=polygon_news_sample.json...


100%|██████████| 2.16M/2.16M [00:00<00:00, 117MB/s]

Extracting zip of polygon_news_sample.json...





In [None]:
# Save the loaded 2023 benchmark dataset without the row index
news_df_2023.to_csv("data/raw/polygon_news_2023_raw.csv", index=False)

# 2. News dataset collection w/ Alpha Vantage API

In [None]:
# Read the Alpha Vantage API key from the Colab secrets (`userdata`) so that
# it is never hard-coded in the notebook
alvan_api_key = userdata.get('ALV_API_KEY')

In [None]:
# Base query URL; it already ends with "?" so that the URL-encoded parameters
# can be appended to it directly
url = "https://www.alphavantage.co/query?"

In [None]:
# Construct the params shared by every API call (the ticker is added per call)
alv_params = {
    'function' : "NEWS_SENTIMENT",
    'time_from' : "20220101T0400",
    # 8PM UTC == 4PM US Eastern on this date (daylight time, UTC-4, is in effect
    # in late March) i.e. closing hour, affects closing price
    'time_to' : "20250328T2000",
    'limit' : 1000,
    'apikey' : alvan_api_key,
}

In [None]:
# FAANG tickers queried against Alpha Vantage, in request order
tickers = [
    "META",
    "AMZN",
    "AAPL",
    "NFLX",
    "GOOG",
]

In [None]:
import requests, urllib.parse

In [None]:
# Call the query for each ticker to retrieve the news.
# `alv_news_data` ends up with one decoded JSON payload per ticker, in the
# order of `tickers`; every stored payload is guaranteed to contain "feed".
alv_news_data = []

for ticker in tickers:
    # Sample url: https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers=AAPL&apikey=demo
    query_string = urllib.parse.urlencode({**alv_params, 'tickers': ticker})
    alv_news_url = f"{url}{query_string}"

    # A timeout prevents a stalled connection from hanging the cell forever
    response = requests.get(alv_news_url, timeout=30)

    # Fail loudly on HTTP errors. The URL is deliberately left out of the
    # message because it contains the API key.
    if not response.ok:
        raise RuntimeError(
            f"Alpha Vantage request for {ticker} failed with HTTP status {response.status_code}"
        )

    payload = response.json()

    # A payload without "feed" cannot be used by the cells that follow, so
    # stop here and show what the API returned instead
    if "feed" not in payload:
        raise RuntimeError(f"Alpha Vantage returned no news feed for {ticker}: {payload}")

    alv_news_data.append(payload)

In [None]:
# Construct a single DataFrame from the news feeds retrieved for all tickers.
# Each element of `alv_news_data` is the JSON payload of one ticker and its
# "feed" entry holds that ticker's news items; the feeds are stacked in the
# order of `alv_news_data` and the rows are renumbered from 0.
alv_news_df = pd.concat(
    [pd.DataFrame(ticker_payload["feed"]) for ticker_payload in alv_news_data],
    ignore_index=True,
)

# The cells that follow display and save `news_df`, so bind that name to the
# Alpha Vantage data (the Polygon news was already saved to disk earlier)
news_df = alv_news_df

In [None]:
# Preview the first rows of the news data held in `news_df`
news_df.head()

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,overall_sentiment_score,overall_sentiment_label,ticker_sentiment
0,Arialief Reviews & Complaints ( ALERT ) : Is ...,https://www.benzinga.com/pressreleases/25/03/g...,20250328T193338,[Globe Newswire],"SEATTLE, March 28, 2025 ( GLOBE NEWSWIRE ) -- ...",https://ml.globenewswire.com/Resource/Download...,Benzinga,General,www.benzinga.com,"[{'topic': 'Technology', 'relevance_score': '1...",0.009994,Neutral,"[{'ticker': 'EBAY', 'relevance_score': '0.0073..."
1,"Why Digital Ad Giants Alphabet, Meta Platforms...",https://www.fool.com/investing/2025/03/28/digi...,20250328T192800,[Billy Duberstein],Shares of digital advertising giants Alphabet ...,https://media.ycharts.com/charts/f6ded1e7544cb...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.4...",-0.121428,Neutral,"[{'ticker': 'NFLX', 'relevance_score': '0.2697..."
2,EXCLUSIVE: Which Magnificent 7 Stock Will Perf...,https://www.benzinga.com/tech/25/03/44544129/e...,20250328T190544,[Chris Katje],Benzinga readers pick their favorite Magnifice...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Retail & Wholesale', 'relevance_sc...",0.266216,Somewhat-Bullish,"[{'ticker': 'MSFT', 'relevance_score': '0.1984..."
3,IT'S BOBA TIME DROPS EXCLUSIVE PUBG MOBILE DRI...,https://www.benzinga.com/pressreleases/25/03/n...,20250328T180300,[PRNewswire],"LOS ANGELES, March 28, 2025 /PRNewswire/ -- Re...",https://mma.prnewswire.com/media/2513158/Its_B...,Benzinga,General,www.benzinga.com,"[{'topic': 'Technology', 'relevance_score': '1...",0.420613,Bullish,"[{'ticker': 'GOOG', 'relevance_score': '0.0832..."
4,Treace Announces Clinical Study Data Demonstra...,https://www.globenewswire.com/news-release/202...,20250328T180000,"[Inc., Treace Medical Concepts]","PONTE VEDRA, Fla., March 28, 2025 ( GLOBE NEWS...",https://ml.globenewswire.com/Resource/Download...,GlobeNewswire,,www.globenewswire.com,"[{'topic': 'Life Sciences', 'relevance_score':...",0.165204,Somewhat-Bullish,"[{'ticker': 'TMCI', 'relevance_score': '0.0792..."


In [None]:
# Save the raw retrieved data (whatever `news_df` currently holds) without the row index
news_df.to_csv("data/raw/alv_news_raw.csv", index=False)

# 3. Historical prices collection w/ Yahoo Finance

In [None]:
# Install (or upgrade to the latest release of) yfinance, imported below as `yf`
!pip install -U yfinance



In [4]:
import yfinance as yf

# Retrieve historical prices on the 1st ticker
ticker_obj = yf.Ticker("META")

# Get its daily price history
historical_prices = ticker_obj.history(
    interval="1d",
    start="2024-12-06", # Previous trading day; impacts returns for 2024-12-09
    end="2025-03-29", # NOTE(review): presumably exclusive, i.e. data up to 2025-03-28 - confirm
    auto_adjust=True,
    actions=False,
)

historical_prices.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-12-06 00:00:00-05:00,609.066372,628.723017,607.499085,622.713257,16935500
2024-12-09 00:00:00-05:00,622.862959,625.338798,605.14303,612.530518,11426000
2024-12-10 00:00:00-05:00,616.513785,624.53021,611.701943,618.270813,10938900
2024-12-11 00:00:00-05:00,622.313869,637.318434,620.47702,631.608093,10837200
2024-12-12 00:00:00-05:00,630.430133,635.581364,626.147422,629.721313,7474700


In [5]:
# Turn the Date index into a regular column and label every row with its
# ticker, so that the other tickers can be stacked underneath later
historical_prices = historical_prices.reset_index().assign(Ticker="META")
historical_prices.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker
0,2024-12-06 00:00:00-05:00,609.066372,628.723017,607.499085,622.713257,16935500,META
1,2024-12-09 00:00:00-05:00,622.862959,625.338798,605.14303,612.530518,11426000,META
2,2024-12-10 00:00:00-05:00,616.513785,624.53021,611.701943,618.270813,10938900,META
3,2024-12-11 00:00:00-05:00,622.313869,637.318434,620.47702,631.608093,10837200,META
4,2024-12-12 00:00:00-05:00,630.430133,635.581364,626.147422,629.721313,7474700,META


In [6]:
# Merge the rest of the tickers data with historical_prices.
# Start from the rows of the first ticker only, so that re-running this cell
# does not append the other tickers a second time.
price_frames = [historical_prices[historical_prices["Ticker"] == ticker_list[0]]]

for ticker in ticker_list[1:]:
    ticker_obj = yf.Ticker(ticker)

    ticker_history = ticker_obj.history(
        interval="1d",
        start="2024-12-06", # Impacts returns for 2024-12-09
        end="2025-03-29",
        auto_adjust=True,
        actions=False,
    )

    # Turn the Date index into a column and label the rows with their ticker
    ticker_history = ticker_history.reset_index()
    ticker_history["Ticker"] = ticker

    price_frames.append(ticker_history)

# Concatenate once (instead of once per ticker) and renumber the rows, so that
# the index labels are unique across tickers
historical_prices = pd.concat(price_frames, ignore_index=True)

historical_prices.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker
0,2024-12-06 00:00:00-05:00,609.066372,628.723017,607.499085,622.713257,16935500,META
1,2024-12-09 00:00:00-05:00,622.862959,625.338798,605.14303,612.530518,11426000,META
2,2024-12-10 00:00:00-05:00,616.513785,624.53021,611.701943,618.270813,10938900,META
3,2024-12-11 00:00:00-05:00,622.313869,637.318434,620.47702,631.608093,10837200,META
4,2024-12-12 00:00:00-05:00,630.430133,635.581364,626.147422,629.721313,7474700,META


In [7]:
# Sanity check: number of price rows collected per ticker
historical_prices["Ticker"].value_counts()

Unnamed: 0_level_0,count
Ticker,Unnamed: 1_level_1
META,76
AAPL,76
AMZN,76
NFLX,76
GOOG,76


In [8]:
# Save the price dataset.
# NOTE(review): unlike the other `to_csv` calls in this notebook, `index=False`
# is not passed here, so the row index is written as the first CSV column -
# confirm that whatever reads this file expects that.
historical_prices.to_csv("data/raw/raw_prices.csv")