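This notebook enriches the news articles collected for a given `stock` with two signals: the price movement (the stock's percentage price change on the day the article was published) and search hits (the number of search results returned for the article's title, used as a proxy for how 'popular' the article was).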

In [40]:
import logging
import requests 
from datetime import timedelta, datetime
import time
import pandas as pd
import yfinance as yf 
from bs4 import BeautifulSoup
import requests
import re 
import os
from config import * 

stock = 'TSLA'

# Each ticker's headlines are read from its own CSV under STOCK_PATH
# (STOCK_PATH is presumably supplied by the star-import of `config` -- confirm).
file = f'{STOCK_PATH}/{stock}.csv'
news = pd.read_csv(filepath_or_buffer=file, parse_dates=['Date'])

# Turn off pandas' display truncation so every row and column is rendered.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Show the parsed publish timestamps as the cell output.
news['Date']



0    2024-11-25 15:53:00
1    2024-11-25 10:48:00
2    2024-11-25 10:20:00
3    2024-11-26 05:36:00
4    2024-11-27 02:38:00
5    2024-11-29 02:57:00
6    2024-11-29 21:48:00
7    2024-11-30 10:12:00
8    2024-12-09 14:00:00
9    2024-12-09 21:41:00
10   2024-12-10 02:12:00
11   2024-12-10 11:50:00
12   2024-12-10 19:49:00
13   2024-12-10 22:30:00
14   2024-12-11 09:40:00
15   2024-12-11 12:19:00
16   2024-12-11 16:12:00
17   2024-12-11 17:11:00
18   2024-12-12 09:36:00
19   2024-12-12 21:50:00
20   2024-12-13 02:59:00
21   2024-12-13 06:09:00
22   2024-12-13 07:27:00
23   2024-12-13 12:02:00
24   2024-12-13 14:11:00
25   2024-12-13 15:43:00
26   2024-12-13 17:06:00
27   2024-12-16 06:07:00
28   2024-12-16 18:25:00
29   2024-12-17 11:36:00
30   2024-12-17 21:13:00
31   2024-12-18 02:15:00
32   2024-12-19 07:29:00
33   2024-12-20 05:42:00
34   2024-12-20 06:02:00
35   2024-12-20 17:24:00
36   2024-12-21 08:00:00
37   2024-12-23 08:41:00
38   2024-12-23 14:08:00
39   2024-12-27 06:05:00


In [65]:


# daily close-to-close % change per trading day, computed from yfinance price history
def get_intraday_return(stock, oldest_date): 
    '''Download daily prices for `stock` and rank the days by % price change.

    Despite the function name, the change is measured close-to-close (a day's
    close versus the previous row's close), not open-to-close.

    Args:
        stock: ticker symbol handed to yf.download, e.g. 'TSLA'.
        oldest_date: datetime-like publish date of the oldest news article;
            the download window starts one week before it.

    Returns:
        The downloaded DataFrame with two added columns, 'Prev Close' and
        '% Change' (percent, rounded to 2 decimals), sorted by '% Change' in
        descending order. The earliest row has no previous close, so its
        '% Change' is NaN and sorts last.
    '''
    # Only `start` is passed: yfinance documents `period` and `start`/`end` as
    # alternatives. The extra week gives the oldest article's day earlier rows
    # to take a previous close from.
    data = yf.download(stock, start = oldest_date - timedelta(weeks = 1))

    # Depending on the yfinance version, a single-ticker download has either flat
    # columns or a (field, ticker) MultiIndex; only the latter needs flattening.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.droplevel(1)
    data.columns.name = None 

    data['Prev Close'] = data['Close'].shift(1)     # previous row's close, aligned to the current row
    pct_change = (data['Close'] - data['Prev Close']) / data['Prev Close'] * 100
    data['% Change'] = pct_change.round(2)

    # Return a sorted copy instead of sorting in place.
    return data.sort_values('% Change', ascending = False)

# Price history for `stock`, requested relative to the oldest article date (news['Date'].min()).
swings = get_intraday_return(stock, news['Date'].min())               # per-day % price changes, largest gain first

def lookup_date(timestamp):
    '''Return the '% Change' recorded in `swings` for the calendar day of `timestamp`.

    The time of day is discarded; only the date part is used as the lookup
    key. Returns None when `swings` has no row for that day.
    '''
    try:
        day_key = str(timestamp.date())
        pct_change = swings.loc[day_key, '% Change']
    except KeyError:
        # No price row for this day.
        pct_change = None
    return pct_change

# Attach each article's same-day % price change. This assignment must run:
# later cells read news['% Change'], so leaving it commented out only works
# when the column survives from an earlier kernel session.
# Articles published on a day with no price row get a missing value.
news['% Change'] = news['Date'].apply(lookup_date)

# Sanity check: the calendar dates for which price data was downloaded.
swings.index.date 


[*********************100%***********************]  1 of 1 completed


array([datetime.date(2025, 1, 3), datetime.date(2025, 1, 15),
       datetime.date(2024, 12, 24), datetime.date(2024, 12, 16),
       datetime.date(2024, 12, 11), datetime.date(2024, 12, 6),
       datetime.date(2024, 12, 13), datetime.date(2024, 11, 22),
       datetime.date(2024, 11, 29), datetime.date(2024, 12, 17),
       datetime.date(2024, 12, 2), datetime.date(2024, 12, 5),
       datetime.date(2025, 1, 17), datetime.date(2024, 12, 10),
       datetime.date(2024, 12, 23), datetime.date(2025, 1, 13),
       datetime.date(2024, 11, 19), datetime.date(2024, 12, 4),
       datetime.date(2024, 12, 9), datetime.date(2025, 1, 8),
       datetime.date(2025, 1, 6), datetime.date(2025, 1, 10),
       datetime.date(2024, 11, 26), datetime.date(2025, 1, 21),
       datetime.date(2025, 1, 23), datetime.date(2024, 11, 21),
       datetime.date(2024, 12, 19), datetime.date(2024, 11, 20),
       datetime.date(2025, 1, 24), datetime.date(2024, 12, 12),
       datetime.date(2024, 11, 27), datetim

In [38]:
def get_google_results(query, timeout = 10):      # using google Custom Search JSON API
    '''Return the total result count Google reports for `query`.

    Args:
        query: search text; it is wrapped in double quotes before being sent
            (Google's exact-phrase syntax).
        timeout: seconds to wait for the HTTP request before requests gives up.

    Returns:
        The 'totalResults' value of the first request entry as an int, or 0
        when the response carries no 'totalResults' field.

    Raises:
        Re-raises whatever the request, JSON decoding, or response parsing
        raised, after printing the decoded response (None if none was obtained).
    '''
    params = {
        'q': f'"{query}"',
        'key': GOOGLE_API_KEY, 
        'cx': SEARCH_ENGINE_ID,         
    }

    # Bound before the try so the except handler can always print it, even when
    # the request itself or the JSON decoding is what failed.
    response = None
    try: 
        response = requests.get(
            'https://www.googleapis.com/customsearch/v1',
            params = params,
            timeout = timeout,
        ).json() 
        request_info = response['queries']['request'][0]
        if 'totalResults' not in request_info:
            return 0 
        num_results = int(request_info['totalResults'])
    except Exception:
        print(response)     # show the API payload (e.g. an error body) to help debugging
        raise               # bare raise keeps the original exception and traceback

    print(query, num_results)
    return num_results

    
# Mines the top N days with the highest price change for news catalysts
n = 4           # set this to a low number because of google api limits :)

# The n largest distinct % changes by absolute size. Missing values are dropped
# first so that NaN can never enter the set and match articles without price data.
largest_price_changes = (
    news['% Change']
    .dropna()
    .sort_values(key=abs, ascending=False)
    .drop_duplicates()
    .head(n)
)

# Articles published on one of those days (computed once, reused below).
is_top_mover = news['% Change'].isin(largest_price_changes)

# Query each distinct title only once: the same headline can appear in several
# rows and every API call counts against the quota.
top_titles = news.loc[is_top_mover, 'Title']
hits_by_title = {title: get_google_results(title) for title in top_titles.unique()}
news.loc[is_top_mover, 'hits'] = top_titles.map(hits_by_title)




Exclusive-Trump transition team plans sweeping rollback of Biden EV, emissions policies 2220
China resident who stole Tesla trade secrets gets 2-year US sentence 1840
Tesla's China factory head Song Gang leaves company, letter to staff shows 617
As Musk gains influence, questions hover over US probes into his empire 34100
Tesla annual deliveries fall for first time as competition hurts demand 9690
Teslas Annual EV Sales Drop for First Time in Over a Decade 40
Musk donated $108 million in Tesla shares to unnamed charities, filing shows 4510
Tesla's China sales rise to record high in 2024, bucking global decline 9400
Teslas Annual EV Sales Drop for First Time in Over a Decade 40
2024 US new-car sales rise to five-year high, but Tesla and Stellantis lag 328


In [39]:
# Article(s) whose title returned the highest search-hit count.
news.loc[news['hits'].eq(news['hits'].max())]

Unnamed: 0,Date,Title,Link,Source,% Change,hits
46,2025-01-02 06:03:00,"As Musk gains influence, questions hover over ...",https://finance.yahoo.com/news/musk-gains-infl...,Reuters,-6.08,34100.0


In [40]:
# Persist the enriched news table as this ticker's CSV under OUTPUT_STOCK_PATH (row index omitted).
news.to_csv(path_or_buf=f'{OUTPUT_STOCK_PATH}/{stock}.csv', index=False)