In [1]:
from dotenv import load_dotenv
import os
import sys

# Load variables from a local .env file into the process environment, then
# look up the generic API key (None when the variable is not set).
load_dotenv()
api_key = os.environ.get('MY_API_KEY')

In [None]:
import kagglehub

# Download the latest version of the dataset through kagglehub and keep the
# directory it reports back; later cells build their input paths from it.
path = kagglehub.dataset_download("yashvi/reliancestockandnewsdata")

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/dyutidasmahaptra/s-and-p-500-with-financial-news-headlines-20082024?dataset_version_number=1...


100%|██████████| 498k/498k [00:00<00:00, 1.16MB/s]

Extracting files...
Path to dataset files: C:\Users\namo\.cache\kagglehub\datasets\dyutidasmahaptra\s-and-p-500-with-financial-news-headlines-20082024\versions\1





In [34]:
# import libraries
import yfinance as yf
import pandas as pd
import json
import datetime
from datetime import date,timedelta
import warnings
import http.client, urllib.parse
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModelForSequenceClassification,pipeline
# Allow pandas to render up to 500 columns and 500 rows before truncating.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

# ticker symbol of the tracked company (passed to yfinance further down)
company_symbol = "RELIANCE.NS"

# current and previous calendar day as YYYY-MM-DD strings
today = date.today().isoformat()
yesterday = (date.today() - timedelta(days=1)).isoformat()

# flag: becomes True once news dated `yesterday` is found in the stored news file
news_inserted = False

# mediastack access token, read from the environment (None when unset)
mediastack_api_token = os.environ.get('MEDIASTACK_API_TOKEN')

# input files, all located inside the downloaded kagglehub dataset directory
stock_history_file_path, news_file_path, news_sentiment_file_path = (
    os.path.join(path, file_name)
    for file_name in (
        'reliance_stock_history.csv',
        'reliance_news.json',
        'reliance_news_sentiment.csv',
    )
)

# output files, written relative to the current working directory
output_stock_history_file_path = './reliance_stock_history.csv'
output_news_file_path = './reliance_news.json'
output_news_sentiment_file_path = './reliance_news_sentiment.csv'

# parameters for mediastack api
search_query = 'reliance'

# NOTE(review): the access key is sent over plain HTTP here; switch to
# http.client.HTTPSConnection if the mediastack plan in use allows HTTPS.
conn = http.client.HTTPConnection('api.mediastack.com')

# Query string for the /v1/news endpoint: up to 10 English-language articles
# matching `search_query`, restricted to India, dated `yesterday`, newest first.
# The country filter is spelled `countries` (plural, like `languages`); the
# previous `country` key is not a documented parameter, so the restriction to
# India was most likely not being applied.
params = urllib.parse.urlencode({
    'keywords': search_query,
    'access_key': mediastack_api_token,
    'sort': 'published_desc',
    'limit': 10,
    'languages': 'en',
    'countries': 'in',
    'date': yesterday
    })

In [None]:
def create_stock_history_dataset():
    """Build the initial stock-history frame from the module-level ``ticker_object``.

    Requests the ticker's history with ``period="1d"`` and moves the returned
    index into a regular column via ``reset_index()``.

    Returns:
        The frame produced by ``ticker_object.history(...).reset_index()``.
    """
    return ticker_object.history(period="1d").reset_index()

def daily_update_stock_history_dataset():
    """Refresh the stored stock history with the most recent daily bar.

    Reads the CSV at ``stock_history_file_path``, fetches ``period="1d"`` from
    the module-level ``ticker_object`` and then either overwrites the last
    stored row (when it carries the same date) or appends a new row.

    The fetched values are written positionally, so the stored CSV is expected
    to have the same columns, in the same order, as the fetched frame.

    Returns:
        pandas.DataFrame: the history with ``Date`` formatted as
        ``YYYY/MM/DD`` strings. When the fetch returns no rows, the stored
        history is returned unchanged.
    """
    date_format = '%Y/%m/%d'

    reliance_stock_history = pd.read_csv(stock_history_file_path)
    reliance_stock_history['Date'] = pd.to_datetime(reliance_stock_history['Date']).dt.strftime(date_format)

    today_reliance_stock_data = ticker_object.history(period="1d").reset_index()
    if today_reliance_stock_data.empty:
        # Nothing came back from the data source; keep what is already stored
        # instead of failing on a missing first row.
        return reliance_stock_history
    today_reliance_stock_data['Date'] = pd.to_datetime(today_reliance_stock_data['Date']).dt.strftime(date_format)

    # Use the same (last) fetched row both for the date comparison and for
    # the values written back.
    latest_row = today_reliance_stock_data.iloc[-1]
    latest_values = latest_row.tolist()

    already_inserted = (
        not reliance_stock_history.empty
        and latest_row['Date'] == reliance_stock_history['Date'].iloc[-1]
    )
    if already_inserted:
        # Same trading day is already stored: replace it with the fresh values.
        reliance_stock_history.iloc[-1:, :] = latest_values
    else:
        reliance_stock_history.loc[len(reliance_stock_history)] = latest_values
    return reliance_stock_history

def update_stock_history_dataset():
    """Extend the stored stock history with the trading days it is missing.

    Reads the CSV at ``stock_history_file_path``, asks yfinance for
    ``company_symbol`` rows from the day after the last stored date up to
    ``date.today()`` and merges them into the stored history.

    Prints ``"No new data to update."`` and returns the stored history when
    there is nothing to add.

    Returns:
        pandas.DataFrame: stored rows plus downloaded rows, de-duplicated on
        ``Date`` (the later occurrence wins) and sorted by ``Date``.
    """
    reliance_stock_history = pd.read_csv(stock_history_file_path)
    reliance_stock_history['Date'] = pd.to_datetime(reliance_stock_history['Date'])

    last_recorded_date = reliance_stock_history['Date'].max()

    # Define date range to fetch
    start_date = last_recorded_date + timedelta(days=1)
    end_date = date.today()

    # NOTE(review): yfinance documents `end` as exclusive, so today's bar is
    # never requested - confirm that is intended.
    # With an exclusive end, start >= end is an empty (or inverted) window;
    # skip the network call, which would only yield an empty frame.
    if pd.notna(start_date) and start_date.date() >= end_date:
        print("No new data to update.")
        return reliance_stock_history

    # Download missing data from yfinance
    new_data = yf.download(company_symbol, start=start_date, end=end_date)

    if new_data.empty:
        print("No new data to update.")
        return reliance_stock_history

    # Flatten column names if they are MultiIndex (keep the first level only)
    if isinstance(new_data.columns, pd.MultiIndex):
        new_data.columns = [col[0] for col in new_data.columns]

    # Build a fresh frame with the stored layout rather than assigning onto a
    # column-sliced view. Dividends / Stock Splits are not taken from the
    # download and are filled with 0.0.
    new_data = (
        new_data.reset_index()
        .loc[:, ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
        .assign(**{'Dividends': 0.0, 'Stock Splits': 0.0})
    )
    new_data['Date'] = pd.to_datetime(new_data['Date'])

    # Combine and deduplicate
    updated_history = (
        pd.concat([reliance_stock_history, new_data], ignore_index=True)
        .drop_duplicates(subset='Date', keep='last')
        .sort_values('Date')
    )

    return updated_history


# Build the stock-history frame: extend the stored CSV when it exists,
# otherwise start a new dataset from the ticker.
ticker_object = yf.Ticker(company_symbol)
if os.path.exists(stock_history_file_path):
    reliance_stock_history = update_stock_history_dataset()
else:
    reliance_stock_history = create_stock_history_dataset()


# Persist the result (without the index) and display it as the cell output.
reliance_stock_history.to_csv(output_stock_history_file_path, index=False)
reliance_stock_history

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2021-10-21,2727.399902,2728.000000,2603.199951,2622.500000,9612230,0.0,0.0
1,2021-10-22,2620.000000,2664.899902,2611.500000,2627.399902,5086641,0.0,0.0
2,2021-10-25,2680.000000,2680.000000,2570.000000,2601.800049,7934786,0.0,0.0
3,2021-10-26,2617.100098,2668.899902,2603.149902,2661.050049,4498720,0.0,0.0
4,2021-10-27,2652.000000,2676.800049,2619.949951,2627.399902,4565815,0.0,0.0
...,...,...,...,...,...,...,...,...
881,2025-06-30,1513.800049,1524.800049,1496.000000,1500.599976,8409527,0.0,0.0
882,2025-07-01,1500.599976,1531.400024,1500.099976,1528.400024,10368523,0.0,0.0
883,2025-07-02,1528.400024,1530.000000,1508.699951,1518.800049,6361002,0.0,0.0
884,2025-07-03,1520.800049,1531.900024,1513.000000,1517.800049,11283291,0.0,0.0


# Create news dataset

In [39]:
def _fetch_mediastack_news():
    """Send the configured mediastack query and return its list of articles.

    Uses the module-level ``conn`` (connection) and ``params`` (query string).

    Returns:
        The value stored under ``"data"`` in the decoded JSON response.

    Raises:
        RuntimeError: if the decoded response has no ``"data"`` entry, which
            is what happens when the API answers with an error object.
    """
    conn.request('GET', '/v1/news?{}'.format(params))
    response = conn.getresponse()
    payload = json.loads(response.read().decode('utf-8'))
    if not isinstance(payload, dict) or 'data' not in payload:
        details = payload.get('error', payload) if isinstance(payload, dict) else payload
        raise RuntimeError(
            'mediastack request failed (HTTP {}): {}'.format(response.status, details)
        )
    return payload['data']


def create_news_dataset():
    """Create the news dataset from scratch.

    Returns:
        list: the articles returned by the mediastack API for the configured query.

    Raises:
        RuntimeError: if the API response does not contain article data.
    """
    return _fetch_mediastack_news()


def update_news_dataset():
    """Extend the stored news with yesterday's articles if they are missing.

    Reads the JSON file at ``news_file_path`` (expected to hold an
    ``"articles"`` list). If any stored article has a ``published_at`` date
    equal to ``yesterday``, the module-level ``news_inserted`` flag is set to
    True and nothing is fetched. The flag is never reset here, so a value of
    True left over from an earlier call also suppresses the fetch.

    Returns:
        tuple: ``(all_articles, new_articles)`` where ``new_articles`` is the
        freshly fetched list, or None when no request was made.

    Raises:
        RuntimeError: if a fetch is attempted and the API response does not
            contain article data.
    """
    global news_inserted
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(news_file_path, 'r', encoding='utf-8') as file:
        reliance_news = json.load(file)

    articles = reliance_news['articles']
    if any(news['published_at'].split('T')[0] == yesterday for news in articles):
        news_inserted = True

    current_reliance_news = None
    if not news_inserted:
        current_reliance_news = _fetch_mediastack_news()
        articles += current_reliance_news
    return articles, current_reliance_news

In [40]:
# Build the news list: extend the stored news file when there is one,
# otherwise start from a fresh API fetch.
if os.path.exists(news_file_path):
    reliance_news, current_reliance_news = update_news_dataset()
else:
    reliance_news = create_news_dataset()
    current_reliance_news = reliance_news.copy()

# Persist the full article list under the "articles" key.
with open(output_news_file_path, 'w') as file:
    json.dump({"articles": reliance_news}, file)

In [42]:
# Report how many articles the combined news list holds.
print(len(reliance_news))

661
