In [None]:
!pip install newsapi-python holidays

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7


In [None]:
import os
from datetime import datetime, timedelta

import holidays
import numpy as np
import pandas as pd
from newsapi import NewsApiClient

In [None]:
def fetch_news(keyword, start_date, end_date, client=None):
    """
    Fetch news whose title matches the given keyword within a date range.

    Parameters
    ----------
    keyword : str
        Term that must appear in the article title (sent as ``qintitle``).
    start_date, end_date : str
        Bounds of the search window, forwarded as ``from_param`` and ``to``.
    client : optional
        Object exposing ``get_everything(**kwargs)``. When omitted, the
        notebook-level ``newsapi`` object is used.
        NOTE(review): no visible cell creates ``newsapi``; it must exist
        before this function is called without ``client``.

    Returns
    -------
    pandas.DataFrame
        One row per returned article with the columns ``Date``, ``Title``,
        ``Description`` and ``URL``. The columns are present even when no
        article is returned, so a CSV written from an empty result still has
        a header row.

    Only a single request with ``page_size=100`` is made, so at most that
    one page of results is used.
    """
    columns = ['Date', 'Title', 'Description', 'URL']

    # Fall back to the notebook-level client when none is injected.
    api = newsapi if client is None else client

    response = api.get_everything(
        qintitle=keyword,
        from_param=start_date,
        to=end_date,
        language='en',
        sort_by='relevancy',
        page_size=100,
    )

    # Parse the articles
    records = [
        {
            'Date': article['publishedAt'][:10],  # keep the first 10 characters (the date part)
            'Title': article['title'],
            'Description': article['description'],
            'URL': article['url'],
        }
        for article in response['articles']
    ]

    # Passing the column list explicitly keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=columns)

In [None]:
# (ticker symbol, title search keyword) pairs, one entry per tracked stock.
ticker_keyword = [
    ("SBIN.NS", "sbi"),
    ("HDFCBANK.NS", "hdfc"),
    ("RELIANCE.NS", "reliance"),
    ("ADANIENT.NS", "adani"),
    ("ZOMATO.NS", "zomato"),
    ("DMART.NS", "dmart"),
    ("IRCTC.NS", "irctc"),
    ("ITC.NS", "itc"),
    ("TECHM.NS", "techm"),
    ("NTPCGREEN.NS", "ntpc green"),
    ("TCS.NS", "tcs"),
    ("LICI.NS", "lic"),
    ("VEDL.NS", "vedanta"),
    ("HYUNDAI.NS", "hyundai"),
    ("ONGC.NS", "ongc"),
    ("TITAN.NS", "titan"),
    ("ASIANPAINT.NS", "asian paints"),
    ("AUROPHARMA.NS", "aurobindo pharma"),
    ("LUPIN.NS", "lupin"),
    ("PAYTM.NS", "paytm"),
    ("INFY.NS", "infosys"),
    ("TATAPOWER.NS", "tata power"),
    ("STARHEALTH.NS", "star health"),
    ("COCHINSHIP.NS", "cochin shipyard"),
]

In [None]:
# Folder that receives one CSV per ticker; it is created when missing and left
# as-is when it already exists.
stock_news_dir = "stock_news_data/news_api"
os.makedirs(name=stock_news_dir, exist_ok=True)

In [None]:
# The search window is identical for every ticker, so it is set once here
# instead of being re-assigned on each loop iteration.
start_date = "2024-11-29"
end_date = "2024-12-29"

# enumerate() supplies the 1-based progress counter shown in the log line.
for count, (ticker, keyword) in enumerate(ticker_keyword, start=1):
    print(f"{count} Ticker: {ticker}, Keyword: {keyword}")

    # Fetch news
    news_data = fetch_news(keyword, start_date, end_date)

    # Save to CSV (one file per ticker inside stock_news_dir)
    news_data.to_csv(f"{stock_news_dir}/{ticker}_news_data.csv", index=False)
    print(f"\nNews data saved to CSV files for {ticker}.")

In [None]:
# Read every per-ticker CSV, tag its rows with the ticker, and merge them.
# The frames are collected in a list and concatenated once, which avoids
# re-copying the accumulated rows on every iteration and avoids seeding the
# concat with an empty DataFrame.
per_ticker_frames = []

for ticker, _keyword in ticker_keyword:
    print(f"Loading files for {ticker}")
    news_data_this = pd.read_csv(f"{stock_news_dir}/{ticker}_news_data.csv", parse_dates=["Date"])
    news_data_this["Stock_Name"] = ticker
    per_ticker_frames.append(news_data_this)
    print(f"Completed for {ticker}")

# pd.concat rejects an empty list, so keep the empty-frame result for that case.
news_data = pd.concat(per_ticker_frames) if per_ticker_frames else pd.DataFrame()

news_data.to_csv("news_data_raw.csv", index=False)

In [None]:
# Reload the merged articles from news_data_raw.csv, parsing the Date column as datetimes.
news_data = pd.read_csv("news_data_raw.csv", parse_dates=["Date"])

In [None]:
# Print the column dtypes and non-null counts of the reloaded frame.
news_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1917 entries, 0 to 1916
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         1917 non-null   datetime64[ns]
 1   Title        1917 non-null   object        
 2   Description  1864 non-null   object        
 3   URL          1917 non-null   object        
 4   Stock_Name   1917 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 75.0+ KB


In [None]:
# Preview the first ten rows.
news_data.head(10)

Unnamed: 0,Date,Title,Description,URL,Stock_Name
0,2024-12-02,"Japanese exchange DMM Bitcoin to shut down, tr...",The Japanese crypto exchange announced plans t...,https://biztoc.com/x/fd9935b88d2b2a1b,SBIN.NS
1,2024-11-30,SBI Cards: Time to dis’card’?,The stock of SBI Cards and Payment Services ha...,https://www.thehindubusinessline.com/portfolio...,SBIN.NS
2,2024-12-02,SBI customers: How to activate an inoperative ...,Inoperative SBI bank account: The bank emphasi...,https://economictimes.indiatimes.com/wealth/sa...,SBIN.NS
3,2024-12-05,SBI Chief Setty wants branch staff to “ask for...,This comes amid intense competition for savers...,https://www.thehindubusinessline.com/money-and...,SBIN.NS
4,2024-12-03,SBI Shinsei Bank Raises Interest Rate to 0.3% ...,"Starting in December, SBI Shinsei Bank has rai...",https://newsonjapan.com/article/144308.php,SBIN.NS
5,2024-12-14,"SBI MCLR from December 15, 2024: Check latest ...",SBI lending rates: The State Bank of India (SB...,https://economictimes.indiatimes.com/wealth/bo...,SBIN.NS
6,2024-12-06,READYFORと株式会社SBI証券が業務提携を開始,[READYFOR株式会社]\n[画像: https://prcdn.freetls.fas...,https://prtimes.jp/main/html/rd/p/000000386.00...,SBIN.NS
7,2024-12-08,SBI Research anticipates RBI repo rate cut in ...,SBI Research believes that GDP growth for 2024...,https://www.thehindubusinessline.com/money-and...,SBIN.NS
8,2024-12-02,Customers parking more funds in term deposits ...,"SBI report shows rise in term deposits, surpas...",https://www.thehindubusinessline.com/money-and...,SBIN.NS
9,2024-12-16,"Indian firms see revenue surge in FY24, but em...","SBI report: 4,000 Indian companies see 6% reve...",https://www.thehindubusinessline.com/companies...,SBIN.NS


In [None]:
# Date-only copy of the publication timestamp (time of day set to midnight).
news_data["Date_Formated"] = pd.to_datetime(news_data["Date"].dt.date)

# Lower-case both text fields and remove every character that is neither a
# word character nor whitespace.
for text_column in ("Title", "Description"):
    lowered = news_data[text_column].str.lower()
    news_data[text_column] = lowered.str.replace(r"[^\w\s]", "", regex=True)

In [None]:
# Dates of the holidays that the `holidays` package lists for India in 2024.
indian_holidays = holidays.India(years=[2024]).keys()
# Business-day offset that skips the holidays above (and, with the default
# week mask of CustomBusinessDay, Saturdays and Sundays).
indian_business_day = pd.offsets.CustomBusinessDay(holidays=list(indian_holidays))

# Holiday lookup set built directly from indian_holidays. The dates are stored
# as pandas Timestamps so that membership tests with values taken from a
# datetime64 column match on both equality and hash.
indian_holidays_set = {pd.Timestamp(day) for day in indian_holidays}


# Function to get the next business day in India
def next_indian_business_day(date):
    """
    Move a holiday date forward to the next Indian business day.

    Parameters
    ----------
    date : pandas.Timestamp
        Date to check; expected at midnight so it can match the holiday set.

    Returns
    -------
    pandas.Timestamp
        The first business day strictly after ``date`` when ``date`` is in
        ``indian_holidays_set``; otherwise ``date`` itself, unchanged.

    NOTE(review): only holidays are checked, so a Saturday or Sunday that is
    not a holiday is returned as-is. If the goal is to align with trading
    days, confirm whether weekends should be rolled forward too.
    NOTE(review): the holiday list covers 2024 only, so a roll-over into
    another year would not skip that year's holidays.
    """
    if date not in indian_holidays_set:
        return date

    # Start searching on the following calendar day; date_range snaps the
    # start forward to the first date that lies on the business-day offset.
    day_after = date + pd.Timedelta(days=1)
    return pd.date_range(day_after, periods=1, freq=indian_business_day)[0]

In [None]:
# List the distinct date-only publication dates present in the data.
news_data["Date_Formated"].unique()

<DatetimeArray>
['2024-12-02 00:00:00', '2024-11-30 00:00:00', '2024-12-05 00:00:00',
 '2024-12-03 00:00:00', '2024-12-14 00:00:00', '2024-12-06 00:00:00',
 '2024-12-08 00:00:00', '2024-12-16 00:00:00', '2024-12-11 00:00:00',
 '2024-12-13 00:00:00', '2024-12-19 00:00:00', '2024-12-04 00:00:00',
 '2024-12-15 00:00:00', '2024-12-18 00:00:00', '2024-12-20 00:00:00',
 '2024-12-24 00:00:00', '2024-12-23 00:00:00', '2024-12-27 00:00:00',
 '2024-12-26 00:00:00', '2024-12-12 00:00:00', '2024-12-07 00:00:00',
 '2024-12-17 00:00:00', '2024-12-09 00:00:00', '2024-11-29 00:00:00',
 '2024-12-21 00:00:00', '2024-12-25 00:00:00', '2024-11-27 00:00:00',
 '2024-11-28 00:00:00', '2024-12-10 00:00:00', '2024-12-22 00:00:00',
 '2024-12-01 00:00:00']
Length: 31, dtype: datetime64[ns]

In [None]:
# Map each publication date through next_indian_business_day, one value at a time.
news_data["Adjusted_Date"] = news_data["Date_Formated"].apply(next_indian_business_day)

In [None]:
# List the distinct dates that remain after the holiday adjustment.
news_data["Adjusted_Date"].unique()

<DatetimeArray>
['2024-12-02 00:00:00', '2024-11-30 00:00:00', '2024-12-05 00:00:00',
 '2024-12-03 00:00:00', '2024-12-14 00:00:00', '2024-12-06 00:00:00',
 '2024-12-08 00:00:00', '2024-12-16 00:00:00', '2024-12-11 00:00:00',
 '2024-12-13 00:00:00', '2024-12-19 00:00:00', '2024-12-04 00:00:00',
 '2024-12-15 00:00:00', '2024-12-18 00:00:00', '2024-12-20 00:00:00',
 '2024-12-24 00:00:00', '2024-12-23 00:00:00', '2024-12-27 00:00:00',
 '2024-12-26 00:00:00', '2024-12-12 00:00:00', '2024-12-07 00:00:00',
 '2024-12-17 00:00:00', '2024-12-09 00:00:00', '2024-11-29 00:00:00',
 '2024-12-21 00:00:00', '2024-11-27 00:00:00', '2024-11-28 00:00:00',
 '2024-12-10 00:00:00', '2024-12-22 00:00:00', '2024-12-01 00:00:00']
Length: 30, dtype: datetime64[ns]

In [None]:
# Keep only the first occurrence of each title within a stock.
news_data = news_data.drop_duplicates(subset=["Stock_Name", "Title"], keep="first")

In [None]:
# Re-check row counts and dtypes after dropping duplicate titles per stock.
news_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1806 entries, 0 to 1916
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           1806 non-null   datetime64[ns]
 1   Title          1806 non-null   object        
 2   Description    1753 non-null   object        
 3   URL            1806 non-null   object        
 4   Stock_Name     1806 non-null   object        
 5   Date_Formated  1806 non-null   datetime64[ns]
 6   Adjusted_Date  1806 non-null   datetime64[ns]
dtypes: datetime64[ns](3), object(4)
memory usage: 112.9+ KB


In [None]:
# Wrap title and description in marker tokens and append ---newarticle--- so
# the text can be split again for NLP.
# Missing values are replaced with "" first: astype(str) alone would turn a
# missing description into the literal word "nan" inside the text.
title_text = news_data['Title'].fillna('').astype(str)
body_text = news_data['Description'].fillna('').astype(str)
news_data['full_news'] = ' ---title--- ' + title_text + ' ---body--- ' + body_text + ' ---newarticle--- '

In [None]:
# Independent copy of the article-level frame; the per-stock, per-date
# aggregation is carried out on this copy.
news_data_combined_same_date = news_data.copy()

In [None]:
# Index both frames by the adjusted date and order them chronologically.
# kind='stable' keeps rows that share a date in their existing relative order,
# so the result is deterministic.
news_data = news_data.set_index('Adjusted_Date').sort_index(kind='stable')

# sort_index() returns a new frame; assign it back so the sorted order is
# actually kept instead of only being displayed.
news_data_combined_same_date = (
    news_data_combined_same_date
    .set_index('Adjusted_Date')
    .sort_index(kind='stable')
)

# Last expression of the cell: display the sorted frame.
news_data_combined_same_date

Unnamed: 0_level_0,Date,Title,Description,URL,Stock_Name,Date_Formated,full_news
Adjusted_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-11-27,2024-11-27,vedanta stock price 039 per cent as sensex,as of 30sep2024 promoters held 00 per cent sta...,https://economictimes.indiatimes.com/markets/s...,LUPIN.NS,2024-11-27,---title--- vedanta stock price 039 per cent ...
2024-11-27,2024-11-27,ntpc green energys mcap crosses rs 1 lakh cror...,the issue open for subscription from november ...,https://economictimes.indiatimes.com/markets/s...,LUPIN.NS,2024-11-27,---title--- ntpc green energys mcap crosses r...
2024-11-27,2024-11-27,ntpc green energy shares rally 10 after premiu...,ntpc green energy shares were listed at 11150 ...,https://www.thehindubusinessline.com/markets/s...,LUPIN.NS,2024-11-27,---title--- ntpc green energy shares rally 10...
2024-11-27,2024-11-27,ntpc green energy shares jump 10 post listing ...,shares of ntpc green energy rose 10 to rs 1227...,https://economictimes.indiatimes.com/markets/s...,LUPIN.NS,2024-11-27,---title--- ntpc green energy shares jump 10 ...
2024-11-27,2024-11-27,ntpc green energy signs mou with chhattisgarh ...,ntpc green energy limited ngel has signed a me...,https://economictimes.indiatimes.com/industry/...,LUPIN.NS,2024-11-27,---title--- ntpc green energy signs mou with ...
...,...,...,...,...,...,...,...
2024-12-27,2024-12-27,hdfc amc share price 053 per cent,a total of 431 shares changed hands on the cou...,https://economictimes.indiatimes.com/markets/s...,HDFCBANK.NS,2024-12-27,---title--- hdfc amc share price 053 per cent...
2024-12-27,2024-12-27,billiondollar bruises in 2024 reliance industr...,in 2024 while junior nifty50 and smallcaps del...,https://economictimes.indiatimes.com/markets/s...,RELIANCE.NS,2024-12-27,---title--- billiondollar bruises in 2024 rel...
2024-12-27,2024-12-27,hyundai ioniq 5 ev sets guinness world record ...,a hyundai ioniq 5 ev has set a guinness world ...,https://paultan.org/2024/12/27/hyundai-ioniq-5...,LUPIN.NS,2024-12-27,---title--- hyundai ioniq 5 ev sets guinness ...
2024-12-27,2024-12-27,hdfc credit card rewards points how to redeem ...,hdfc credit card reward points offer benefits ...,https://www.livemint.com/money/personal-financ...,LUPIN.NS,2024-12-27,---title--- hdfc credit card rewards points h...


In [None]:
# Turn Adjusted_Date back into an ordinary column so it can be used as a grouping key.
news_data_combined_same_date = news_data_combined_same_date.reset_index()

# For every (stock, adjusted date) pair, join all article strings with a single
# space and write the joined text onto each row of that group.
articles_by_stock_day = news_data_combined_same_date.groupby(['Stock_Name', 'Adjusted_Date'])['full_news']
news_data_combined_same_date['news_combined'] = articles_by_stock_day.transform(
    lambda texts: ' '.join(texts)
)


In [None]:
# Every row of a (stock, adjusted date) group carries the same news_combined
# text, so only the first row of each group is retained.
news_data_combined_same_date = news_data_combined_same_date.drop_duplicates(
    subset=['Stock_Name', 'Adjusted_Date'],
    keep='first',
)

In [None]:
# Print the column dtypes and non-null counts of the aggregated frame.
news_data_combined_same_date.info()

<class 'pandas.core.frame.DataFrame'>
Index: 402 entries, 0 to 1804
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Adjusted_Date  402 non-null    datetime64[ns]
 1   Date           402 non-null    datetime64[ns]
 2   Title          402 non-null    object        
 3   Description    387 non-null    object        
 4   URL            402 non-null    object        
 5   Stock_Name     402 non-null    object        
 6   Date_Formated  402 non-null    datetime64[ns]
 7   full_news      402 non-null    object        
 8   news_combined  402 non-null    object        
dtypes: datetime64[ns](3), object(6)
memory usage: 31.4+ KB


In [None]:
# Write the per-stock, per-date aggregate.
news_data_combined_same_date.to_csv("news_data_combined_same_date_1.csv", index=False)

# news_data is indexed by Adjusted_Date at this point, so index=False on its
# own would silently leave that column out of the file; move the index back
# into the columns before writing.
news_data.reset_index().to_csv("news_data_sentiment_1.csv", index=False)