In [1]:
from news_scraper.article import load_articles, get_content
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import tensorflow as tf

In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/abhi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
articles = load_articles("money-control-articles-formatted")
articles

[Article(url='https://www.moneycontrol.com/news/business/sterlite-power-bags-projects-worth-rs-2500-crore-in-q4-12711471.html', title='Sterlite Power bags projects worth Rs 2,500 crore in Q4', description='This takes the company's cumulative order wins in FY24 to Rs 7,000 crore, up 35 per cent year-on-year (YoY) over FY23, Sterlite Power said in a statement.Sterlite Power bags projects worth Rs 2,500 crore in Q4Related stories', timestamp='2024-04-30 13:08:00+05:30', market timestamp='2024-04-30 13:08:00+05:30', off market hours='False'),
 Article(url='https://www.moneycontrol.com/news/business/embassy-group-hospitality-firm-olive-clocks-76-revenue-growth-in-fy24-to-rs-51-cr-12744929.html', title='Embassy Group hospitality firm Olive clocks 76% revenue growth in FY24 to Rs 51 cr', description='At present, Olive has 55 co-living centres and hotel properties, signed and operational, in Bengaluru, Mumbai and Goa. There are 2,688 keys in these properties.Embassy Group hospitality firm Oliv

In [4]:
len(articles)

659

In [5]:
article_content = get_content(articles)
article_content

['sterlite power on tuesday announced securing new orders worth rs <TWO_THOUSAND> crore during the quarter ended march <THREE_TEN> this takes the companys cumulative order wins in fy24 to rs <SEVEN_THOUSAND> crore up <THREE_TEN_PERCENT> year on year yoy over fy23 sterlite power said in a statement sterlite power has capped off a successful fiscal year fy24 with new orders worth rs <TWO_THOUSAND> crore for its global products and services gps business unit in q4 this growth underscores the growing global and domestic demand for robust power transmission infrastructure particularly to support the integration of renewable energy sources critical for a net zero future the company said in q4 some of the projects secured by the company are for conductor supply to green energy power transmission projects in rajasthan including the fatehgarh bhadla transmission line project the company also bagged export orders for opgw optical ground wire in africa and cis commonwealth of independent states m

In [6]:
for article in articles:
    article.add_content_word_dict()

In [7]:
word_set = set()
for article in articles:
    word_set.update(article.word_dict.keys())
word_set

{'offering',
 'campa',
 'ntpcs',
 'levelling',
 'weekly',
 'doctored',
 'within',
 'questioning',
 'aluminum',
 'bilateral',
 'acquiring',
 'jeep',
 'coke',
 'circle',
 'identified',
 'school',
 'structural',
 'inroads',
 'ministerial',
 'bala',
 'subramanian',
 'inadequate',
 'director',
 'tighter',
 'refined',
 'loaded',
 'fenty',
 'flavouring',
 'desktop',
 'macroeconomy',
 'bharat',
 'unsoo',
 'pv',
 'forth',
 'appears',
 'ixigos',
 'inequality',
 'respected',
 'spanning',
 'institutions',
 'passed',
 'gic',
 'orbit',
 'cyrus',
 'dont',
 'cp',
 'petition',
 'zydus',
 'scaling',
 'revival',
 'chivas',
 'membrane',
 'di',
 'dow',
 'overwhelm',
 '<FIVE_PERCENT>',
 'buying',
 'binding',
 'maggi',
 'yelne',
 'decisions',
 'sovereignty',
 'growth',
 'violation',
 'troops',
 'demographic',
 'pill',
 'aiding',
 'print',
 'billions',
 'ocl',
 'investor',
 'wiring',
 'glamping',
 'del',
 'truck',
 'fortify',
 'investible',
 'abhinav',
 'lpg',
 'authoritative',
 'stainless',
 'rival',
 'exclu

In [8]:
len(word_set)

12620

In [9]:
for article in articles:
    sentiment = SentimentIntensityAnalyzer().polarity_scores(article.cleaned_content)
    article.negative_sentiment_score = sentiment.get("neg")
    article.neutral_sentiment_score = sentiment.get("neu")
    article.positive_sentiment_score = sentiment.get("pos")
    article.compound_sentiment_score = sentiment.get("compound")

In [10]:
data = {
    "timestamp": [article.market_timestamp for article in articles],
    "padded_content": [article.cleaned_content for article in articles],
    "negative_sentiment": [article.negative_sentiment_score for article in articles],
    "neutral_sentiment": [article.neutral_sentiment_score for article in articles],
    "positive_sentiment": [article.positive_sentiment_score for article in articles],
    "compound_sentiment": [article.compound_sentiment_score for article in articles],
    "off_market_hours": [1 if article.off_market_hours else 0 for article in articles]
}
base_df = pd.DataFrame(data)
# base_df = base_df.set_index("timestamp")
base_df

Unnamed: 0,timestamp,padded_content,negative_sentiment,neutral_sentiment,positive_sentiment,compound_sentiment,off_market_hours
0,2024-04-30 13:08:00+05:30,sterlite power on tuesday announced securing n...,0.014,0.867,0.119,0.9787,0
1,2024-06-10 13:51:00+05:30,olive a hospitality management startup promote...,0.008,0.929,0.064,0.9607,0
2,2024-05-28 14:22:00+05:30,lic chairman siddhartha mohanty has said that ...,0.012,0.885,0.103,0.9941,0
3,2024-05-03 09:15:00+05:30,icici bank ltd named two senior bankers to hea...,0.014,0.929,0.057,0.8519,1
4,2024-05-29 12:01:00+05:30,the largest professional services network delo...,0.016,0.840,0.144,0.9873,0
...,...,...,...,...,...,...,...
654,2024-07-24 13:43:00+05:30,adani green energy on wednesday said it has op...,0.000,0.834,0.166,0.9829,0
655,2024-02-08 10:31:00+05:30,essar oil and gas exploration production ltd i...,0.033,0.826,0.141,0.9922,0
656,2024-05-31 13:14:00+05:30,domestic passenger vehicle segment is expected...,0.014,0.808,0.178,0.9983,0
657,2024-05-13 09:15:00+05:30,,0.000,0.000,0.000,0.0000,1


In [11]:
with open("./stock list.txt", "r") as file:
    data = file.read()
stock_list = list(set(data.split()))
stock_list

['OLAELEC.NS',
 'KARURVYSYA.NS',
 'BAJAJ-AUTO.NS',
 'AXISBANK.NS',
 'BRITANNIA.NS',
 'IDFCFIRSTB.NS',
 'FEDERALBNK.NS',
 'NESTLEIND.NS',
 'BHARTIARTL.NS',
 'EICHERMOT.NS',
 'PNB.NS',
 'BAJAJFINSV.NS',
 'TATASTEEL.NS',
 'COALINDIA.NS',
 'CYIENT.NS',
 'SUNPHARMA.NS',
 'GOODLUCK.NS',
 'HINDALCO.NS',
 'IOC.NS',
 'POWERGRID.NS',
 'BAJFINANCE.NS',
 'BANKBARODA.NS',
 'AUBANK.NS',
 'LT.NS',
 'TCS.NS',
 'JSWSTEEL.NS',
 'HDFCBANK.NS',
 'SAMHI.NS',
 'TECHM.NS',
 'ICICIBANK.NS',
 'HDFC.NS',
 'INDIGO.NS',
 'BSOFT.NS',
 'ULTRACEMCO.NS',
 'PENIND.NS',
 'SHREECEM.NS',
 'ADANIPORTS.NS',
 'NTPC.NS',
 'TATAMOTORS.NS',
 'RELIANCE.NS',
 'PRAJIND.NS',
 'MARUTI.NS',
 'GRASIM.NS',
 'ZEEL.NS',
 'ZOMATO.NS',
 'BPCL.NS',
 'UJJIVANSFB.NS',
 'SBIN.NS',
 'WIPRO.NS',
 'EIHOTEL.NS',
 'HEROMOTOCO.NS',
 'CIPLA.NS',
 'INDUSTOWER.NS',
 'ONGC.NS',
 'ASIANPAINT.NS',
 'ITC.NS',
 'TITAN.NS',
 'HINDUNILVR.NS',
 'INDUSINDBK.NS',
 'ASHOKLEY.NS',
 'DRREDDY.NS',
 'GAIL.NS',
 'KOTAKBANK.NS',
 'IDBI.NS',
 'UPL.NS',
 'BANDHANBNK.NS'

In [12]:
def stock_close_values(stock: str) -> pd.Series:
    stock_data = pd.read_csv(f"./stock-prices/{stock}.csv")
    if len(stock_data) == 0:
        return pd.Series()
    
    stock_data["timestamp"] = pd.to_datetime(stock_data["Datetime"])
    stock_data = stock_data.set_index("timestamp")
    stock_close = stock_data["Close"]
    stock_close.rename(f"{stock.lower()}_close", inplace=True)
    return stock_close

In [13]:
def join_stock(base: pd.DataFrame, stock: str) -> pd.DataFrame:
    stock_close = stock_close_values(stock)
    if len(stock_close) == 0:
        return base
    return base.join(stock_close, how="inner", on="timestamp", lsuffix=stock)

In [14]:
df = base_df
for stock in stock_list:
    df = join_stock(df, stock)
df

Unnamed: 0,timestamp,padded_content,negative_sentiment,neutral_sentiment,positive_sentiment,compound_sentiment,off_market_hours,olaelec.ns_close,karurvysya.ns_close,bajaj-auto.ns_close,...,gail.ns_close,kotakbank.ns_close,idbi.ns_close,upl.ns_close,bandhanbnk.ns_close,niftyietf.ns_close,m&m.ns_close,infy.ns_close,hcltech.ns_close,vedl.ns_close
144,2024-08-20 09:15:00+05:30,the shapoorji pallonji group has set up a new ...,0.0,0.864,0.136,0.9854,1,154.759995,220.949997,9801.75,...,239.979996,1780.800049,96.290001,564.299988,193.419998,273.01001,2777.600098,1874.550049,1677.599976,444.649994
232,2024-08-13 12:26:00+05:30,after losing the manufacturing plants of benga...,0.032,0.857,0.111,0.9863,0,113.160004,214.0,9700.0,...,230.179993,1782.0,95.230003,555.25,192.649994,269.540009,2730.550049,1803.849976,1593.199951,427.649994
