## 🔧 Setup

In [1]:
import os
import time
import requests
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from prophet import Prophet
import nltk
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from langchain_groq import ChatGroq
from bs4 import BeautifulSoup
from urllib.parse import urljoin

nltk.download('vader_lexicon')
nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")
analyzer = SentimentIntensityAnalyzer()

os.environ['GROQ_API_KEY'] = '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'
marketstack_api_key = '694d1250c8ae084e4046d6dc6fbf5f81'


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from datetime import datetime
recommendation_log = []

def log_recommendation(symbol, sentiment, forecast_direction, recommendation, forecast_date):
    recommendation_log.append({
        "Date": forecast_date,
        "Stock": symbol,
        "Sentiment": sentiment,
        "Forecast Direction": forecast_direction,
        "Recommendation": recommendation,
        "Actual Movement": None,
        "Correct": None
    })


## 📰 Article Scraper

In [3]:
def get_articles():
    article_title = []
    article_urls = []
    url = "https://www.cnbctv18.com/market/stocks/"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all('a', class_='jsx-95506e352219bddb story-media')
    for article in articles:
        try:
            title = article['title'].strip()
            link = urljoin(url, article['href'])
            article_title.append(title)
            article_urls.append(link)
        except:
            continue
    return article_title, article_urls


## 📈 Marketstack Fetcher

In [4]:
def get_marketstack_data(symbol, api_key, start_date, end_date):
    url = "http://api.marketstack.com/v1/eod"
    params = {
        'access_key': api_key,
        'symbols': symbol,
        'date_from': start_date,
        'date_to': end_date,
        'limit': 1000
    }

    response = requests.get(url, params=params)
    try:
        data = response.json()
    except ValueError:
        print(f"❌ JSON Error for {symbol}: {response.text[:200]}")
        return pd.DataFrame()

    if 'data' not in data or not data['data']:
        
        return pd.DataFrame()

    df = pd.DataFrame(data['data'])
    df['date'] = pd.to_datetime(df['date'])
    df.rename(columns={
        'date': 'Date',
        'open': 'Open',
        'close': 'Close',
        'high': 'High',
        'low': 'Low',
        'volume': 'Volume'
    }, inplace=True)
    return df[['Date', 'Open', 'Close', 'High', 'Low', 'Volume']]










In [5]:
def get_stock_data(symbol, start_date, end_date):
    # First try MarketStack
    df = get_marketstack_data(symbol, marketstack_api_key, start_date, end_date)


# append XNSE
    if df.empty:
        symbol=symbol+".XNSE"
        df = get_marketstack_data(symbol, marketstack_api_key, start_date, end_date)
        
    return df

   

In [6]:
def get_forecast_direction(forecast_values):
    if not forecast_values or len(forecast_values) < 2:
        return "flat"
    if forecast_values[-1] > forecast_values[0]:
        return "up"
    elif forecast_values[-1] < forecast_values[0]:
        return "down"
    else:
        return "flat"

def get_recommendation(sentiment, forecast_direction):
    if sentiment == "positive" and forecast_direction == "up":
        return "✅ This article is positive. Based on forecasts and sentiment, consider watching or buying."
    elif sentiment == "negative" and forecast_direction == "down":
        return "❌ This article is negative. Based on forecasts and sentiment, consider avoiding this stock."
    else:
        return "⚠️ The signals are mixed. You may consider waiting before making a decision."


## 🔮 Forecasting Function

In [None]:
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from prophet import Prophet

def forecast_stock(symbol, n_future_days=30, export=True):
    """
    Forecast the next `n_future_days` of stock prices using Facebook Prophet.

    Args:
        symbol (str): Ticker symbol of the stock.
        n_future_days (int): Number of days to forecast beyond current date.
    
    Returns:
        tuple: (forecasted yhat values list, forecast DataFrame)
    """
    start_date = "2015-01-01"
    end_date = datetime.now().strftime("%Y-%m-%d")
    
    print(f"\n📈 Forecasting next {n_future_days} days for: {symbol}")
    df = get_stock_data(symbol, start_date, end_date)

    if df.empty:
        print(f"⚠️ No historical data found for {symbol}. Skipping.")
        return [], None

    if len(df) < 100:
        print(f"⚠️ Not enough data points for {symbol} ({len(df)} rows). Skipping.")
        return [], None

    try:
        # Sort and feature engineering
        df.sort_values('Date', inplace=True)
        df['MA_20'] = df['Close'].rolling(window=20).mean()
        for i in range(1, 8):
            df[f'Close_lag_{i}'] = df['Close'].shift(i)
        df.dropna(inplace=True)

        # Prepare for Prophet
        df['ds'] = pd.to_datetime(df['Date']).dt.tz_localize(None)
        df.rename(columns={'Close': 'y'}, inplace=True)

        # Fit model
        model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
        model.add_regressor('Volume')
        model.add_regressor('MA_20')
        for i in range(1, 8):
            model.add_regressor(f'Close_lag_{i}')
        model.fit(df)

        # Create future DataFrame
        future = model.make_future_dataframe(periods=n_future_days)
        
        # Merge regressors
        future = pd.merge(future, df.drop(columns=['y']), on='ds', how='left')

        # Forward-fill regressors for future dates
        future.fillna(method='ffill', inplace=True)

        # Forecast
        forecast = model.predict(future)

        # Plot results
        plt.figure(figsize=(14, 6))
        plt.plot(df['ds'], df['y'], label='Historical', linewidth=2)
        plt.plot(forecast['ds'], forecast['yhat'], label='Forecast', linestyle='--', linewidth=2)
        plt.fill_between(forecast['ds'], forecast['yhat_lower'], forecast['yhat_upper'], color='skyblue', alpha=0.3)
        plt.axvline(x=df['ds'].max(), color='gray', linestyle='--', label='Forecast Start')
        plt.title(f"{symbol} - Forecast Next {n_future_days} Days")
        plt.xlabel("Date")
        plt.ylabel("Close Price")
        plt.legend()
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        forecast_future = forecast.tail(n_future_days)[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].copy()
        forecast_future.columns = ['Date', 'Forecast', 'Lower Bound', 'Upper Bound']
        forecast_future['Date'] = forecast_future['Date'].dt.strftime('%Y-%m-%d')

        # Show table
        display(forecast_future)

        # Export to CSV
        if export:
            filename = f"forecast_{symbol}_{n_future_days}_days.csv"
            forecast_future.to_csv(filename, index=False)
            print(f"📁 Forecast saved to: {os.path.abspath(filename)}")

        return forecast_future['Forecast'].tolist(), forecast_future

    except Exception as e:
        print(f"❌ Forecasting failed for {symbol}: {e}")
        return [], None


## 🤖 Analyze Article

In [14]:
def analyze_article(url, symbols_df):
    import re
    from langchain.document_loaders import UnstructuredURLLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from difflib import get_close_matches

    loader = UnstructuredURLLoader(urls=[url])
    data = loader.load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = splitter.split_documents(data)
    docs = [doc for doc in docs if len(doc.page_content.strip()) > 100]
    text = docs[0].page_content[:1000] if docs else ""

    llm = ChatGroq(model="llama3-8b-8192", temperature=0.7, max_tokens=512)
    
    # Summarize and get sentiment
    summary = llm.invoke(f"Summarize this article:\n{text}").content
    sentiment = 'negative' if analyzer.polarity_scores(summary)['compound'] < 0 else 'positive'

    # Get company names from LLM instead of SpaCy NER
    company_prompt = f"List publicly traded companies mentioned in this article:\n{text}"
    companies_text = llm.invoke(company_prompt).content
    raw_names = [line.strip().upper() for line in companies_text.split('\n') if line.strip()]

    # Clean names
    cleaned_names = []
    for name in raw_names:
        name = re.sub(r'[^A-Z &]', '', name)  # remove numbers, punctuation
        name = name.strip()
        if len(name) > 3:
            cleaned_names.append(name)

    stock_names = list(set(cleaned_names))

    # Generate variations
    variations = []
    for name in stock_names:
        if '&' in name:
            variations.append(name.replace('&', 'AND'))
        elif 'AND' in name:
            variations.append(name.replace('AND', '&'))
        if 'LTD' in name:
            variations.append(name.replace('LTD', 'LIMITED'))
        elif 'LIMITED' in name:
            variations.append(name.replace('LIMITED', 'LTD'))
    stock_names.extend(variations)
    stock_names = list(set(stock_names))

    # Match to symbols
    symbols_df['Name'] = symbols_df['Name'].str.upper()
    matched = []

    for name in stock_names:
        exact = symbols_df[symbols_df['Name'] == name]['Symbol'].tolist()
        if exact:
            matched.extend(exact)
        else:
            try:
                partial = symbols_df[symbols_df['Name'].str.contains(re.escape(name), na=False, regex=True)]['Symbol'].tolist()
                if partial:
                    matched.extend(partial)
                else:
                    close_names = get_close_matches(name, symbols_df['Name'], n=1, cutoff=0.85)
                    if close_names:
                        symbol = symbols_df[symbols_df['Name'] == close_names[0]]['Symbol'].values[0]
                        matched.append(symbol)
            except Exception as e:
                print(f"Regex error for name: {name} → {e}")

    matched = list(set(matched))  # Remove duplicates

    if not matched:
        print(f"⚠️ No symbols matched for these names: {stock_names}")

    return summary, sentiment, stock_names, matched


## 🚀 Run Pipeline

In [16]:
symbols_df = pd.read_csv('merged_cleaned stock&symbols.csv')[['Name', 'Symbol']]
symbols_df = symbols_df.dropna(subset=["Symbol"])
symbols_df = symbols_df[symbols_df["Symbol"] != "#REF!"]

titles, urls = get_articles()
for title, url in zip(titles, urls[:3]):
    print(f"\nProcessing article: {title}\n{url}")
    try:
        summary, sentiment, stock_names, matched_symbols = analyze_article(url, symbols_df)
        print(f"Summary: {summary}")
        print(f"Sentiment: {sentiment}")
        print(f"Companies: {stock_names}")
        print(f"Symbols: {matched_symbols}")

        for symbol in matched_symbols:
            print(f"\nForecasting for: {symbol}")
            forecast_values, forecast_date = forecast_stock(symbol)
            forecast_direction = get_forecast_direction(forecast_values)
            recommendation = get_recommendation(sentiment, forecast_direction)
            print(f"Recommendation: {recommendation}")

            if forecast_date:
                log_recommendation(symbol, sentiment, forecast_direction, recommendation, forecast_date)


            time.sleep(15)
    except Exception as e:
        print(f"Error processing article: {e}")



Processing article: Why financials attracted strong FPI buying in March
https://www.cnbctv18.com/market/stocks/financials-services-strong-fpi-buying-in-march-numbers-reason-sector-19585041.htm
⚠️ No symbols matched for these names: ['THERE ARE NO PUBLICLY TRADED COMPANIES MENTIONED IN THIS ARTICLE THE ARTICLE DISCUSSES FOREIGN PORTFOLIO INVESTORS FPI BUYING FINANCIAL SERVICES BUT DOES NOT MENTION SPECIFIC COMPANIES']
Summary: The article reports that foreign portfolio investors (FPIs) showed strong buying interest in financial services stocks in March, with investments of over $2 billion in the second half of the month. This rebounded from an outflow of $380 million in the first half of the month. Financial services accounted for nearly a third of FPI's total investments in March.
Sentiment: positive
Companies: ['THERE ARE NO PUBLICLY TRADED COMPANIES MENTIONED IN THIS ARTICLE THE ARTICLE DISCUSSES FOREIGN PORTFOLIO INVESTORS FPI BUYING FINANCIAL SERVICES BUT DOES NOT MENTION SPECIFIC

03:12:44 - cmdstanpy - INFO - Chain [1] start processing
03:12:45 - cmdstanpy - INFO - Chain [1] done processing


Recommendation: ⚠️ The signals are mixed. You may consider waiting before making a decision.
Error processing article: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

Processing article: VIX closes at highest since 2020 as stock slide accelerates
https://www.cnbctv18.com/market/vix-closes-at-highest-since-2020-as-stock-slide-accelerates-19584987.htm
⚠️ No symbols matched for these names: ['THERE ARE NO PUBLICLY TRADED COMPANIES MENTIONED IN THIS ARTICLE THE ARTICLE APPEARS TO BE DISCUSSING THE CBOE VOLATILITY INDEX VIX AND ITS RELATION TO THE S&P  STOCK MARKET INDEX BUT IT DOES NOT MENTION ANY SPECIFIC PUBLICLY TRADED COMPANIES', 'THERE ARE NO PUBLICLY TRADED COMPANIES MENTIONED IN THIS ARTICLE THE ARTICLE APPEARS TO BE DISCUSSING THE CBOE VOLATILITY INDEX VIX AND ITS RELATION TO THE SANDP  STOCK MARKET INDEX BUT IT DOES NOT MENTION ANY SPECIFIC PUBLICLY TRADED COMPANIES']
Summary: The article reports that the Cboe Volatility Index (VI