In [6]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from pathlib import Path
import time
import logging
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('market_data.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


In [7]:
class IndianMarketData:
    def __init__(self):
        # Create directories for data storage
        self.base_dir = Path('data')
        self.raw_dir = self.base_dir / 'raw'
        self.processed_dir = self.base_dir / 'processed'
        
        # Create directories if they don't exist
        for directory in [self.base_dir, self.raw_dir, self.processed_dir]:
            directory.mkdir(parents=True, exist_ok=True)
        
        # Initialize indices from Wikipedia
        self.indices = self.get_index_data()
        
    def get_index_data(self):
        """Fetch NSE index data from Wikipedia"""
        try:
            url = 'https://en.wikipedia.org/wiki/NSE_Indices'
            tables = pd.read_html(url)
            
            # Define known Yahoo Finance symbols
            known_symbols = {
                'NIFTY 50': '^NSEI',
                'NIFTY BANK': '^NSEBANK',
                'NIFTY IT': '^CNXIT',
                'NIFTY AUTO': '^0P0001PQB7',
                'NIFTY FINANCIAL SERVICES': '^CNXFINANCE',
                'NIFTY FMCG': '^CNXFMCG',
                'NIFTY METAL': '^CNXMETAL',
                'NIFTY PHARMA': '^CNXPHARMA'
            }
            
            logger.info(f"Successfully fetched index data with {len(known_symbols)} indices")
            return known_symbols
            
        except Exception as e:
            logger.error(f"Error fetching index data: {e}")
            # Fallback to default indices
            return {
                'NIFTY 50': '^NSEI',
                'NIFTY BANK': '^NSEBANK',
                'NIFTY IT': '^CNXIT',
                'NIFTY AUTO': '^0P0001PQB7'
            }

    def get_nifty50_symbols(self):
        """Get list of Nifty 50 companies with retry mechanism"""
        max_retries = 3
        retry_delay = 2
        
        for attempt in range(max_retries):
            try:
                nifty50 = pd.read_html('https://en.wikipedia.org/wiki/NIFTY_50')[1]
                print(nifty50)
                symbols = nifty50['Symbol'].tolist()
                print(symbols)
                symbols = [f"{symbol}.NS" for symbol in symbols]
                logger.info(f"Successfully fetched {len(symbols)} Nifty 50 symbols")
                return symbols
            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed: {e}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)
                    retry_delay *= 2
                else:
                    logger.error("All attempts to fetch Nifty 50 symbols failed")
                    return []

    def fetch_data(self, symbol, start_date, end_date, is_index=False):
        """Generic data fetching function with retry mechanism"""
        max_retries = 3
        retry_delay = 2
        
        for attempt in range(max_retries):
            try:
                if is_index:
                    df = yf.download(symbol, start=start_date, end=end_date)
                else:
                    df = yf.Ticker(symbol).history(start=start_date, end=end_date)
                
                if df is not None and not df.empty:
                    df.index = df.index.date
                    return df
                
                logger.warning(f"Empty data received for {symbol}")
                return None
                
            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed for {symbol}: {e}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)
                    retry_delay *= 2
                else:
                    logger.error(f"All attempts to fetch data for {symbol} failed")
                    return None

    def save_to_csv(self, df, filename):
        """Save data to CSV with error handling"""
        try:
            filepath = self.raw_dir / filename
            
            # Check if the file already exists
            if filepath.exists():
                # Append without writing the header
                df.to_csv(filepath, mode='a', header=False)
                logger.info(f"Appended data to {filepath}")
            else:
                # Write with header
                df.to_csv(filepath)
                logger.info(f"Successfully saved data to {filepath}")
            
            return True
        except Exception as e:
            logger.error(f"Error saving data to {filename}: {e}")
            return False

    def process_symbol(self, symbol, start_date, end_date, is_index=False):
        """Process a single symbol"""
        try:
            logger.info(f"Processing {'index' if is_index else 'stock'}: {symbol}")
            df = self.fetch_data(symbol, start_date, end_date, is_index)
            
            if df is not None and not df.empty:
                filename = f"{symbol.replace('^', '').replace('.NS', '')}_{'index' if is_index else 'stock'}_data.csv"
                return self.save_to_csv(df, filename)
            
            return False
            
        except Exception as e:
            logger.error(f"Error processing {symbol}: {e}")
            return False
        
    def clean_csv_file(self, filename):
        """Clean up the CSV file by removing duplicate header rows"""
        try:
            filepath = self.raw_dir / filename
            if filepath.exists():
                df = pd.read_csv(filepath)
                # Remove rows where 'Ticker' column has the ticker symbol
                df = df[df['Ticker'] != '^CNXIT']  # Adjust this based on the actual ticker symbol
                df.to_csv(filepath, index=False)
                logger.info(f"Cleaned up {filepath}")
            else:
                logger.warning(f"File {filepath} does not exist.")
        except Exception as e:
            logger.error(f"Error cleaning CSV file {filename}: {e}")

    def fetch_and_save_all_data(self, start_date, end_date, max_workers=4):
        """Main function to fetch and save all data using parallel processing"""
        successful_downloads = 0
        failed_downloads = 0
        
        # Process stocks
        stock_symbols = self.get_nifty50_symbols()
        
        # Process both stocks and indices using ThreadPoolExecutor
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit stock processing tasks
            stock_futures = {
                executor.submit(self.process_symbol, symbol, start_date, end_date): symbol 
                for symbol in stock_symbols
            }
            
            # Submit index processing tasks
            index_futures = {
                executor.submit(self.process_symbol, symbol, start_date, end_date, True): name 
                for name, symbol in self.indices.items()
            }
            
            # Process all futures
            for future in as_completed({**stock_futures, **index_futures}):
                symbol = stock_futures.get(future) or index_futures.get(future)
                try:
                    if future.result():
                        successful_downloads += 1
                    else:
                        failed_downloads += 1
                except Exception as e:
                    logger.error(f"Error processing {symbol}: {e}")
                    failed_downloads += 1

        # Clean up index data files
        for index_name in self.indices.keys():
            self.clean_csv_file(f"{index_name.replace(' ', '_')}_index_data.csv")
        
        logger.info(f"Download complete. Successful: {successful_downloads}, Failed: {failed_downloads}")
        return successful_downloads, failed_downloads


In [8]:
def main():
    try:
        # Set date range for data
        end_date = datetime.now()
        start_date = end_date - timedelta(days=365*5)  # 5 years of data
        
        logger.info(f"Starting data collection from {start_date} to {end_date}")
        
        # Initialize and run data collection
        market_data = IndianMarketData()
        successful, failed = market_data.fetch_and_save_all_data(start_date, end_date)
        
        logger.info(f"Data collection completed. Successful: {successful}, Failed: {failed}")
        
    except Exception as e:
        logger.error(f"Fatal error in main: {e }")


In [9]:
if __name__ == "__main__":
    main()

2024-10-29 17:56:12,032 - INFO - Starting data collection from 2019-10-31 17:56:12.032513 to 2024-10-29 17:56:12.032513
2024-10-29 17:56:12,529 - INFO - Successfully fetched index data with 8 indices
2024-10-29 17:56:13,272 - INFO - Successfully fetched 50 Nifty 50 symbols
2024-10-29 17:56:13,273 - INFO - Processing stock: ADANIENT.NS
2024-10-29 17:56:13,276 - INFO - Processing stock: ADANIPORTS.NS
2024-10-29 17:56:13,276 - INFO - Processing stock: APOLLOHOSP.NS
2024-10-29 17:56:13,277 - INFO - Processing stock: ASIANPAINT.NS


                       Company name      Symbol  \
0                 Adani Enterprises    ADANIENT   
1                 Adani Ports & SEZ  ADANIPORTS   
2                  Apollo Hospitals  APOLLOHOSP   
3                      Asian Paints  ASIANPAINT   
4                         Axis Bank    AXISBANK   
5                        Bajaj Auto  BAJAJ-AUTO   
6                     Bajaj Finance  BAJFINANCE   
7                     Bajaj Finserv  BAJAJFINSV   
8                Bharat Electronics         BEL   
9                  Bharat Petroleum        BPCL   
10                    Bharti Airtel  BHARTIARTL   
11             Britannia Industries   BRITANNIA   
12                            Cipla       CIPLA   
13                       Coal India   COALINDIA   
14         Dr. Reddy's Laboratories     DRREDDY   
15                    Eicher Motors   EICHERMOT   
16                Grasim Industries      GRASIM   
17                          HCLTech     HCLTECH   
18                        HDFC 

2024-10-29 17:56:13,814 - INFO - Appended data to data\raw\ADANIENT_stock_data.csv
2024-10-29 17:56:13,820 - INFO - Processing stock: AXISBANK.NS
2024-10-29 17:56:13,870 - INFO - Appended data to data\raw\APOLLOHOSP_stock_data.csv
2024-10-29 17:56:13,870 - INFO - Appended data to data\raw\ADANIPORTS_stock_data.csv
2024-10-29 17:56:13,873 - INFO - Processing stock: BAJAJ-AUTO.NS
2024-10-29 17:56:13,874 - INFO - Processing stock: BAJFINANCE.NS
2024-10-29 17:56:13,890 - INFO - Appended data to data\raw\ASIANPAINT_stock_data.csv
2024-10-29 17:56:13,892 - INFO - Processing stock: BAJAJFINSV.NS
2024-10-29 17:56:14,028 - INFO - Appended data to data\raw\AXISBANK_stock_data.csv
2024-10-29 17:56:14,029 - INFO - Processing stock: BEL.NS
2024-10-29 17:56:14,142 - INFO - Appended data to data\raw\BAJFINANCE_stock_data.csv
2024-10-29 17:56:14,154 - INFO - Appended data to data\raw\BAJAJ-AUTO_stock_data.csv
2024-10-29 17:56:14,155 - INFO - Processing stock: BPCL.NS
2024-10-29 17:56:14,157 - INFO - P