## Import libraries

In [6]:
import math
import pandas as pd
import numpy as np
import random
import yfinance as yf
import os

import requests
from bs4 import BeautifulSoup
import re
import pathlib

## Time period

In [4]:
# Date range shared by the download cells; both values are handed to
# load_historical_data as the start/end arguments of the download.
start_date, end_date = '2020-01-01', '2023-12-31'

## Functions for downloading

In [9]:
def load_historical_data(tickers: list, start_date: str, end_date: str, name_folder: str) -> None:
    """
    Downloads historical data for each ticker and stores it as a CSV file.

    Every ticker is requested separately through ``yf.download`` and written to
    ``../<name_folder>/<ticker>.csv``. The target folder is created when missing.
    Tickers whose download comes back with no rows are reported and skipped, so
    that no header-only CSV ends up in the folder.

    Parameters:
    ----------
    tickers : list
        Ticker symbols to download.
    start_date : str
        Start of the period, passed to ``yf.download`` as ``start``.
    end_date : str
        End of the period, passed to ``yf.download`` as ``end``.
    name_folder : str
        Name of the folder (created next to the parent directory) that receives the files.
    """
    folder = os.path.join('..', name_folder)
    # exist_ok removes the check-then-create race of a separate os.path.exists test.
    os.makedirs(folder, exist_ok=True)

    for ticker in tickers:
        print(f"Download historical data for {ticker}")
        data = pd.DataFrame(yf.download(ticker, start=start_date, end=end_date))
        # NOTE(review): assumes yfinance signals a failed/unknown ticker with an
        # empty frame rather than an exception -- confirm for the installed version.
        if data.empty:
            print(f"No data returned for {ticker}, skipping")
            continue
        data.to_csv(os.path.join(folder, f'{ticker}.csv'))
    print("Data loading is complete!")

def ticker_preprocessing(tickers: list, template_add: str) -> list:
    """
    Appends a suffix to every ticker.

    A new list is returned; the list passed in is left untouched so that calling
    the function again on the same input does not stack the suffix twice.

    Parameters:
    ----------
    tickers : list
        Ticker symbols (strings).
    template_add : str
        Suffix appended to each ticker, e.g. '.SA'.

    Returns:
    -------
    list
        The tickers with ``template_add`` appended, in the original order.
    """
    return [ticker + template_add for ticker in tickers]

def get_stock_tickers(url: str, template_parsing: str) -> list:
    """
    Scrapes ticker symbols from the table rows of a web page.

    The page is fetched with a browser-like User-Agent and parsed with
    BeautifulSoup. Every ``<tr class="row-RdUXZpkv listRow">`` row is searched
    for a ``data-rowkey="<template_parsing>:<ticker>"`` attribute and the
    ticker part is collected. The HTTP status is not checked, so an error
    page simply yields an empty list.

    Parameters:
    ----------
    url : str
        Address of the page to scrape.
    template_parsing : str
        Prefix that precedes the ticker inside ``data-rowkey`` (e.g. 'MOEX').

    Returns:
    -------
    list
        Tickers in the order their rows appear on the page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    }

    # Without a timeout requests may wait forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # The prefix is escaped so it is matched literally even if it contains regex
    # metacharacters; the pattern is compiled once instead of once per row.
    row_key = re.compile(r'data-rowkey="' + re.escape(template_parsing) + r':(.*?)"')

    tickers = []
    for row in soup.find_all('tr', class_='row-RdUXZpkv listRow'):
        match = row_key.search(str(row))
        if match:
            tickers.append(match.group(1))
    return tickers

def get_number_stocks(folder_path: str) -> None:
    """
    Prints how many regular files the folder contains.

    Sub-directories are not counted and are not descended into.

    Parameters:
    ----------
    folder_path : str
        Path of the folder to inspect.
    """
    candidates = (os.path.join(folder_path, name) for name in os.listdir(folder_path))
    file_count = sum(1 for candidate in candidates if os.path.isfile(candidate))
    print(f'Number of stocks in a folder: {file_count}')

## Downloading stock market data: Brazil

In [None]:
# Brazil: scrape the tickers whose row key starts with "BMFBOVESPA:", append the
# '.SA' suffix, download the quotes into ../Brazil and report the file count.
tickers = ticker_preprocessing(
    tickers=get_stock_tickers(
        url="https://ru.tradingview.com/markets/stocks-brazil/market-movers-large-cap/",
        template_parsing='BMFBOVESPA',
    ),
    template_add='.SA',
)

load_historical_data(tickers=tickers, start_date=start_date, end_date=end_date, name_folder='Brazil')

get_number_stocks(folder_path='../Brazil')

## Downloading stock market data: Russia

In [41]:
# Russia: scrape the tickers whose row key starts with "MOEX:", append the '.ME'
# suffix, download the quotes into ../Russia and report the file count.
tickers = get_stock_tickers(url="https://ru.tradingview.com/markets/stocks-russia/market-movers-large-cap/",
                            template_parsing='MOEX')

tickers = ticker_preprocessing(tickers=tickers,
                               template_add='.ME')

# Uses the shared start_date / end_date like every other download cell; the
# previously referenced start_date_ / end_date_ are not defined in this notebook.
load_historical_data(tickers=tickers,
                     start_date=start_date,
                     end_date=end_date,
                     name_folder='Russia')

get_number_stocks(folder_path='../Russia')

## Downloading stock market data: India

In [None]:

# India: scrape the tickers whose row key starts with "NSE:", append the '.NS'
# suffix, download the quotes into ../India and report the file count.
tickers = ticker_preprocessing(
    tickers=get_stock_tickers(
        url="https://ru.tradingview.com/markets/stocks-india/market-movers-large-cap/",
        template_parsing='NSE',
    ),
    template_add='.NS',
)

load_historical_data(tickers=tickers, start_date=start_date, end_date=end_date, name_folder='India')

get_number_stocks(folder_path='../India')

## Downloading stock market data: China

In [None]:
# China: scrape the tickers whose row key starts with "SZSE:", append the '.SZ'
# suffix, download the quotes into ../China and report the file count.
# Rows keyed with any other prefix are ignored by get_stock_tickers.
tickers = ticker_preprocessing(
    tickers=get_stock_tickers(
        url="https://ru.tradingview.com/markets/stocks-china/market-movers-large-cap/",
        template_parsing='SZSE',
    ),
    template_add='.SZ',
)

load_historical_data(tickers=tickers, start_date=start_date, end_date=end_date, name_folder='China')

get_number_stocks(folder_path='../China')

## Downloading stock market data: South Africa

In [None]:
# South Africa: scrape the tickers whose row key starts with "JSE:", append the
# '.JO' suffix, download the quotes into "../South Africa" and report the file count.
tickers = ticker_preprocessing(
    tickers=get_stock_tickers(
        url="https://ru.tradingview.com/markets/stocks-south-africa/market-movers-large-cap/",
        template_parsing='JSE',
    ),
    template_add='.JO',
)

load_historical_data(tickers=tickers, start_date=start_date, end_date=end_date, name_folder='South Africa')

get_number_stocks(folder_path='../South Africa')

## Downloading stock market data: NASDAQ

In [None]:
# NASDAQ: scrape the index components whose row key starts with "NASDAQ:".
# Unlike the other markets, the tickers are passed on without a suffix.
tickers = get_stock_tickers(
    url="https://ru.tradingview.com/symbols/NASDAQ-NDX/components/",
    template_parsing='NASDAQ',
)

load_historical_data(tickers=tickers, start_date=start_date, end_date=end_date, name_folder='NASDAQ')

get_number_stocks(folder_path='../NASDAQ')

## Convert Russian stock data files


In [7]:
def rename_columns_in_files(rename_dict: dict, path: pathlib.Path) -> None:
    """
    Renames the columns of every CSV file found directly inside a directory.

    Each regular file in ``path`` is read with pandas, its columns are renamed
    according to ``rename_dict`` and the file is overwritten in place (without
    the index column). A file that raises during processing is reported on
    stdout and the loop moves on to the next one.

    Parameters:
    ----------
    - rename_dict (dict): Mapping of current column names to new column names.
    - path (Path): Directory whose files are processed; sub-directories are skipped.
    """
    for csv_file in path.iterdir():
        if not csv_file.is_file():
            continue
        try:
            renamed = pd.read_csv(csv_file).rename(columns=rename_dict)
            renamed.to_csv(csv_file, index=False)
        except Exception as e:
            # Best effort: one unreadable file must not stop the whole batch.
            print(f"Error processing file {csv_file.name}: {e}")


In [56]:
def clean_numeric_string(num_str: str) -> float:
    """
    Cleans a numeric string and converts it to a float.

    Whitespace (including non-breaking spaces) is removed first. When the value
    contains a comma, the last comma is taken as the decimal separator and every
    dot or comma before it as a thousands separator, so '1.234,56' -> 1234.56 and
    '0,123' -> 0.123. When there is no comma, a dot that sits between a digit and
    a group of three digits is treated as a thousands separator ('1.234' -> 1234.0);
    any other dot is kept as the decimal point.

    Parameters:
    ----------
    num_str : str
        The numeric string to clean and convert. Non-string values are
        converted with ``str`` first.

    Returns:
    -------
    float
        The cleaned float value.

    Raises:
    ------
    ValueError
        If the cleaned text is not a valid float literal.
    """
    # str.split() with no argument splits on any Unicode whitespace, '\xa0' included.
    text = ''.join(str(num_str).split())

    if ',' in text:
        # Resolve the thousands separators BEFORE introducing the decimal dot;
        # otherwise a three-digit fraction ('0,123') is mistaken for a thousands group.
        integer_part, _, fraction = text.rpartition(',')
        integer_part = integer_part.replace('.', '').replace(',', '')
        text = f'{integer_part}.{fraction}'
    else:
        text = re.sub(r'(?<=\d)\.(?=\d{3})', '', text)

    return float(text)

def rename_and_reformat_files(path: pathlib.Path) -> None:
    """
    Normalizes the 'Close' and 'Date' columns of every file in a directory.

    Each regular file in ``path`` is read as CSV. The 'Close' values are passed
    through ``clean_numeric_string`` and the 'Date' values are parsed as
    day.month.year and rewritten as year-month-day. The file is then overwritten
    in place without the index column. A file that raises during processing is
    reported on stdout and skipped.

    Parameters:
    ----------
    path : pathlib.Path
        The path to the directory containing the stock data files.
    """
    for file_path in path.iterdir():
        # Only regular files are processed; sub-directories are ignored.
        if not file_path.is_file():
            continue
        try:
            data = pd.read_csv(file_path)
            data['Close'] = data['Close'].apply(clean_numeric_string)
            parsed_dates = pd.to_datetime(data['Date'], format="%d.%m.%Y")
            data['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')
            # Written back to the same path the data was read from.
            data.to_csv(file_path, index=False)
        except Exception as e:
            print(f"Error processing file {file_path.name}: {e}")

In [None]:
# Russian column headers of the source files mapped to the English names
# used in the rest of the data set.
# NOTE(review): 'Изм. %' (presumably a percent-change column) is stored under
# the name 'Adj Close' -- confirm that downstream code expects this.
rename_dict = {
    'Дата': 'Date',
    'Цена': 'Close',
    'Откр.': 'Open',
    'Макс.': 'High',
    'Мин.': 'Low',
    'Объём': 'Volume',
    'Изм. %': 'Adj Close',
}

# Step 1: rename the columns; step 2: normalize the 'Close' numbers and 'Date' strings.
rename_columns_in_files(rename_dict=rename_dict, path=pathlib.Path('../data/DataStocks/Russia'))
rename_and_reformat_files(path=pathlib.Path('../data/DataStocks/Russia'))