In [2]:
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from threading import Thread


def format_date(date):
    """Render *date* as a zero-padded ``MM/DD/YYYY`` string.

    This is the layout placed into the ``FromDate`` / ``ToDate`` query
    parameters when building history URLs.
    """
    return f"{date:%m/%d/%Y}"


def init_directory():
    """Ensure the ``./data`` output directory exists.

    Safe to call repeatedly: an already-existing directory is left untouched.

    Raises:
        FileExistsError: if ``./data`` exists but is not a directory.
    """
    # exist_ok replaces the check-then-create sequence, which could fail if
    # the directory appeared between the existence test and makedirs().
    os.makedirs('./data', exist_ok=True)


# Endpoint prefix for a symbol's price-history page; the issuer code and the
# FromDate/ToDate query string are appended to it when scraping.
BASE_URL = 'https://www.mse.mk/en/stats/symbolhistory/'
# Create the ./data output directory up front, before any CSV is written.
init_directory()


def fetch_companies():
    """Return the issuer codes offered in the history page's symbol drop-down.

    The page of one known symbol (KMB) is downloaded and every ``<option>``
    under a ``.form-control`` element is read. Codes containing a digit are
    skipped.

    Returns:
        list[str]: The option texts without digits, in page order. Empty when
        the page does not come back with HTTP 200.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    sample_url = 'https://www.mse.mk/en/stats/symbolhistory/KMB'
    # Same (connect, read) timeout as scrape_data_from_url; without one a
    # stalled server would block this call indefinitely.
    response = requests.get(sample_url, timeout=(25, 60))

    if response.status_code != 200:
        # Do not parse an error page as if it were the symbol list.
        print(f"Failed to fetch company list (HTTP {response.status_code})")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    options = soup.select('.form-control option')

    return [
        option.text
        for option in options
        if not any(char.isdigit() for char in option.text)
    ]


# Issuer codes scraped once at load time; iterated by
# fetch_data_for_all_companies_threaded to start one worker per company.
companies = fetch_companies()


def parse_cells(row):
    """Convert one history-table row into a dict of column name -> text.

    Price and turnover cells are re-rendered with ``.`` as the thousands
    separator and ``,`` as the decimal mark, with two decimals
    (e.g. ``"25,250.00"`` -> ``"25.250,00"``). The ``%chg.`` cell only has its
    ``.`` swapped for ``,``. ``Date`` and ``Volume`` are passed through as-is.

    Args:
        row: A parsed ``<tr>`` element exposing ``find_all('td')``; each cell
            must expose ``.text``. At least nine cells are expected.

    Returns:
        dict[str, str]: The nine named columns of the row.

    Raises:
        IndexError: if the row has fewer than nine ``<td>`` cells.
    """
    cells = row.find_all('td')

    # Swaps ',' and '.' in a single pass: "1,234.50" -> "1.234,50".
    swap_separators = str.maketrans(',.', '.,')

    def format_price(value):
        # NOTE(review): assumes the /en/ pages serve English-formatted numbers
        # (',' thousands, '.' decimal), consistent with the '%chg.' handling
        # below. The thousands separator is stripped before parsing; turning
        # it into '.' made float() reject every value >= 1,000 and left those
        # cells unconverted.
        try:
            number = float(value.replace(',', ''))
        except ValueError:
            # Empty or non-numeric cell: keep the original text.
            return value
        return f"{number:,.2f}".translate(swap_separators)

    item = {
        'Date': cells[0].text,
        'Last trade price': format_price(cells[1].text),
        'Max': format_price(cells[2].text),
        'Min': format_price(cells[3].text),
        'Avg Price': format_price(cells[4].text),
        '%chg.': cells[5].text.replace('.', ','),
        'Volume': cells[6].text,
        'TurnoverBEST_MKD': format_price(cells[7].text),
        'TotalTurnoverMKD': format_price(cells[8].text),
    }

    return item


def scrape_data_from_url(company, date_from, date_to):
    """Download and parse one company's history table for a date range.

    Args:
        company: Issuer code appended to ``BASE_URL``.
        date_from: Range start, already formatted for the ``FromDate``
            query parameter.
        date_to: Range end, already formatted for the ``ToDate`` parameter.

    Returns:
        pandas.DataFrame: One row per ``<tr>`` of the first ``<tbody>`` on the
        page, with the columns produced by ``parse_cells``. Empty when the
        request fails, the status is not 200, or the page has no ``<tbody>``.
    """
    url = f"{BASE_URL}{company}?FromDate={date_from}&ToDate={date_to}"

    # Treat transport errors (timeouts, dropped connections) like a non-200
    # response, so one bad request does not abort the remaining date ranges.
    try:
        response = requests.get(url, timeout=(25, 60))
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {company} from {date_from} to {date_to}: {exc}")
        return pd.DataFrame()

    if response.status_code != 200:
        print(f"Failed to fetch data for {company} from {date_from} to {date_to}")
        return pd.DataFrame()

    soup = BeautifulSoup(response.text, 'html.parser')
    table_body = soup.find('tbody')
    if table_body is None:
        # No table on the page for this range.
        return pd.DataFrame()

    data = [parse_cells(row) for row in table_body.find_all('tr')]
    return pd.DataFrame(data)


def fetch_data_for_company(company, years_back=10):
    """Scrape up to ``years_back`` years of history for one company into a CSV.

    The period ending today is split into consecutive 365-day windows, each
    fetched with its own request. All non-empty results are concatenated and
    written to ``./data/<company>.csv``.

    Args:
        company: Issuer code to fetch.
        years_back: Number of 365-day windows to request, newest first.

    Returns:
        None. The outcome is reported with ``print``.
    """
    today = datetime.today()

    company_data = []
    for year in range(years_back):
        date_to = format_date(today - timedelta(days=365 * year))
        date_from = format_date(today - timedelta(days=365 * (year + 1)))

        yearly_data = scrape_data_from_url(company, date_from, date_to)
        if not yearly_data.empty:
            company_data.append(yearly_data)

    if not company_data:
        print(f"No data collected for {company}")
        return

    final_df = pd.concat(company_data, ignore_index=True)
    # Each window's start date is the previous window's end date, so a
    # trading day on a boundary can be returned by both requests (presumably
    # the site treats both ends as inclusive). Drop the repeated rows.
    final_df = final_df.drop_duplicates()
    final_df.to_csv(f'./data/{company}.csv', index=False)
    print(f"Data for {company} saved to ./data/{company}.csv")


def fetch_data_for_all_companies_threaded(years_back=10):
    """Fetch every company in ``companies`` concurrently, one thread each.

    A thread is started per issuer code, all threads are awaited, and the
    total wall-clock duration is printed in minutes.

    Args:
        years_back: Forwarded to ``fetch_data_for_company`` for each company.
    """
    started_at = time.time()

    def worker(company):
        print(f"Fetching data for {company}")
        fetch_data_for_company(company, years_back)

    workers = [Thread(target=worker, args=(ticker,)) for ticker in companies]

    for job in workers:
        job.start()

    # Block until every company has finished before reporting the duration.
    for job in workers:
        job.join()

    elapsed_time = time.time() - started_at
    print(f"Total time taken to fetch data: {elapsed_time / 60:.2f} minutes")


# Run the full scrape for every company using the default 10-year window.
fetch_data_for_all_companies_threaded()


Fetching data for ADIN
Fetching data for ALK
Fetching data for ALKB
Fetching data for AMBR
Fetching data for AMEH
Fetching data for APTK
Fetching data for ATPP
Fetching data for AUMK
Fetching data for BANA
Fetching data for BGOR
Fetching data for BIKF
Fetching data for BIM
Fetching data for BLTU
Fetching data for CBNG
Fetching data for CDHV
Fetching data for CEVI
Fetching data for CKB
Fetching data for CKBKO
Fetching data for DEBA
Fetching data for DIMI
Fetching data for EDST
Fetching data for ELMA
Fetching data for ELNC
Fetching data for ENER
Fetching data for ENSA
Fetching data for EUHA
Fetching data for EUMK
Fetching data for EVRO
Fetching data for FAKM
Fetching data for FERS
Fetching data for FKTL
Fetching data for FROT
Fetching data for FUBT
Fetching data for GALE
Fetching data for GDKM
Fetching data for GECK
Fetching data for GECT
Fetching data for GIMS
Fetching data for GRDN
Fetching data for GRNT
Fetching data for GRSN
Fetching data for GRZDFetching data for GTC

Fetching data 

Data for PPIV saved to ./data/PPIV.csv
Data for SOLN saved to ./data/SOLN.csv
Data for MPT saved to ./data/MPT.csv
Data for MPOL saved to ./data/MPOL.csv
Data for VROS saved to ./data/VROS.csv
Data for RADE saved to ./data/RADE.csvData for RZIT saved to ./data/RZIT.csv

Data for RZUG saved to ./data/RZUG.csv
Data for NOSK saved to ./data/NOSK.csv
Data for SPAZ saved to ./data/SPAZ.csv
Data for OKTA saved to ./data/OKTA.csv
Data for STBP saved to ./data/STBP.csv
Data for ZUAS saved to ./data/ZUAS.csvData for TRDB saved to ./data/TRDB.csvData for MB saved to ./data/MB.csvData for POPK saved to ./data/POPK.csv


Data for SPAZP saved to ./data/SPAZP.csv

Data for PROD saved to ./data/PROD.csv
Data for MTUR saved to ./data/MTUR.csvData for MZHE saved to ./data/MZHE.csvData for MODA saved to ./data/MODA.csvData for SKP saved to ./data/SKP.csvData for SDOM saved to ./data/SDOM.csv
Data for RZIZ saved to ./data/RZIZ.csv




Data for ZIMS saved to ./data/ZIMS.csv
Data for NEME saved to ./data/N