## Import

In [30]:
import yfinance as yf
import pandas as pd 
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementNotInteractableException


## Settings

##### Choose the index you want to scrape. Available are "dax", "mdax" and "sdax". 

In [20]:
index = "sdax" 

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  

# Set up the Chrome driver using WebDriver Manager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
wait = WebDriverWait(driver, 20)

## Scraper

In [21]:
if index == "ddax":
    url = "XETR-DAX"
elif index == "mdax":
    url = "XETR-MDAX"
elif index == "sdax":
    url = "XETR-SDXP"

# Navigate to the TradingView MDAX page
url = "https://www.tradingview.com/symbols/" + url + "/components/"
driver.get(url)

try:
    # Wait for the table to load using a more reliable selector
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".table-Ngq2xrcG")))

    # Extract table rows
    rows = driver.find_elements(By.CSS_SELECTOR, ".table-Ngq2xrcG tbody tr")

    # Extract data from rows
    data = []
    for row in rows:  # No need to skip header row since we are selecting tbody directly
        ticker_cell = row.find_element(By.CSS_SELECTOR, "td span.tickerCell-GrtoTeat")
        symbol = ticker_cell.find_element(By.CSS_SELECTOR, "a.tickerNameBox-GrtoTeat").text.strip()  # Extract ticker symbol
        name = ticker_cell.find_element(By.CSS_SELECTOR, "sup.tickerDescription-GrtoTeat").text.strip()  # Extract company name
        
        data.append({"Symbol": symbol, "Name": name})

except (TimeoutException, NoSuchElementException, ElementNotInteractableException) as e:
    print(f"An error occurred: {type(e).__name__}. Message: {str(e)}")
    print(driver.page_source)  # Print page source for debugging

finally:
    # Close the browser
    driver.quit()

# Create a DataFrame from the extracted data if data is not empty
if data:
    df = pd.DataFrame(data)
    
    # Ensure the 'data' directory exists, create if it doesn't
    os.makedirs('data', exist_ok=True)
    
    # Save to CSV file in the 'data' subdirectory
    df.to_csv("data/" + index + ".csv", index=False)
    print("Data saved to data/" + index + ".csv")
else:
    print("No data extracted.")

Data saved to data/sdax.csv


## Load

In [24]:
# Load the CSV file into a DataFrame based on the index variable
df = pd.read_csv(f'data/{index}.csv')

# Print the DataFrame to verify it has been loaded correctly
print(df.head())

  Symbol                           Name
0    DWS                   DWS GROUP SE
1    SPG  SPRINGER NATURE AG & CO. KGAA
2   SHA0                  SCHAEFFLER AG
3   SIX2                        SIXT SE
4   R3NK                  RENK GROUP AG


## Enrich Data

In [31]:
import yfinance as yf
import pandas as pd
import os
import time
import random

# Update the user agent headers in yfinance
yf.utils.user_agent_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'
}

# Load the CSV file into a DataFrame based on the index variable
index = "mdax"  # Example index value
df = pd.read_csv(f'data/{index}.csv')

# Determine the correct column names
symbol_column = 'Symbol'
company_column = 'Name'

# Define the function to get company data from yfinance
def get_company_data(symbol):
    ticker = yf.Ticker(symbol)
    data = ticker.info
    financials = ticker.financials

    latest_financials = financials.iloc[:, 0] if not financials.empty else pd.Series()

    price = data.get('currentPrice', None)
    market_cap = data.get('marketCap', None)
    revenue = latest_financials.get('Total Revenue', None)
    operating_income = latest_financials.get('Operating Income', None)
    net_income = latest_financials.get('Net Income', None)
    book_value = data.get('bookValue', None)

    try:
        cash_flow = latest_financials.get('Operating Cash Flow', None)
        if cash_flow is None:
            cash_flow = net_income + latest_financials.get('Depreciation', 0)
    except:
        cash_flow = None

    shares_outstanding = data.get('sharesOutstanding', None)
    kgv = price / (net_income / shares_outstanding) if all(v is not None and v != 0 for v in [price, net_income, shares_outstanding]) else None
    kuv = market_cap / revenue if all(v is not None and v != 0 for v in [market_cap, revenue]) else None
    kbv = price / book_value if all(v is not None and v != 0 for v in [price, book_value]) else None
    kcv = market_cap / cash_flow if all(v is not None and v != 0 for v in [market_cap, cash_flow]) else None

    return {
        'Symbol': symbol,
        'Price': price,
        'Market Cap': market_cap,
        'Revenue': revenue,
        'Operating Income': operating_income,
        'Net Income': net_income,
        'KGV (P/E)': kgv,
        'KUV (P/S)': kuv,
        'KBV (P/B)': kbv,
        'KCV (P/CF)': kcv
    }

# Create a list of company data
company_data_list = []

# Iterate through each row in the CSV
for _, row in df.iterrows():
    symbol = row[symbol_column].strip()  # Remove any leading/trailing whitespace
    company_name = row[company_column].strip()  # Remove any leading/trailing whitespace
    print(f"Processing: {company_name} ({symbol})")  # Add this line for debugging
    try:
        company_data = get_company_data(symbol)
        company_data['Company'] = company_name
        company_data_list.append(company_data)
    except Exception as e:
        print(f"Failed to process {symbol}: {e}")
    
    # Add a delay between requests to avoid hitting the rate limit
    time.sleep(random.uniform(1, 3))

# Create a dataframe with the company data
result_df = pd.DataFrame(company_data_list)

# Reorder columns to put Symbol and Company first
columns_order = ['Symbol', 'Company'] + [col for col in result_df.columns if col not in ['Symbol', 'Company']]
result_df = result_df[columns_order]

# Print the result
print(result_df.head(5))

Processing: TALANX AG (TLX)


429 Client Error: Too Many Requests for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/TLX?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=TLX&crumb=Edge%3A+Too+Many+Requests


Failed to process TLX: Expecting value: line 1 column 1 (char 0)
Processing: TRATON SE (8TRA)


429 Client Error: Too Many Requests for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/8TRA?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=8TRA&crumb=Edge%3A+Too+Many+Requests


Failed to process 8TRA: Expecting value: line 1 column 1 (char 0)
Processing: KNORR BREMSE AG (KBX)


429 Client Error: Too Many Requests for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/KBX?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=KBX&crumb=Edge%3A+Too+Many+Requests


Failed to process KBX: Expecting value: line 1 column 1 (char 0)


KeyboardInterrupt: 

In [None]:
# Create the 'data' subdirectory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Save the DataFrame to a CSV file in the 'data' subdirectory
csv_filename = 'data/mdax_results.csv'
df.to_csv(csv_filename, index=False)
print(f"Data has been saved to {csv_filename}")



Data has been saved to data/mdax.csv
