# MNE Groups Data Extraction Challenge - MUR team

Phase 1: Data extraction via the Financial Modeling Prep (FMP) API (https://site.financialmodelingprep.com/)

#### Load libraries

In [None]:
import pandas as pd
import requests
import os
import yaml
from datetime import datetime

#### Configuration

In [None]:
def load_config(config_file='config.yaml'):
    """
    Reads a YAML configuration file and returns its parsed contents.

    Args:
        config_file (str): Path of the YAML file to read. Defaults to
                           'config.yaml', resolved against the current
                           working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's contents
        (a dict when the file holds a top-level YAML mapping).
    """
    with open(config_file, 'r') as stream:
        settings = yaml.safe_load(stream)
    return settings

In [None]:
# Read the run settings once; the constants below are taken from them.
config = load_config()

# Where the input file lives and where the resume/progress file is kept.
DATA_PATH = config['data_path']
INPUT_FILENAME = config['input_filename']
PROGRESS_FILE = os.path.join(DATA_PATH, "processed_ids.txt")

# API endpoint, credentials, and the request budget for a single run.
BASE_URL = config['base_url']
API_KEY = config['api_key']
MAX_REQUESTS = config['max_request']

#### Functions

In [None]:
def load_processed_ids(filepath):
    """
    Loads the set of already processed IDs from a progress file.

    Args:
        filepath (str): The path to the file containing previously processed
                        IDs, one ID per line.

    Returns:
        set: A set of strings, one per non-blank line of the file, with
             surrounding whitespace removed. Empty if the file does not exist.
    """
    try:
        with open(filepath, "r") as f:
            stripped = (line.strip() for line in f)
            # Skip blank lines so a stray empty line never becomes an "" ID.
            return {entry for entry in stripped if entry}
    except FileNotFoundError:
        # No progress file yet: nothing has been processed.
        return set()

In [None]:
def save_processed_ids(new_ids, filepath):
    """
    Appends processed IDs to a progress file, one ID per line.

    Args:
        new_ids (iterable): IDs to record; each one is formatted with
                            ``str.format`` semantics, so non-string IDs are
                            accepted.
        filepath (str): The path to the progress file. It is opened in append
                        mode, so existing content is kept and the file is
                        created if it does not exist.
    """
    with open(filepath, "a") as f:
        f.writelines(f"{id_}\n" for id_ in new_ids)

In [None]:
def get_symbol_by_name(company_name):
    """
    Searches the API by company name and returns the first match.

    Every request that receives a response increments the module-level
    ``request_count``.

    Args:
        company_name (str): The company name to search for.

    Returns:
        The first element of the JSON list returned by the ``search-name``
        endpoint (callers read its "symbol" key), or None when the search
        returns nothing, the response is not a list, or the request fails.
    """
    global request_count
    try:
        # Pass the query through `params` so names containing '&', '#', '+'
        # or spaces are URL-encoded instead of corrupting the query string.
        response = requests.get(
            f"{BASE_URL}/search-name",
            params={"query": company_name, "limit": 1, "apikey": API_KEY},
            timeout=30,  # never hang the whole run on one stalled connection
        )
        request_count += 1
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"[ERROR] Symbol search failed for {company_name}: {e}")
        return None

    if isinstance(data, list):
        return data[0] if data else None
    if data:
        # A non-list payload is not a search result (e.g. an error object).
        print(f"[ERROR] Symbol search failed for {company_name}: {data}")
    return None

In [None]:
def _fetch_first_record(url, params, label):
    """
    Fetches a JSON list from ``url`` and returns its first record.

    Args:
        url (str): Endpoint URL without query string.
        params (dict): Query parameters, URL-encoded by ``requests``.
        label (str): Short description of the endpoint used in error messages.

    Returns:
        dict | None: The first record, ``{}`` when the API returned an empty
        payload, or None when the request failed or the payload is not a
        list of records.
    """
    global request_count
    try:
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as exc:
        # Only the exception type is printed so the API key carried in the
        # request URL is not echoed to the output.
        print(f"[ERROR] {label} request failed for {url}: {type(exc).__name__}")
        return None

    # A response came back, so the call is counted even if its body turns
    # out to be unusable.
    request_count += 1

    try:
        payload = response.json()
    except ValueError:
        print(f"[ERROR] {label} response for {url} is not valid JSON")
        return None

    if not payload:
        return {}
    if isinstance(payload, list) and isinstance(payload[0], dict):
        return payload[0]
    print(f"[ERROR] Unexpected {label} payload for {url}")
    return None


def extract_financial_data(symbol):
    """
    Collects profile, income-statement and balance-sheet data for a symbol.

    Three endpoints are queried independently; a failure of one leaves its
    fields at None and does not prevent the others from being fetched.

    Args:
        symbol (str): The ticker symbol to look up.

    Returns:
        dict: Keys "symbol", "country", "website", "employees", "industry",
        "turnover", "turnover_currency", "turnover_year", "assets",
        "assets_currency", "assets_year", plus "profile_url", "income_url"
        and "balance_url" (the endpoint URLs without the API key).
    """
    data = {
        "symbol": symbol,
        "country": None,
        "website": None,
        "employees": None,
        "industry": None,
        "turnover": None,
        "turnover_currency": None,
        "turnover_year": None,
        "assets": None,
        "assets_currency": None,
        "assets_year": None,
        "profile_url": f"{BASE_URL}/profile/{symbol}",
        "income_url": f"{BASE_URL}/income-statement/{symbol}",
        "balance_url": f"{BASE_URL}/balance-sheet-statement/{symbol}"
    }

    # Latest full-year statement only.
    statement_params = {"period": "FY", "limit": 1, "apikey": API_KEY}

    profile = _fetch_first_record(
        data["profile_url"], {"apikey": API_KEY}, "Profile"
    )
    if profile is not None:
        data.update({
            "country": profile.get("country"),
            "website": profile.get("website"),
            "employees": profile.get("fullTimeEmployees") or profile.get("employees"),
            "industry": profile.get("industry")
        })

    income = _fetch_first_record(
        data["income_url"], statement_params, "Income statement"
    )
    if income is not None:
        data.update({
            "turnover": income.get("revenue"),
            "turnover_currency": income.get("reportedCurrency"),
            # Fall back to the year prefix of the statement date; tolerate a
            # null or non-string date instead of dropping the whole record.
            "turnover_year": income.get("fiscalYear") or str(income.get("date") or "")[:4],
        })

    balance = _fetch_first_record(
        data["balance_url"], statement_params, "Balance sheet"
    )
    if balance is not None:
        data.update({
            "assets": balance.get("totalAssets"),
            "assets_currency": balance.get("reportedCurrency"),
            "assets_year": balance.get("fiscalYear") or str(balance.get("date") or "")[:4]
        })

    return data

In [None]:
def process_companies(df_ids, max_requests=247):
    """
    Looks up each company in ``df_ids`` and builds one output row per variable.

    The global ``request_count`` is compared with ``max_requests`` only before
    a company is started, so the requests made for the last company may push
    the count past ``max_requests``.

    Args:
        df_ids (pd.DataFrame): Companies to process, with 'ID' and 'NAME'
                               columns.
        max_requests (int): Request budget; no new company is started once
                            ``request_count`` has reached it.

    Returns:
        tuple: A tuple containing:
            - list: Row dictionaries with keys ID, NAME, VARIABLE, SRC, VALUE,
                    CURRENCY and REFYEAR; six rows per company whose symbol
                    was found.
            - list: IDs (as strings) of every company attempted in this run,
                    including those for which no symbol was found.
    """
    # One entry per output row, in output order:
    # (VARIABLE, SRC key, VALUE key, CURRENCY key, REFYEAR key).
    # A None currency key means "N/A"; a None year key means the shared year.
    row_specs = (
        ("COUNTRY", "profile_url", "country", None, None),
        ("EMPLOYEES", "profile_url", "employees", None, None),
        ("TURNOVER", "income_url", "turnover", "turnover_currency", "turnover_year"),
        ("ASSETS", "balance_url", "assets", "assets_currency", "assets_year"),
        ("WEBSITE", "profile_url", "website", None, None),
        ("ACTIVITY", "profile_url", "industry", None, None),
    )

    results = []
    processed_ids = []

    for _, row in df_ids.iterrows():
        if request_count >= max_requests:
            print("Max request count reached.")
            break

        company_id = str(row["ID"])
        company_name = row["NAME"]
        print(f"Processing: {company_name} (ID: {company_id})")

        symbol_data = get_symbol_by_name(company_name)
        # Recorded before the outcome is known, so a company without a
        # symbol is still marked as attempted.
        processed_ids.append(company_id)
        if not symbol_data:
            print(f"Symbol not found for {company_name}")
            continue

        fin_data = extract_financial_data(symbol_data["symbol"])

        # Profile-derived variables carry no year of their own; they reuse
        # the statement year (turnover first, then assets).
        shared_year = fin_data["turnover_year"] or fin_data["assets_year"]

        for variable, src_key, value_key, currency_key, year_key in row_specs:
            results.append({
                "ID": company_id,
                "NAME": company_name,
                "VARIABLE": variable,
                "SRC": fin_data[src_key],
                "VALUE": fin_data[value_key],
                "CURRENCY": "N/A" if currency_key is None else fin_data[currency_key],
                "REFYEAR": shared_year if year_key is None else fin_data[year_key],
            })

    return results, processed_ids

#### Main

In [None]:
# Number of API calls made so far in this run; incremented by the
# API-calling functions and checked by process_companies.
request_count = 0

# Load the input list and keep one (ID, NAME) pair per company ID.
input_path = os.path.join(DATA_PATH, INPUT_FILENAME)
df = pd.read_csv(input_path, sep=";")
unique_ids = df[["ID", "NAME"]].drop_duplicates(subset=['ID'])

# Drop companies recorded in the progress file; IDs are compared as strings
# because that is how the progress file stores them.
already_done = load_processed_ids(PROGRESS_FILE)
is_done = unique_ids["ID"].astype(str).isin(already_done)
pending = unique_ids[~is_done].reset_index(drop=True)

processed_rows, ids_this_run = process_companies(pending, max_requests=MAX_REQUESTS)

# Write a timestamped CSV only when this run produced rows.
if processed_rows:
    run_id = datetime.now().strftime("%Y%m%d_%H%M")
    output_file = os.path.join(DATA_PATH, f"extraction_{run_id}.csv")
    output_df = pd.DataFrame(processed_rows)
    output_df.to_csv(output_file, sep=";", index=False)
    print(f"Saved {len(processed_rows)} rows to {output_file}")

# The progress file is updated even when no rows were produced.
save_processed_ids(ids_this_run, PROGRESS_FILE)
print(f"Updated processed IDs. Requests used: {request_count}")