In [19]:
from ingestion.search import crawlee_endpoint
import requests

keywords = [
    "esg", "results", "investors", "ir", "sustainability", "sustainable", 
    "sustainability", "investor", "presentation", "earnings", "governance",
    "release"
    ]

res = requests.get(
        crawlee_endpoint + "v0/scrape-urls/", 
        json=dict(
            urls=["https://investor.mastercard.com/overview/"], 
            keywords=keywords
        )
    )

In [33]:
import os
from ingestion.search import search_pdf_urls
from ingestion.parse import  submit_request, get_all_tasks, aget_all_task_results, get_task_results
from ingestion.ingest import create_documents_from_results, create_index

In [63]:
# urls = search_pdf_urls(
#     urls=["https://investor.mastercard.com/overview/default.aspx"],
#     keywords=["esg", "results", "investors", "ir", "sustainability", "sustainable", "investor"]
#     )
import pandas as pd
import numpy as np

urls = [
    "https://xiaomi.gcs-web.com/static-files/56248048-5970-4288-88fd-75839de418aa"
    "https://cdn.cnbj1.fds.api.mi-img.com/staticsfile/svhc/2023%E5%B9%B4%E5%8F%AF%E6%8C%81%E7%BB%AD%E5%8F%91%E5%B1%95%E7%BD%91%E7%AB%99/Xiaomi%20Corporation%20TCFD%20Report.pdf",
    "https://xiaomi.gcs-web.com/static-files/e3d574cd-0f29-4f32-aeca-69aa82d42994",
    "https://ir.mi.com/static-files/e6caf854-0a06-42a8-8a59-30a3831db153",
    "https://cdn.cnbj1.fds.api.mi-img.com/staticsfile/svhc/%E5%B0%8F%E7%B1%B3%E9%9B%86%E5%9B%A2%20%20%E7%8E%AF%E5%A2%83%E7%BB%A9%E6%95%88%E5%A3%B0%E6%98%8E%20%E8%8B%B1%E6%96%87.pdf",
    "https://ir.mi.com/system/files-encrypted/nasdaq_kms/assets/2024/04/25/5-36-08/2023%20Annual%20Report.pdf",
    "https://ir.mi.com/system/files-encrypted/nasdaq_kms/assets/2024/08/21/6-09-16/Xiaomi%20Corp_24Q2_ER_ENG_v22_upload.pdf",
    "https://ir.mi.com/system/files-encrypted/nasdaq_kms/assets/2024/08/21/5-44-48/Announcement.pdf"
    ]

tasks = {}
for url in urls:
    task = submit_request(url).json()
    if "task_id" in task:
        tasks[task["task_id"]] = {"url": url}

In [81]:
task_df = pd.DataFrame(get_all_tasks())
task_df = task_df .T.sort_values(by="start_time")
task_df ["duration"] = task_df.finish_time - task_df.start_time
task_df["url"] = task_df.index.to_series().map(tasks).apply(
    lambda x: x.get("url") if isinstance(x, dict) else np.nan)

In [82]:
task_df.dropna(subset=["url"])

Unnamed: 0,status,start_time,finish_time,duration,url
28fdfc2f-f900-4673-8668-c0580d739026,completed,1725437568.334058,1725437911.047804,342.713746,https://xiaomi.gcs-web.com/static-files/562480...
05531875-89ca-40f7-9b81-39868e753d26,completed,1725437572.709316,1725437838.402302,265.692986,https://xiaomi.gcs-web.com/static-files/e3d574...
447246e4-5946-428f-b967-c3c091b0c244,completed,1725437574.791939,1725438016.063213,441.271274,https://ir.mi.com/static-files/e6caf854-0a06-4...
6361e4d5-815b-43f6-9ce7-80140d2aad2d,completed,1725437577.014618,1725437646.261224,69.246606,https://cdn.cnbj1.fds.api.mi-img.com/staticsfi...
ba5b0957-ede1-4fff-81f6-0fdc42f87eec,completed,1725437578.337688,1725438119.230145,540.892456,https://ir.mi.com/system/files-encrypted/nasda...
92c9296a-9770-4f1e-b719-b1e357b73b3c,completed,1725437582.32629,1725437881.998055,299.671765,https://ir.mi.com/system/files-encrypted/nasda...
26557107-7dc2-4286-b7de-40eaf4bdd0e5,completed,1725437583.118207,1725438041.850032,458.731824,https://ir.mi.com/system/files-encrypted/nasda...


In [83]:
results = {}
for task_id in task_df.index:
    result = await get_task_results(task_id)
    results[task_id] = result

In [113]:
import datetime as dt 

additional_metadata = dict(
    company_name="Xiaomi Corp",
    composite_figi="BBG00KVBNBT5",
    isin="KYG9830T1067",
    sedol="BG0ZMJ9",
    publish_year=2024,
    report_year=2023,
    access_datetime=dt.datetime.today().isoformat()
)

processed_results = []
for task_id, result in results.items():
    url = task_df.loc[task_id, "url"]
    if isinstance(url, str):
        data = result.json()
        for item in data:
            item["metadata"].update(additional_metadata)
            item["metadata"].update(dict(source=url))
            match url:
                case "https://xiaomi.gcs-web.com/static-files/56248048-5970-4288-88fd-75839de418aa":
                    item["metadata"].update(dict(report_type="esg_report"))
                case "https://ir.mi.com/system/files-encrypted/nasdaq_kms/assets/2024/04/25/5-36-08/2023%20Annual%20Report.pdf":
                    item["metadata"].update(dict(report_type="annual_report"))
        processed_results.append(data)

In [114]:
documents = []
for result in processed_results:
    doc = create_documents_from_results(result)
    documents.append(doc)

In [115]:
import itertools
xiaomi_index = create_index(collection_name="dev_esg_collection", documents=list(itertools.chain(*documents)))