In [2]:
import glob
import os
import json
import pandas as pd
from tqdm import tqdm
import time
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from scripts.report_agent import ReportAgent
from datetime import datetime
from typing import List, Literal

load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [3]:
# Helper that creates the parent directory of an output file if it is missing
def create_output_directory(output_path: str) -> None:
    """
    Ensure that the directory which will contain ``output_path`` exists.

    Args:
    - output_path (str): Path of the file that is about to be written. Only its
      parent directory is created; the file itself is not touched.

    A bare file name such as "result.json" has no directory component, so there
    is nothing to create and the function returns without doing anything.
    """
    parent_dir = os.path.dirname(output_path)
    # os.makedirs("") raises FileNotFoundError, so skip paths that live in the current directory
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

# Fungsi untuk memproses satu putaran data (first turn atau second turn)
def process_turn(query_prompt: str, schema_class: BaseModel, input_dir: str, output_dir: str, input_file: str = None, glob_path: str = "*/*.json"):
    """
    Process a single round of data extraction (first or second turn) and save the results to the specified output directory.
    
    Args:
    - query_prompt (str): The query prompt for the report extraction.
    - schema_class (BaseModel): The schema class for validating the extracted data.
    - input_dir (str): The directory containing input JSON files.
    - output_dir (str): The directory where the results will be saved.
    - input_file (str, optional): If provided, the function will filter data based on this file (e.g., the first turn results).
    - glob_path (str, optional): The glob pattern to match input files. Default is "*/*.json".
    """
    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Define the list of files to process based on the glob pattern
    files = glob.glob(os.path.join(input_dir, glob_path))
    files = files[21:]
    
    # Check if files exist, and raise an error if not
    if not files:
        raise FileNotFoundError(f"No files matched the glob pattern: '{glob_path}' in directory '{input_dir}'")
    
    # Process each file
    for file in tqdm(files, desc="Processing Companies"):
        try:
            company = os.path.splitext(os.path.basename(file))[0]
            agent = ReportAgent(os.environ["LLM"], schema_class, query_prompt)
            
            # Handle data processing for first or second turn
            if input_file:
                # For the second turn, filter data based on the input Excel file
                fdata = pd.read_excel(input_file)
                fdata = fdata[fdata['company'] == company]
                fdata = fdata.drop("sources", axis=1)
                
                # Loop through each row of filtered data
                for idx, row in fdata.iterrows():
                    prompt_data = row.to_dict()
                    result = agent.get_data(prompt_data)
                    result['data'].update(prompt_data)
                    
                    # Generate output path and save result
                    relative_path = os.path.relpath(file, input_dir)
                    output_path = os.path.join(output_dir, relative_path).replace(".json", f"_{idx+1}.json")
                    create_output_directory(output_path)
                    with open(output_path, 'w') as f:
                        json.dump(result, f, indent=4)
            else:
                # For the first turn, directly use the company data
                result = agent.get_data({'company': company})
                result['data']['company'] = company
                
                # Generate output path and save result
                relative_path = os.path.relpath(file, input_dir)
                output_path = os.path.join(output_dir, relative_path)
                create_output_directory(output_path)
                with open(output_path, 'w') as f:
                    json.dump(result, f, indent=4)
                    
        except Exception as e:
            print(f"\nError processing {company}: {str(e)}")
        
        # Pause to avoid overloading the API
        time.sleep(5)
    
        # After processing all files, save results to an Excel file
        save_results_to_excel(output_dir, glob_path)

# Collect the per-file JSON results into a single Excel workbook
def save_results_to_excel(output_dir: str, glob_path: str):
    """
    Gather every JSON result under ``output_dir`` that matches ``glob_path`` and
    write them, one row per file, to ``<output_dir>/result.xlsx``.

    Each row is the file's ``data`` mapping with the file's ``sources`` value
    added under the ``sources`` key.
    """
    def _load_row(json_path):
        # Read one result file and fold its sources into the data mapping
        with open(json_path, 'r') as handle:
            payload = json.load(handle)
        row = payload['data']
        row['sources'] = payload['sources']
        return row

    pattern = os.path.join(output_dir, glob_path)
    rows = [_load_row(json_path) for json_path in glob.glob(pattern)]

    pd.DataFrame(rows).to_excel(f"{output_dir}/result.xlsx", index=False)

### Start here

First Turn
- Siapkan query prompt dengan placeholder {company}
- Siapkan schema / variabel yang ingin di-crawl
- Siapkan `input_dir`, `output_dir`, dan `glob_path`

    `input_dir` dan `glob_path` saling berhubungan. Input direktori adalah tempat file json disimpan.

    Misalnya, file json terletak di path `data/JSON/AR/ID_ADRO_AR_2022.json`. 

    Maka `input_dir` bisa berupa `data`, lalu `glob_path`-nya adalah `*/*/*.json`
    ATAU jika `input_dir` berupa `data/JSON`, lalu `glob_path`-nya adalah `*/*.json`

In [4]:
from typing import List, Literal
from pydantic import BaseModel, Field

# Schema for one education record: major, university and country.
# Every field is optional and defaults to None, so a degree that cannot be
# found can still be represented as an (empty) entry.
# Used below by CEOBackground for the bachelor / master / doctoral fields.
# NOTE(review): `example=` is passed to Field as an extra keyword; how it ends up
# in the generated schema depends on the installed pydantic version — confirm.
class EducationEntry(BaseModel):
    # Field of study
    major: str | None = Field(
        default=None,
        description="The major or field of study pursued.",
        example="Business Administration"
    )
    # Institution name
    university: str | None = Field(
        default=None,
        description="The name of the educational institution (e.g., university or college).",
        example="Harvard University"
    )
    # Country of the institution
    country: str | None = Field(
        default=None,
        description="The country where the educational institution is located.",
        example="USA"
    )

# Schema for one work-experience record.
# `position` is the only required field (it has no default); company, country
# and both dates are optional and default to None.
# Dates are plain strings; the YYYY-MM-DD format is only requested through the
# field descriptions, it is not validated here.
class WorkingExperienceEntry(BaseModel):
    # Required: job title
    position: str = Field(
        description="The job title or position held during the work experience.",
        example="Chief Operating Officer"
    )
    # Optional: employer
    company: str | None = Field(
        default=None,
        description="The name of the company or organization where the work experience took place.",
        example="Google"
    )
    # Optional: country of the role
    country: str | None = Field(
        default=None,
        description="The country where the work experience took place.",
        example="USA"
    )
    # Optional: start date as a string
    start_date: str | None = Field(
        default=None,
        description="The start date of this work experience in YYYY-MM-DD format (e.g., '2020-01-01').",
        example="2020-01-01"
    )
    # Optional: end date as a string; None is described as "role is ongoing"
    end_date: str | None = Field(
        default=None,
        description="The end date of this work experience in YYYY-MM-DD format (e.g., '2022-12-31'). Use None if the role is ongoing.",
        example="2022-12-31"
    )

# First-turn schema: the profile of a company's CEO (or equivalent role).
# Required fields (no default): name, position, gender, nationality and the three
# education_* entries. Optional fields: start_tenure, end_tenure, birthdate_or_age
# (default None) and working_experience (defaults to an empty list).
# The education_* fields are required as objects, but every field inside
# EducationEntry is optional, so an entry with all-None values is accepted.
class CEOBackground(BaseModel):
    # Required: full name
    name: str = Field(
        description="The full name of the CEO (or equivalent role such as President Director, Managing Director, or Chairman).",
        example="Jane Doe"
    )
    # Required: official title
    position: str = Field(
        description="The official title or designation of the CEO.",
        example="CEO"
    )
    # Optional: tenure start as a string (format requested via the description only)
    start_tenure: str | None = Field(
        default=None,
        description="The date when the CEO began their tenure in YYYY-MM-DD format (e.g., '2018-05-01'). Use None if unavailable.",
        example="2018-05-01"
    )
    # Optional: tenure end as a string; None is described as "currently serving"
    end_tenure: str | None = Field(
        default=None,
        description="The date when the CEO ended their tenure in YYYY-MM-DD format, or None if currently serving.",
        example=None
    )
    # Required: restricted to exactly these two literal values
    gender: Literal['Male', 'Female'] = Field(
        description="The gender of the CEO. Must be either 'Male' or 'Female'.",
        example="Female"
    )
    # Optional: either a date string or an integer age
    birthdate_or_age: str | int | None = Field(
        default=None,
        description="Either the CEO's birthdate (as a YYYY-MM-DD formatted string) or their age (as an integer) at the time of reporting.",
        example="1965-07-15"
    )
    # Required: nationality
    nationality: str = Field(
        description="The nationality of the CEO.",
        example="American"
    )
    # Required: bachelor's degree entry
    education_bachelor: EducationEntry = Field(
        description="Details of the CEO's bachelor's degree education.",
        example={
            "major": "Economics",
            "university": "University of California, Berkeley",
            "country": "USA"
        }
    )
    # Required: master's degree entry
    education_master: EducationEntry = Field(
        description="Details of the CEO's master's degree education.",
        example={
            "major": "Finance",
            "university": "Massachusetts Institute of Technology",
            "country": "USA"
        }
    )
    # Required: doctoral degree entry
    education_doctoral: EducationEntry = Field(
        description="Details of the CEO's doctoral degree education.",
        example={
            "major": "Business Administration",
            "university": "Stanford University",
            "country": "USA"
        }
    )
    # Optional: list of work-experience entries; default_factory gives each instance its own list
    working_experience: List[WorkingExperienceEntry] = Field(
        default_factory=list,
        description="A list of the CEO's previous work experiences.",
        example=[
            {
                "position": "Chief Operating Officer",
                "company": "Google",
                "country": "USA",
                "start_date": "2020-01-01",
                "end_date": "2022-12-31"
            }
        ]
    )


# First-turn prompt template. It contains a single `{company}` placeholder and
# describes, in prose, the same fields that the CEOBackground schema declares
# (basic information, the three education entries, and the work-experience list).
# NOTE(review): the placeholder is presumably filled by ReportAgent from the dict
# passed to `get_data` ({'company': ...} in the first turn) — confirm in scripts/report_agent.
query_prompt_1 = """
Please search for comprehensive and up-to-date information on the CEO or President Director of the company {company}.

You may use detailed queries for the retriever tool multiple times to ensure the information is accurate.

Your results must include the following details, structured according to the JSON schema provided below:

1. **Basic Information:**
   - **Name:** The full name of the CEO (e.g., "Jane Doe").
   - **Position:** The official title or designation (e.g., "CEO").
   - **Start Tenure:** The date when the CEO began their tenure (in YYYY-MM-DD format, e.g., "2018-05-01"). Use None if unavailable.
   - **End Tenure:** The date when the CEO ended their tenure (in YYYY-MM-DD format), or None if they are still serving.
   - **Gender:** The gender of the CEO. Must be either "Male" or "Female" (e.g., "Female").
   - **Birthdate or Age:** Either the CEO's birthdate (as a YYYY-MM-DD formatted string) or their age (as an integer) at the time of reporting (e.g., "1965-07-15").
   - **Nationality:** The nationality of the CEO (e.g., "American").

2. **Educational Background:**  
   For each degree, please provide the following:
   - **Bachelor's Degree ("education_bachelor")** with:
     - **Major:** Field of study (e.g., "Economics").
     - **University:** Name of the institution (e.g., "University of California, Berkeley").
     - **Country:** Location of the institution (e.g., "USA").
   - **Master's Degree ("education_master")** with similar details (e.g., "Finance" from "Massachusetts Institute of Technology", "USA").
   - **Doctoral Degree ("education_doctoral")** with similar details (e.g., "Business Administration" from "Stanford University", "USA").

3. **Working Experience:**  
   A list of previous work experiences, where each entry should include:
   - **Position:** The job title held (e.g., "Chief Operating Officer").
   - **Company:** The name of the company or organization where the work experience took place (e.g., "Google").
   - **Country:** The country where this experience took place (e.g., "USA").
   - **Start Date:** When the work experience started (in YYYY-MM-DD format, e.g., "2020-01-01").
   - **End Date:** When the work experience ended (in YYYY-MM-DD format, e.g., "2022-12-31"). Use None if the role is still ongoing.
"""

In [5]:
# FIRST TURN

# Run the first turn: one CEOBackground extraction per report file.
# Results go to "results_1st_turn" because the second turn reads
# "results_1st_turn/result.xlsx" as its input_file and writes its own
# results to "results_2nd_turn"; sharing one directory would mix both turns.
process_turn(
    query_prompt=query_prompt_1,
    schema_class=CEOBackground,
    input_dir="data/TCFD REPORT JSON",
    output_dir="results_1st_turn",
    glob_path="*/*/*.json"
)

Processing Companies: 100%|██████████| 631/631 [6:21:02<00:00, 36.23s/it]  


Second Turn

- Siapkan query prompt dengan placeholder yang sesuai dengan variabel yang ingin dicari; variabel tersebut seharusnya sudah didapatkan dari first turn (nama placeholder harus sama dengan nama kolom pada file Excel hasil first turn, misalnya `{company}`).
- Tambahkan parameter `input_file`, yaitu path ke file Excel hasil first turn (misalnya `results_1st_turn/result.xlsx`).

In [54]:
# SECOND TURN

# QUERY PROMPT
# Template with two placeholders, {ceo_name} and {company}. In the second turn
# process_turn passes each row of `input_file` (minus the "sources" column) to
# agent.get_data, so both names must exist as columns in that Excel file.
# NOTE(review): the first-turn CEOBackground schema names this field `name`, not
# `ceo_name` — confirm the first-turn Excel really has a `ceo_name` column.
query_prompt_2 = """
Retrieve the bachelor education degree of {ceo_name} which is ceo of {company} from the company's report. 
"""

# SCHEMA
# NOTE: this rebinds the name EducationEntry that an earlier cell already defined.
# Unlike that earlier version, all three fields here are required (no default).
class EducationEntry(BaseModel):
    major: str = Field(description="The academic program or field of study pursued by the ceo.", example="Business Administration")
    university: str = Field(description="The name of the university where the ceo pursued their education.", example="Harvard University")
    country: str = Field(description="The country where the ceo completed their education.", example="United States")

# Run the second turn: input rows come from the first-turn Excel file
process_turn(
    query_prompt=query_prompt_2,
    schema_class=EducationEntry,
    input_dir="data/JSON",
    output_dir="results_2nd_turn",
    input_file="results_1st_turn/result.xlsx",
    glob_path="*/*.json"
)

Processing Companies:   0%|          | 0/1 [00:00<?, ?it/s]

{'ceo_name': 'Garibaldi Thohir', 'company': 'ID_ADRO_AR_2022'}


[1m> Entering new Invoke Agent chain...[0m
[32;1m[1;3m
Invoking: `retriever_tool` with `{'query': "Garibaldi Thohir bachelor's degree education ID_ADRO_AR_2022"}`


[0m[36;1m[1;3m[Document(id='8563ab1b-926e-48d5-a3ec-e3eca3c23acc', metadata={'company': 'ID_ADRO_AR_2022', 'page': 123.0}, page_content='Citizenship I Kewarganegaraan\n\nIndonesian I Indonesia\n\n## Age I Usia\n\n57 as at 31 December 2022\n\n57 per tanggal 31 Desember 2022\n\n## Education History I Riwayat Pendidikan\n\nBachelor of Business Administration, University of Southern California, USA MBA, Northrop University, California, USA S1 Business Administration, University of Southern California, AS S2 MBA, Northrop University, California, AS\n\n## Legal Basis of Appointment at PT Adaro Energy Indonesia Tbk\n\nDasar Hukum Pengangkatan di PT Adaro Energy Indonesia Tbk\n\nDeed No. 62 of April 18, 2008, and re-appointed based on Deed No.31 of May 20, 2021

Processing Companies: 100%|██████████| 1/1 [00:38<00:00, 38.12s/it]
