In [None]:
import glob
import os
import json
import pandas as pd
from tqdm import tqdm
import time
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from scripts.report_agent import ReportAgent

load_dotenv()

In [None]:
# Fungsi untuk membuat direktori jika belum ada
def create_output_directory(output_path: str):
    """
    Ensure that the output directory exists.
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Fungsi untuk memproses satu putaran data (first turn atau second turn)
def process_turn(query_prompt: str, schema_class: BaseModel, input_dir: str, output_dir: str, input_file: str = None, glob_path: str = "*/*.json"):
    """
    Process a single round of data extraction (first or second turn) and save the results to the specified output directory.
    
    Args:
    - query_prompt (str): The query prompt for the report extraction.
    - schema_class (BaseModel): The schema class for validating the extracted data.
    - input_dir (str): The directory containing input JSON files.
    - output_dir (str): The directory where the results will be saved.
    - input_file (str, optional): If provided, the function will filter data based on this file (e.g., the first turn results).
    - glob_path (str, optional): The glob pattern to match input files. Default is "*/*.json".
    """
    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Define the list of files to process based on the glob pattern
    files = glob.glob(os.path.join(input_dir, glob_path))
    
    # Check if files exist, and raise an error if not
    if not files:
        raise FileNotFoundError(f"No files matched the glob pattern: '{glob_path}' in directory '{input_dir}'")
    
    # Process each file
    for file in tqdm(files, desc="Processing Companies"):
        try:
            company = os.path.splitext(os.path.basename(file))[0]
            agent = ReportAgent(os.environ["LLM"], schema_class, query_prompt)
            
            # Handle data processing for first or second turn
            if input_file:
                # For the second turn, filter data based on the input Excel file
                fdata = pd.read_excel(input_file)
                fdata = fdata[fdata['company'] == company]
                fdata = fdata.drop("sources", axis=1)
                
                # Loop through each row of filtered data
                for idx, row in fdata.iterrows():
                    prompt_data = row.to_dict()
                    result = agent.get_data(prompt_data)
                    result['data'].update(prompt_data)
                    
                    # Generate output path and save result
                    relative_path = os.path.relpath(file, input_dir)
                    output_path = os.path.join(output_dir, relative_path).replace(".json", f"_{idx+1}.json")
                    create_output_directory(output_path)
                    with open(output_path, 'w') as f:
                        json.dump(result, f, indent=4)
            else:
                # For the first turn, directly use the company data
                result = agent.get_data({'company': company})
                result['data']['company'] = company
                
                # Generate output path and save result
                relative_path = os.path.relpath(file, input_dir)
                output_path = os.path.join(output_dir, relative_path)
                create_output_directory(output_path)
                with open(output_path, 'w') as f:
                    json.dump(result, f, indent=4)
                    
        except Exception as e:
            print(f"\nError processing {company}: {str(e)}")
        
        # Pause to avoid overloading the API
        time.sleep(30)
    
    # After processing all files, save results to an Excel file
    save_results_to_excel(output_dir)

# Fungsi untuk menyimpan hasil pemrosesan ke dalam file Excel
def save_results_to_excel(output_dir: str):
    """
    Save the processed results into an Excel file after extraction.
    """
    output_files = glob.glob(os.path.join(output_dir, "*", "*.json"))
    output_list = []
    
    for file in output_files:
        with open(file, 'r') as f:
            result = json.load(f)
        result['data'].update({'sources': result['sources']})
        output_list.append(result['data'])
    
    output_df = pd.DataFrame(output_list)
    output_df.to_excel(f"{output_dir}/result.xlsx", index=False)

### Start here

First Turn
- Siapkan query prompt dengan placeholder {company}
- Siapkan schema / variabel yang ingin di-crawl
- Siapkan `input_dir`, `output_dir`, dan `glob_path`

    `input_dir` dan `glob_path` saling berhubungan. Input direktori adalah tempat file json disimpan.

    Misalnya, file json terletak di path `data/JSON/AR/ID_ADRO_AR_2022.json`. 

    Maka `input_dir` bisa berupa `data`, lalu `glob_path`-nya adalah `*/*/*.json`
    ATAU jika `input_dir` berupa `data/JSON`, lalu `glob_path`-nya adalah `*/*.json`

In [53]:
# FIRST TURN

# QUERY PROMPT
query_prompt_1 = """
Retrieve the name of the ceo for {company} from the company's report.
"""

# SCHEMA
class GovernanceBody(BaseModel):
    ceo_name: str = Field(description="Nama CEO atau presiden direktur dari perusahaan", example="Maula Irfani")

# Proses first turn
process_turn(
    query_prompt=query_prompt_1,
    schema_class=GovernanceBody,
    input_dir="data/JSON",
    output_dir="results_1st_turn",
    glob_path="*/*.json"
)

Processing Companies:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new Invoke Agent chain...[0m
[32;1m[1;3m
Invoking: `retriever_tool` with `{'query': 'CEO name in ID_ADRO_AR_2022 company report'}`


[0m[36;1m[1;3m[Document(id='c3ec5584-4d43-4683-aadc-d3ae716f9e0f', metadata={'company': 'ID_ADRO_AR_2022', 'page': 502.0}, page_content="## STATEMENT OF RESPONSIBILITY\n\n## PENYATAAN TANGGUNG JAWAB MANAJEMEN\n\nThe Board of Commissioners and Board of Directors of PT Adaro Energy Indonesia Tbk hereby state that all information contained within the 2022 Annual Report of PT Adaro Energy Indonesia Tbk has been presented comprehensively, hence we assume full responsibility for the accuracy of information under the Company's Annual Report.\n\nApril 19, 2023\n\nDewan Komisaris dan Direksi PT Adaro Energy Indonesia Tbk dengan ini menyatakan bahwa semua informasi dalam Laporan Tahunan PT Adaro Energy Indonesia Tbk tahun 2022 telah disajikan secara lengkap dan oleh karenanya kami bertanggung jawab penuh atas kebeneran isi Laporan Tahunan Per

Processing Companies: 100%|██████████| 1/1 [00:44<00:00, 44.90s/it]


Second Turn

- Siapkan query prompt yang memiliki placeholder sesuai variabel yang ingin dicari, dimana variabel tersebut seharusnya sudah didapatkan dari first turn.
- Masukkan parameter tambahan yaitu `input_file` yang merupakan hasil dari path excel hasil dari first turn

In [54]:
# SECOND TURN

# QUERY PROMPT
query_prompt_2 = """
Retrieve the bachelor education degree of {ceo_name} which is ceo of {company} from the company's report. 
"""

# SCHEMA
class EducationEntry(BaseModel):
    major: str = Field(description="The academic program or field of study pursued by the ceo.", example="Business Administration")
    university: str = Field(description="The name of the university where the ceo pursued their education.", example="Harvard University")
    country: str = Field(description="The country where the ceo completed their education.", example="United States")

# Proses second turn
process_turn(
    query_prompt=query_prompt_2,
    schema_class=EducationEntry,
    input_dir="data/JSON",
    output_dir="results_2nd_turn",
    input_file="results_1st_turn/result.xlsx",
    glob_path="*/*.json"
)

Processing Companies:   0%|          | 0/1 [00:00<?, ?it/s]

{'ceo_name': 'Garibaldi Thohir', 'company': 'ID_ADRO_AR_2022'}


[1m> Entering new Invoke Agent chain...[0m
[32;1m[1;3m
Invoking: `retriever_tool` with `{'query': "Garibaldi Thohir bachelor's degree education ID_ADRO_AR_2022"}`


[0m[36;1m[1;3m[Document(id='8563ab1b-926e-48d5-a3ec-e3eca3c23acc', metadata={'company': 'ID_ADRO_AR_2022', 'page': 123.0}, page_content='Citizenship I Kewarganegaraan\n\nIndonesian I Indonesia\n\n## Age I Usia\n\n57 as at 31 December 2022\n\n57 per tanggal 31 Desember 2022\n\n## Education History I Riwayat Pendidikan\n\nBachelor of Business Administration, University of Southern California, USA MBA, Northrop University, California, USA S1 Business Administration, University of Southern California, AS S2 MBA, Northrop University, California, AS\n\n## Legal Basis of Appointment at PT Adaro Energy Indonesia Tbk\n\nDasar Hukum Pengangkatan di PT Adaro Energy Indonesia Tbk\n\nDeed No. 62 of April 18, 2008, and re-appointed based on Deed No.31 of May 20, 2021

Processing Companies: 100%|██████████| 1/1 [00:38<00:00, 38.12s/it]
