### First: Split the data into batches for your given usage tier

In [None]:
import pandas as pd

# Load the raw data. The create_jsonl function defined below reads two columns
# from this DataFrame: 'text' (the text to classify) and 'id_post' (used as the
# custom_id of each request).
df = pd.read_csv('./your_df.csv')

In [None]:
import json
import math
import os

def create_jsonl(df, output_base_name, output_dir, batch_size=1000):
    """
    Splits the DataFrame into multiple .jsonl files of OpenAI Batch API requests,
    each containing up to batch_size rows, and stores the files in the specified directory.

    Every row becomes one JSON line describing a POST to /v1/chat/completions.
    An empty DataFrame produces no files.

    Args:
    - df: DataFrame containing the input data. It must have a 'text' column (the text to
      classify) and an 'id_post' column (written as the request's custom_id).
    - output_base_name: Base name for the output files (e.g., 'output' will generate 'output_1.jsonl', 'output_2.jsonl', etc.).
    - output_dir: Directory where the files will be stored (created if it does not exist).
    - batch_size: Maximum number of rows per .jsonl file (default is 1000 rows).

    Raises:
    - ValueError: If batch_size is less than 1.
    """

    # Reject sizes that would otherwise divide by zero or silently write nothing
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    total_rows = len(df)

    # Walk the DataFrame in steps of batch_size; files are numbered from 1
    for file_number, start_row in enumerate(range(0, total_rows, batch_size), start=1):
        output_file = os.path.join(output_dir, f"{output_base_name}_{file_number}.jsonl")

        # iloc clips the slice at the end of the DataFrame, so the last batch may be shorter
        df_batch = df.iloc[start_row:start_row + batch_size]

        # Explicit encoding keeps the output identical across platforms
        with open(output_file, 'w', encoding='utf-8') as file:
            for _, row in df_batch.iterrows():
                # Extract the content of 'text' and the 'id_post'
                prompt = row['text']  # variable to classify
                custom_id = row['id_post']

                # Build one Batch API request.
                # NOTE: the whitespace inside the triple-quoted f-string below is part of
                # the prompt that is sent, so re-indenting those lines changes the request.
                json_line = {
                    "custom_id": f"{custom_id}",
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "gpt-4o-mini-2024-07-18",
                        "messages": [
                            {"role": "system", "content": "...context here..."},
                            {"role": "user", "content": f"""
                            ...Your prompt goes here. Please classify the following text:....   {prompt}
                             """
                            }
                        ],
                        "max_tokens": 200
                    }
                }

                # Write the request as a single JSON line
                file.write(json.dumps(json_line) + '\n')

In [None]:
# Write ./input_batchs/input_batch_1.jsonl, input_batch_2.jsonl, ... with up to 30,000 requests each
create_jsonl(df, batch_size=30000, output_base_name="input_batch", output_dir="./input_batchs/")

# A single batch may contain at most 50,000 requests, so keep batch_size at or below that
# You might want to use a dedicated folder for these files

# Now we need to upload every .jsonl to the OpenAI platform

In [None]:
from dotenv import load_dotenv
from openai import OpenAI

# Load the variables from the .env file into the environment
load_dotenv()

# API key for this project, read from the OPENAI_API_KEY environment variable
# (os.getenv returns None when the variable is not set)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 

# Start the OpenAI client with that key; the functions below use this module-level `client`
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
import re
import openai

# Function to upload all .jsonl files from a folder to OpenAI and save file IDs
def upload_jsonl_files(folder_path, output_file="uploaded_file_ids.csv"):
    """
    Send every .jsonl file found in a folder to OpenAI as a batch input file.

    Files are handled in ascending order of the first number that appears in their
    name; names without any number come last. A file whose name is already listed in
    the CSV is skipped. After each successful upload the CSV is rewritten so that it
    includes the new (filename, file_id) pair. A failure on one file is printed and
    the remaining files are still processed.

    Args:
    - folder_path: Path to the folder containing the .jsonl files.
    - output_file: The name of the CSV file where the file IDs are saved (default: "uploaded_file_ids.csv").
    """

    def numeric_key(name):
        # The first run of digits decides the position; no digits sorts last
        found = re.search(r'(\d+)', name)
        if found is None:
            return float('inf')
        return int(found.group(1))

    # Start from the IDs recorded by earlier runs, or from an empty table
    if not os.path.exists(output_file):
        uploaded = pd.DataFrame(columns=["filename", "file_id"])
    else:
        uploaded = pd.read_csv(output_file)

    # Keep only .jsonl entries and order them numerically
    pending = [entry for entry in os.listdir(folder_path) if entry.endswith(".jsonl")]
    pending.sort(key=numeric_key)

    for filename in pending:
        # Anything already recorded in the CSV is not uploaded again
        if filename in uploaded["filename"].values:
            print(f"{filename} has already been uploaded. Skipping.")
            continue

        file_path = os.path.join(folder_path, filename)

        try:
            # Upload the file with the "batch" purpose
            with open(file_path, 'rb') as handle:
                batch_input_file = client.files.create(file=handle, purpose="batch")

            # Show the response so a successful upload can be verified
            print(f"Uploaded {filename} - Response: {batch_input_file}")

            # Record the new ID and persist the table right away
            record = pd.DataFrame({"filename": [filename], "file_id": [batch_input_file.id]})
            uploaded = pd.concat([uploaded, record], ignore_index=True)
            uploaded.to_csv(output_file, index=False)
            print(f"File ID saved to {output_file}")

        except Exception as e:
            # Report the problem and move on to the next file
            print(f"Error uploading {filename}: {e}")

In [None]:
# Upload every .jsonl file in ./input_batchs/; the file IDs are recorded in the default uploaded_file_ids.csv
upload_jsonl_files("./input_batchs/")

# Now it is time to start the batch job for every jsonl file uploaded

In [None]:
# Function to create batch jobs for files using their file_id from a CSV file
def create_batch_jobs(csv_file="uploaded_file_ids.csv", start_file=1, end_file=None, batch_ids_file="batch_ids.csv"):
    """
    Start one batch job per uploaded file listed in a CSV of filenames and file IDs.

    Only the rows numbered start_file through end_file (1-based, inclusive) are
    submitted. After each job is created, its batch ID is appended to batch_ids_file
    together with the filename and the file is rewritten. A failure on one file is
    printed and the remaining files are still processed.

    Args:
    - csv_file: The CSV file containing 'filename' and 'file_id' (default: "uploaded_file_ids.csv").
    - start_file: The starting file number (default: 1).
    - end_file: The ending file number (default: None, meaning through the last row).
    - batch_ids_file: The CSV file where 'filename' and 'batch_id' are saved (default: "batch_ids.csv").
    """

    # Table of uploaded files (filename + file_id)
    uploaded = pd.read_csv(csv_file)

    # Without an explicit end, go through the last row
    last_file = len(uploaded) if end_file is None else end_file

    # Continue an existing batch-ID table if there is one, otherwise start empty
    if not os.path.exists(batch_ids_file):
        created = pd.DataFrame(columns=["filename", "batch_id"])
    else:
        created = pd.read_csv(batch_ids_file)

    # File numbers are 1-based, DataFrame positions are 0-based
    selected = uploaded.iloc[start_file - 1:last_file]

    for _, entry in selected.iterrows():
        filename = entry['filename']
        file_id = entry['file_id']

        # Name used only in the progress message
        batch_name = f"batch_{filename}"

        try:
            # Submit the batch job; the filename is stored as its description
            batch = client.batches.create(
                input_file_id=file_id,
                endpoint="/v1/chat/completions",
                completion_window="24h",
                metadata={"description": filename},
            )

            # Show the response so the creation can be verified
            print(f"Batch {batch_name} created with description: {filename} - Response: {batch}")

            # Record the batch ID and persist the table right away
            record = pd.DataFrame({"filename": [filename], "batch_id": [batch.id]})
            created = pd.concat([created, record], ignore_index=True)
            created.to_csv(batch_ids_file, index=False)
            print(f"Batch ID for {filename} saved to {batch_ids_file}")

        except Exception as e:
            # Report the problem and move on to the next file
            print(f"Error creating batch for {filename}: {e}")

In [None]:
# Create batch jobs for files 1 through 15 of uploaded_file_ids.csv;
# start_file and end_file select which rows are submitted
create_batch_jobs("uploaded_file_ids.csv", start_file=1, end_file=15)

# For this step you need to know how many tokens you can process at the same time; the limit depends on your usage tier.

# Get the ID of each output file to download it sequentially

In [None]:
# Function to retrieve batch information and update the CSV with output_file_id
def update_batch_output_file_id(csv_file="batch_ids.csv"):
    """
    Retrieve batch information for each batch_id in the CSV and update the CSV with the output_file_id.
    If output_file_id already exists for a row, the row is skipped.

    The CSV is rewritten after every newly stored ID. A batch for which the API
    reports no output file yet is left empty so that a later run can fill it in.
    A failure on one batch is printed and the remaining batches are still processed.

    Args:
    - csv_file: The CSV file containing 'filename', 'batch_id', and optionally 'output_file_id'.
    """

    # Load the CSV file with batch IDs
    df = pd.read_csv(csv_file)

    # Check if the 'output_file_id' column exists, if not, create it
    if 'output_file_id' not in df.columns:
        df['output_file_id'] = None

    # A column read back from CSV with only empty cells is parsed as float64 (all NaN).
    # Writing a string ID into a float64 column is an incompatible-dtype assignment that
    # newer pandas versions warn about or reject, so hold the IDs in an object column.
    df['output_file_id'] = df['output_file_id'].astype(object)

    # Iterate over each row in the DataFrame
    for idx, row in df.iterrows():
        batch_id = row['batch_id']
        filename = row['filename']

        # Skip the row if the output_file_id is already populated
        if pd.notna(row['output_file_id']):
            print(f"Output file ID for {filename} already exists. Skipping.")
            continue

        try:
            # Retrieve the batch information from OpenAI API
            batch_info = client.batches.retrieve(batch_id)
            new_output_file_id = batch_info.output_file_id

            if new_output_file_id:
                # Store the ID and save immediately so progress survives an interruption
                df.at[idx, 'output_file_id'] = new_output_file_id
                df.to_csv(csv_file, index=False)
                print(f"Output file ID for {filename} saved to {csv_file}")
            else:
                print(f"Output file ID for {filename} is empty. Skipping.")

        except Exception as e:
            # Print error message and continue with the next batch
            print(f"Error retrieving batch {batch_id}: {e}")

In [None]:
# Look up every batch listed in batch_ids.csv and store its output_file_id when the API reports one
update_batch_output_file_id("batch_ids.csv")

# Downloading every resulting .jsonl file after the batch jobs have finished

In [None]:
import json

# Function to download multiple files from OpenAI API based on a CSV file and save them
def download_and_process_multiple_files(csv_file, download_folder):
    """
    Download multiple files from OpenAI API based on the IDs in a CSV file, save them as .jsonl files,
    and use the 'filename' column to name the saved files.

    Rows without an output file ID (empty cell, or no 'output_file_id' column at all,
    e.g. because no batch has finished yet) are reported and skipped. A failure on one
    file is printed and the remaining files are still processed.

    Args:
    - csv_file: The CSV file containing 'filename' and, once batches have finished, 'output_file_id'.
    - download_folder: The folder where the .jsonl files will be saved (created if missing).
    """
    # Create the download folder if it doesn't exist
    os.makedirs(download_folder, exist_ok=True)

    # Load the CSV file
    df = pd.read_csv(csv_file)

    for _, row in df.iterrows():
        filename = row['filename']
        # .get returns None when the column is absent, which is treated like an empty cell
        output_file_id = row.get('output_file_id')

        # Skip rows where output_file_id is missing or empty
        if pd.isna(output_file_id) or output_file_id == "":
            print(f"Output file ID for {filename} is missing. Skipping.")
            continue

        try:
            # Define the save path for the .jsonl file
            save_path = os.path.join(download_folder, filename)

            # Fetch the content first so that a failed download leaves no partial file behind
            file_response = client.files.content(output_file_id)

            # Save the retrieved content to a .jsonl file
            with open(save_path, "w", encoding="utf-8") as file:
                file.write(file_response.text)
            print(f"File {filename} downloaded and saved to {save_path}")

        except Exception as e:
            # Print error message if something goes wrong
            print(f"Error downloading file {filename} with ID {output_file_id}: {e}")

In [None]:
# Example usage:
csv_file = "batch_ids.csv"  # The CSV file containing 'filename' and 'output_file_id'
download_folder = "./resulting_batches/"  # The folder where files will be saved

# You might want to use a dedicated folder for this task

# Download every output file listed in the CSV into download_folder
download_and_process_multiple_files(csv_file, download_folder)

# Collecting the relevant information from every output .jsonl

In [None]:

def load_jsonl_to_dataframe(folder_path, id_column, content_column):
    """
    Loads all .jsonl files from a specified folder, extracts relevant data (custom_id and assistant content),
    and stores them in a DataFrame.

    Files are read as UTF-8 in sorted filename order so the row order is reproducible.
    Blank lines are ignored. A record is kept only when it has a custom_id and its first
    choice is an assistant message with non-empty content; records without a usable
    response (for example "response": null) are dropped instead of raising.

    Args:
    - folder_path: Path to the folder containing the .jsonl files.
    - id_column: The name of the column to store the 'custom_id' values.
    - content_column: The name of the column to store the 'content' from the assistant role.

    Returns:
    - A DataFrame with the specified columns filled with data from the .jsonl files.
      The two columns are present even when no record was collected.
    """

    # Initialize an empty list to hold the extracted data
    data = []

    # os.listdir order is arbitrary; sorting makes the result deterministic
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".jsonl"):
            continue
        file_path = os.path.join(folder_path, filename)

        # Read with an explicit encoding instead of the platform default
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Tolerate blank lines, e.g. a trailing newline at the end of the file
                if not line.strip():
                    continue

                record = json.loads(line)
                custom_id = record.get('custom_id')

                # `or {}` / `or []` also cover keys that are present but null
                response_body = (record.get('response') or {}).get('body') or {}
                choices = response_body.get('choices') or []

                # Extract the 'content' where the role is 'assistant'
                content = None
                if choices:
                    message = choices[0].get('message') or {}
                    if message.get('role') == 'assistant':
                        content = message.get('content')

                # Append to data if both custom_id and content are found
                if custom_id and content:
                    data.append({id_column: custom_id, content_column: content})

    # Naming the columns keeps the schema stable when data is empty
    output_df = pd.DataFrame(data, columns=[id_column, content_column])

    return output_df

In [None]:
# Gather the results from ./resulting_batches/: custom_id goes to 'post_id', the assistant content to 'reasoning'
output_df = load_jsonl_to_dataframe("./resulting_batches/", id_column="post_id", content_column="reasoning")

In [None]:
# Save the collected results to CSV, without the DataFrame index
output_df.to_csv("./complete_classification.csv", index=False)