Prepare data

In [None]:
import json
def convert_json_to_jsonl(input_file: str, output_file: str):
    # Load the JSON file
    with open(input_file, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    
    # Convert JSON to JSONL and write to output file
    with open(output_file, 'w', encoding='utf-8') as jsonl_file:
        for entry in json_data:
            jsonl_file.write(json.dumps(entry) + '\n')

input_json_file = 'data.json'  # Replace with your input file path
output_jsonl_file = 'data.jsonl'  # Replace with your output file path

convert_json_to_jsonl(input_json_file, output_jsonl_file)

Fine tuning teacher model

In [7]:
import os
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [9]:
# Upload Training 
training_file = client.files.create(
    file=open("data.jsonl", "rb"), purpose="fine-tune"
)
print("Training file id: ", training_file.id)

Training file id:  file-Ru1c5UJwxwCNVKBkze8I3I6F


In [10]:

# Create Fine-Tuning Job
suffix_name = "llm"
response = client.fine_tuning.jobs.create(
    training_file='file-Ru1c5UJwxwCNVKBkze8I3I6F',
    model="gpt-3.5-turbo",
    suffix=suffix_name,
)
print(response)

FineTuningJob(id='ftjob-9Rp36wVMvyizIpqtaFurkCW1', created_at=1727878288, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-uXFhRQZlPYdmwJvwmyYmTROo', result_files=[], seed=534979875, status='validating_files', trained_tokens=None, training_file='file-Ru1c5UJwxwCNVKBkze8I3I6F', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix='llm')


Check Finetuning jobs

In [8]:
response = client.fine_tuning.jobs.retrieve("ftjob-9Rp36wVMvyizIpqtaFurkCW1")
response

FineTuningJob(id='ftjob-9Rp36wVMvyizIpqtaFurkCW1', created_at=1727878288, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:imutably-oy:llm:ADudBVBQ', finished_at=1727879315, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-uXFhRQZlPYdmwJvwmyYmTROo', result_files=['file-E8uZSc8IGReo70k5bIaSvbWJ'], seed=534979875, status='succeeded', trained_tokens=335838, training_file='file-Ru1c5UJwxwCNVKBkze8I3I6F', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix='llm')

In [9]:
fine_tuned_model_id = response.fine_tuned_model
print('\nFine tuned model id: ', fine_tuned_model_id)


Fine tuned model id:  ft:gpt-3.5-turbo-0125:imutably-oy:llm:ADudBVBQ


Use FineTuning Models

In [19]:
import csv
import os
import json
import aiohttp
import asyncio
from dotenv import load_dotenv

class HANDLER:
    @staticmethod
    def remove_empty_lines(content):
        lines = content.split("\n")
        cleaned_lines = [line.strip() for line in lines if line.strip()]
        return "\n".join(cleaned_lines)

    @staticmethod
    async def call_api_teacher(data_safety_content, privacy_policy_content, retries=3, delay=5):
        url = "https://api.openai.com/v1/chat/completions"
        
        headers = {
            "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",  # Use OpenAI API key
            "Content-Type": "application/json"
        }

        # Payload for OpenAI's GPT model
        payload = json.dumps({
            "model": "ft:gpt-3.5-turbo-0125:imutably-oy:llm:ADudBVBQ",  # Ensure this model ID is correct
            "temperature": 0.5,  
            "messages": [
                {
                    "role": "system",
                    "content": "You are an expert in labeling the content comparison of the Data Safety and Privacy Policy for an Android application."
                },
                {
                    "role": "user",
                    "content": f"Based on the content of the Data Safety and Privacy Policy. Please label according to the following rule:\n> Label 1: Incorrect if the content is mentioned in Data Safety but not in the Privacy Policy. Correct if the content is mentioned in both documents.\n> Label 2: Incomplete if the content is mentioned in Data Safety but not in the Privacy Policy. Complete if the content is mentioned in both documents.\nThe contents of the two documents are below.\n> Data Safety: {data_safety_content}\n> Privacy Policy: {privacy_policy_content}\nNote: The shortest answer and no explanation needed, in the format: {{label 1: Incorrect or Correct, label 2: Incomplete or Complete}}"
                }
            ]
        })

        timeout = aiohttp.ClientTimeout(total=60)  # Increase timeout to prevent premature failures

        # Retry loop
        for attempt in range(retries):
            try:
                async with aiohttp.ClientSession(timeout=timeout) as session:
                    async with session.post(url, headers=headers, data=payload) as response:
                        # Check if the response content type is JSON
                        if response.headers['Content-Type'] == 'application/json':
                            response_json = await response.json()
                            if 'error' in response_json:
                                return f"Error: {response_json['error']}"
                            
                            # Extracting the content from the message in choices array
                            content = response_json["choices"][0]["message"]["content"]
                            return content
                        else:
                            # If not JSON, read the response as text
                            response_text = await response.text()
                            return f"Unexpected response: {response_text}"
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < retries - 1:
                    await asyncio.sleep(delay)  # Wait before retrying
                else:
                    return f"Error: Request failed after {retries} retries."

    @staticmethod
    async def loop_csv(input_csv_path, output_csv_path):
        with open(input_csv_path, "r", newline="", encoding="utf-8") as csvfile, open(
            output_csv_path, "w", newline="", encoding="utf-8"
        ) as outputfile:

            reader = csv.reader(csvfile)
            writer = csv.writer(outputfile)

            # Write custom header row
            writer.writerow(["App_id", "App_pkg", "Data_safety_content", "Privacy_policy_content", "Result"])

            for index, row in enumerate(reader):
                print(f"\n_____________ Run times {index + 1} <{row[0]}> _____________")
                
                # Extracting required data
                app_id = row[0]
                app_pkg = row[1]
                data_safety_content = row[3]
                privacy_policy_content = row[4]

                # Asynchronous API call with retry and timeout handling
                assistant_reply = await HANDLER.call_api_teacher(data_safety_content, privacy_policy_content)

                # Check for errors in the API response and write to CSV
                if "Error" in assistant_reply:
                    writer.writerow([app_id, app_pkg, data_safety_content, privacy_policy_content, "Error"])
                else:
                    writer.writerow([app_id, app_pkg, data_safety_content, privacy_policy_content, assistant_reply])
            
                print("~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~\n")

In [20]:
async def main():
    load_dotenv() 
    print(os.getcwd())
    input_csv_path = "../../data/phase-01/200v2.csv" 
    output_csv_path = "../../output/phase-01/experiment-02/output-teacher.csv"
    await HANDLER().loop_csv(input_csv_path, output_csv_path)
   

await main()

d:\nckh\conf-llm-nlp\src\experiment-02

_____________ Run times 1 <app_id> _____________
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_____________ Run times 2 <1> _____________
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_____________ Run times 3 <1> _____________
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_____________ Run times 4 <2> _____________
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_____________ Run times 5 <3> _____________
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_____________ Run times 6 <6> _____________
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_____________ Run times 7 <6> _____________
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_____________ Run times 8 <7> _____________
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_____________ Run times 9 <31> _____________
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_____________ Run times 10 <31> _____________
Attempt 1 failed: 
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_____________ Run times 11 <33> _____________
~~~~~~~~~~~~~~ Success ~~~~~~~~~~~~~~


_______