In [None]:
pip install openai

Collecting openai
  Downloading openai-1.9.0-py3-none-any.whl (223 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.4/223.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.26.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting typing-extensions<5,>=4.7 (from openai)
  Downloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0

In [12]:
import json
import os
from pathlib import Path
from typing import Dict, Optional, Tuple, Union

import pandas as pd
from google.colab import drive
from openai import OpenAI
from sklearn.model_selection import train_test_split

In [None]:

# Project folder inside the mounted Google Drive.
gdrive_path='/content/gdrive/MyDrive/nlp'

# Mount Google Drive under /content/gdrive; force_remount=True requests a fresh mount.
drive.mount('/content/gdrive', force_remount=True)

# Switch the working directory to the project folder (only valid once the drive is mounted).
os.chdir(gdrive_path)



Mounted at /content/gdrive


# GPT Fine Tuning

### Creating fine-tuning data

In [None]:
# Root folder of the project on the mounted drive.
base_path = '/content/gdrive/MyDrive/nlp'

# Path object so the data files below can be addressed with the `/` operator.
project_dir = Path(base_path)

In [15]:
# Load the preprocessed training set and the preprocessed gold-standard set (used as test data).
trainings_data = pd.read_csv(project_dir / "training_data_preprocessed.csv")
test_data = pd.read_csv(project_dir / "gold_standard_preprocessed.csv")


In [18]:
# Sanity check: (rows, columns) of the training set.
trainings_data.shape

(1669, 4)

In [19]:
# Sanity check: (rows, columns) of the test set.
test_data.shape

(266, 4)

In [35]:
def create_gpt_prompt(row: pd.Series) -> Dict[str, list]:
    """
    Build one chat-format fine-tuning example from a DataFrame row.

    The example consists of three messages: a fixed system instruction, a user
    message combining the process description with the legal text, and an
    assistant message holding the label as text.

    Parameters:
        row (pd.Series): A row providing 'Process_description', 'Text' and 'Label'.

    Returns:
        dict: ``{"messages": [system, user, assistant]}``.
    """
    # The instruction is kept verbatim; it must match the text used at inference time.
    system_content = (
        "You are an business compliance expert.\n"
        "Your task is to determine if the legal text is relevant to the process description.\n"
        "Your answer needs to be either 0=Not Relevant or 1=Relevant."
    )

    process_description = row["Process_description"]
    legal_text = row["Text"]
    label = row["Label"]

    system_turn = {"role": "system", "content": system_content}
    user_turn = {
        "role": "user",
        "content": f"Process Description: {process_description}.\nLegal Text to Classify: {legal_text}\n",
    }
    # The expected answer is the label rendered as a string.
    assistant_turn = {"role": "assistant", "content": f"{label}"}

    return {"messages": [system_turn, user_turn, assistant_turn]}

In [34]:
def create_gpt_test_prompt(row: pd.Series) -> Dict[str, list]:
    """
    Build the chat prompt used to query the model for one DataFrame row.

    Unlike a training example, the prompt has no assistant message: it holds
    only the fixed system instruction and the user message to classify.

    Parameters:
        row (pd.Series): A row providing 'Process_description' and 'Text'.

    Returns:
        dict: ``{"messages": [system, user]}``.
    """
    # The instruction is kept verbatim; it must match the text used for fine-tuning.
    system_content = (
        "You are an business compliance expert.\n"
        "Your task is to determine if the legal text is relevant to the process description.\n"
        "Your answer needs to be either 0=Not Relevant or 1=Relevant."
    )

    process_description = row["Process_description"]
    legal_text = row["Text"]

    conversation = [
        {"role": "system", "content": system_content},
        {
            "role": "user",
            "content": f"Process Description: {process_description}.\nLegal Text to Classify: {legal_text}\n",
        },
    ]
    return {"messages": conversation}

In [25]:
def _write_jsonl(path: Path, records: list) -> None:
    """Write each record to `path` as one JSON object per line (JSONL)."""
    with open(path, "w", encoding="utf-8") as file:
        for record in records:
            file.write(f"{json.dumps(record)}\n")


def create_gpt3_5_fine_tuning_data(
    data: pd.DataFrame, test_size: float = 0.20, random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates GPT-3.5 fine-tuning data from a given DataFrame.

    Every row is turned into a chat-format example with `create_gpt_prompt`,
    the rows are split into training and validation parts, and both parts are
    written as JSONL files into the `gpt` folder under `project_dir`
    (`train_gpt_finetuning.jsonl` and `val_gpt_finetuning.jsonl`). The folder
    is created if it does not exist yet.

    Note: the `gpt_prompt` column is added to `data` in place, so the caller's
    DataFrame carries it after the call as well.

    Parameters:
        data (pd.DataFrame): The DataFrame containing the data.
        test_size (float): Fraction of rows put into the validation split.
        random_state (int): Seed passed to `train_test_split` for a reproducible split.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the training and validation DataFrames.
    """
    data["gpt_prompt"] = data.apply(create_gpt_prompt, axis=1)
    train, val = train_test_split(data, test_size=test_size, random_state=random_state)

    # Make sure the target folder exists; otherwise open() fails after the split was computed.
    output_dir = project_dir / "gpt"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the formatted data
    _write_jsonl(output_dir / "train_gpt_finetuning.jsonl", train["gpt_prompt"].tolist())
    _write_jsonl(output_dir / "val_gpt_finetuning.jsonl", val["gpt_prompt"].tolist())

    return train, val

In [26]:
# Build the prompts, split into train/validation, and write both JSONL files to disk.
train, val = create_gpt3_5_fine_tuning_data(trainings_data)

In [29]:
# Inspect the first training example to check the generated chat format.
train.iloc[0]["gpt_prompt"]

{'messages': [{'role': 'system',
   'content': 'You are an business compliance expert.\nYour task is to determine if the legal text is relevant to the process description.\nYour answer needs to be either 0=Not Relevant or 1=Relevant.'},
  {'role': 'user',
   'content': 'Process Description: this process be about the right to rectify from the general data protection regulation which mean every datum subject have the right to rectify its datum the process start with a rectify request the service provider check the list of datum to be correct and correct the datum the rectification be be communicate to the datum subject.\nLegal Text to Classify: the supervisory authority shall communicate those list to the board refer to in article 68\n'},
  {'role': 'assistant', 'content': '0'}]}

I used the OpenAI fine-tuning dashboard to upload the training and validation files and to fine-tune GPT-3.5 Turbo.

### Prompting Fine-Tuned model to get results

In [36]:
# Build one inference prompt (system + user message, no assistant answer) per test row.
test_data["gpt_prompt"] = test_data.apply(create_gpt_test_prompt, axis=1)

In [39]:
# Inspect the first test prompt to check the generated chat format.
test_data.iloc[0]["gpt_prompt"]

{'messages': [{'role': 'system',
   'content': 'You are an business compliance expert.\nYour task is to determine if the legal text is relevant to the process description.\nYour answer needs to be either 0=Not Relevant or 1=Relevant.'},
  {'role': 'user',
   'content': "Process Description: the process for a travel insurance claim involve several important step to ensure a fair and accurate resolution for both the insured traveler and the insurance provider first when a traveler contact the insurance provider to report a claim the provider will typically ask for detailed information about the event include the necessary documentation to support the claim next the insurance provider will review the policy coverage to determine if the claim fall under the policy 's cover event if the event be cover the insurance company will then assess the claim and the support documentation to determine the amount of coverage if any that the traveler be entitle to receive depend on the complexity of th

In [45]:
# Read the API key from the OPENAI_API_KEY environment variable instead of embedding it
# in the notebook. A missing variable raises KeyError here rather than failing later.
# NOTE: a key that was ever committed in a notebook must be treated as leaked and revoked.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [72]:
def prompt_fine_tuned_model(
    prompt_json: Union[str, dict],
    model: str = "ft:gpt-3.5-turbo-1106:personal::8kc0r491",
    temperature: float = 0.2,
) -> Optional[str]:
    """
    Prompts a fine-tuned chat model and returns its single-token answer.

    Parameters:
        prompt_json (str | dict): The prompt, either as a JSON string or as the
            already-decoded dictionary. It must contain a "messages" list.
        model (str): Identifier of the model to query.
        temperature (float): Sampling temperature passed to the API.

    Returns:
        Optional[str]: The stripped response text, or None when the request
        fails or the response carries no content.

    Raises:
        json.JSONDecodeError: If `prompt_json` is a string that is not valid JSON.
    """
    # Accept an already-decoded prompt too, so callers need not round-trip through JSON.
    if isinstance(prompt_json, (str, bytes, bytearray)):
        prompt_dict = json.loads(prompt_json)
    else:
        prompt_dict = prompt_json

    try:
        response = client.chat.completions.create(
            model=model,
            messages=prompt_dict["messages"],
            max_tokens=1,  # the expected answer is a single label token
            temperature=temperature,
        )
        print("Response from API:", response)  # Print the entire response

        content = response.choices[0].message.content
        if content is None:
            # Report the empty answer explicitly instead of relying on an AttributeError.
            print("An error occurred: the response contained no content")
            return None
        return content.strip()
    except Exception as e:
        # Best effort: one failed request must not abort a whole batch of predictions.
        print(f"An error occurred: {e}")
        return None

In [50]:
# Preview the prompt column before sending it to the model.
test_data["gpt_prompt"]

0      {'messages': [{'role': 'system', 'content': 'Y...
1      {'messages': [{'role': 'system', 'content': 'Y...
2      {'messages': [{'role': 'system', 'content': 'Y...
3      {'messages': [{'role': 'system', 'content': 'Y...
4      {'messages': [{'role': 'system', 'content': 'Y...
                             ...                        
261    {'messages': [{'role': 'system', 'content': 'Y...
262    {'messages': [{'role': 'system', 'content': 'Y...
263    {'messages': [{'role': 'system', 'content': 'Y...
264    {'messages': [{'role': 'system', 'content': 'Y...
265    {'messages': [{'role': 'system', 'content': 'Y...
Name: gpt_prompt, Length: 266, dtype: object

In [75]:
# Query the fine-tuned model once per test prompt and store the returned answer per row.
test_data["GPT_Prediction"] = test_data["gpt_prompt"].map(
    lambda prompt: prompt_fine_tuned_model(json.dumps(prompt))
)

Response from API: ChatCompletion(id='chatcmpl-8kdgoNrk06CXNLdR3ow0N64tiixHw', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='0', role='assistant', function_call=None, tool_calls=None))], created=1706126346, model='ft:gpt-3.5-turbo-1106:personal::8kc0r491', object='chat.completion', system_fingerprint='fp_2dae0bfd7e', usage=CompletionUsage(completion_tokens=1, prompt_tokens=271, total_tokens=272))
Response from API: ChatCompletion(id='chatcmpl-8kdgoKPrcfHyX1SJx74BQOuXsN56W', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='1', role='assistant', function_call=None, tool_calls=None))], created=1706126346, model='ft:gpt-3.5-turbo-1106:personal::8kc0r491', object='chat.completion', system_fingerprint='fp_2dae0bfd7e', usage=CompletionUsage(completion_tokens=1, prompt_tokens=249, total_tokens=250))
Response from API: ChatCompletion(id='chatcmpl-8kdgoYBmbNIcQKaTkYlgYztghYrUS', choices

In [76]:
# Preview the collected predictions.
test_data['GPT_Prediction']

0      0
1      1
2      1
3      0
4      1
      ..
261    0
262    0
263    0
264    0
265    0
Name: GPT_Prediction, Length: 266, dtype: object

Checking for robustness

Saving results for comparison

In [78]:
# Persist the test set, including the gpt_prompt and GPT_Prediction columns added above,
# as a CSV in the project folder (index=False omits the row index).
test_data.to_csv(project_dir / "test_data_gpt_prediction.csv", index=False)