In [3]:
import openai

In [5]:
import pandas as pd
data = pd.read_excel("../dataset/preprocessed_updated_20_search_and_cf_data-2.xlsx")
data.columns

Index(['veh_model', 'veh year', 'veh_loc', 'veh_mile', 'cust_complaint',
       'repr_comments', 'cmpnt_cat_desc', 'cmpnt_code', 'cmpnt_symp_txt',
       'TREAD_cat'],
      dtype='object')

In [7]:
data["combine_cust_rep"] = data['cust_complaint'] + " " + data["repr_comments"]
# data["combine_cust_rep"] = data.drop(['Damage Category Description'],1) # we  removed damage category. to prevent system will properly categories

In [8]:
data["combine_cust_rep"]

1     the airbag is making a buzzing noise very anno...
2     my audis fuel gauge is stuck showing full even...
4     the air suspension system of my audi seems to ...
5     the air suspension system of my audi seems to ...
7     my audis fuel gauge is stuck showing full even...
8     the air suspension system of my audi seems to ...
9     my audis fuel gauge is stuck showing full even...
10    my audis fuel gauge is stuck showing full even...
11    the air suspension system of my audi seems to ...
12    the air suspension system of my audi seems to ...
13    my audis fuel gauge is stuck showing full even...
14    the air suspension system of my audi seems to ...
15    my audis fuel gauge is stuck showing full even...
16    having blinking problem with cabin light at in...
17    light of interior cabin does not turn on repla...
Name: combine_cust_rep, dtype: object

In [9]:
formatted_examples = []
for _, row in data.iterrows():
    json_response = row["combine_cust_rep"]
    formatted_example = {"prompt": json_response, "completion": str(row["cmpnt_cat_desc"])}
    formatted_examples.append(formatted_example) 

In [10]:
formatted_examples

  'completion': 'Airbag'},
 {'prompt': 'the airbag is making a buzzing noise very annoying fixed a loose wiring of the airbag assembly ',
  'completion': 'Airbag'},
 {'prompt': 'my audis fuel gauge is stuck showing full even when its not the fuel gauge sticking issue was due to a faulty sender unit which has now been replaced gauge readings should now be accurate',
  'completion': 'Fuel Gauge'},
  'completion': 'Airbag'},
 {'prompt': 'the air suspension system of my audi seems to be malfunctioning air suspension fault traced to a leaky air strut the strut has been replaced resolving the sagging and rough ride issues',
  'completion': 'Air Suspension'},
 {'prompt': 'the air suspension system of my audi seems to be malfunctioning air suspension fault traced to a leaky air strut the strut has been replaced resolving the sagging and rough ride issues',
  'completion': 'Air Suspension'},
  'completion': 'Airbag'},
 {'prompt': 'my audis fuel gauge is stuck showing full even when its not the 

In [None]:
## dataset sampling 

In [11]:
from sklearn.model_selection import train_test_split

# Stratified splitting. Assuming 'Top Category' can be used for stratification
train_data, val_data = train_test_split(
    formatted_examples,
    test_size=0.30,
    random_state=42  # for reproducibility
)


In [13]:
import json
def write_to_jsonl(data, file_path):
    with open(file_path, 'w') as file:
        for entry in data:
            json.dump(entry, file)
            file.write('\n')
            
training_file_name = "train.jsonl"
validation_file_name = "val.jsonl"
write_to_jsonl(train_data, training_file_name)
write_to_jsonl(val_data, validation_file_name)

In [14]:
len(train_data), len(val_data), len(test_data)

(12, 4, 2)

In [15]:
from openai import OpenAI
client = OpenAI(api_key="")

# Upload Training and Validation Files
training_file = client.files.create(
    file=open(training_file_name, "rb"), purpose="fine-tune"
)
validation_file = client.files.create(
    file=open(validation_file_name, "rb"), purpose="fine-tune"
)

# Create Fine-Tuning Job
suffix_name = "VWCFCAT"
response = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    validation_file=validation_file.id,
    model="babbage-002",
    suffix=suffix_name,
    hyperparameters={
    "n_epochs":20
  }
)


In [None]:
## calculating test accuracy accuracy

In [None]:
## test data preparation

In [1]:
import pandas as pd
test_data = pd.read_excel("../dataset/output direct compalint and repr matching.xlsx")
test_data.columns

Index(['veh_model', 'veh year', 'veh_loc', 'veh_mile', 'cust_complaint',
       'repr_comments', 'combined_cust_repr', 'Matched cmpnt_cat_desc',
       'Matched cmpnt_symp_txt', 'Matched cmpnt_code'],
      dtype='object')

In [2]:
test_data["combine_cust_rep"] = test_data['cust_complaint'] + " " + test_data["repr_comments"]

In [3]:
test_formatted_examples = []
for _, row in test_data.iterrows():
    json_response = row["combine_cust_rep"]
    formatted_example = {"prompt": json_response, "completion": str(row["Matched cmpnt_cat_desc"])}
    test_formatted_examples.append(formatted_example) 

In [4]:
test_prompts = list()
real_completion = list()
for jdata in test_formatted_examples:
    test_prompts.append(jdata['prompt'])
    real_completion.append(jdata['completion'])

In [5]:
test_prompts

['the airbag light is blinking intermittently fixed a loose connection in the airbag circuit',
 'the fuel level indicator gets stuck after refueling adjusted the fuel sensor for accurate readings',
 'bad noise coming out of the airbag repaired electrical problem',
 'theres a noticeable delay in air suspension adjustment when changing driving modes replaced the air suspensions compressor for optimal performance',
 'the fuel level indicator gets stuck after refueling adjusted the fuel sensor for accurate readings',
 'theres a noticeable delay in air suspension adjustment when changing driving modes replaced the air suspensions compressor for optimal performance',
 'im experiencing a flickering issue with interior cabin light checked fuse and wiring replaced faulty cabin light switch flickering issue resolved',
 'light of interior cabin does not turn on replaced light bulb of the interior cabin that solved the problem']

In [6]:
from openai import OpenAI

# Initialize the OpenAI client
client = OpenAI(api_key="")

# Load the fine-tuned model
model_id = ""  # Adjust the suffix_name if necessary
model = client.models.retrieve(model_id)

In [11]:
# Empty list to store predictions
pred_completion = []

# Loop through each prompt in the list
for prompt_to_pred in test_prompts:
    # Generate predictions for each prompt
    response = client.completions.create(
        model=model_id,
        prompt=prompt_to_pred,
        max_tokens=3,  # Adjust as needed
    )

    # Assuming the response is successful and contains at least one choice,
    # extract the predicted text and add it to the predictions list
    if response.choices:
        predicted_text = response.choices[0].text.strip()
        pred_completion.append(predicted_text)

In [12]:
len(pred_completion), len(real_completion)

(8, 8)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from sklearn import metrics

# Generate classification report
print("Classification Report:")
print(classification_report(real_completion, pred_completion))

# Calculate overall accuracy using accuracy_score
accuracy = accuracy_score(real_completion, pred_completion)
# Precision
precision = metrics.precision_score(real_completion, pred_completion, average = 'weighted' )  # Use 'binary' for binary classification
# Recall
recall = metrics.recall_score(real_completion, pred_completion , average = 'weighted' )  # Use 'binary' for binary classification
# F1-Score
f1_score = metrics.f1_score(real_completion, pred_completion , average = 'weighted' )  # Use 'binary' for binary classification
print(f"Overall Accuracy: {accuracy:.4f}")
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1_score:.4f}')
print(f'Accuracy: {accuracy:.4f}\n\n')


Classification Report:
                           precision    recall  f1-score   support

           Air Suspension       0.00      0.00      0.00         2
Air Suspension Suspension       0.00      0.00      0.00         0
                   Airbag       0.00      0.00      0.00         2
             Airbag Light       0.00      0.00      0.00         0
                 Airbag T       0.00      0.00      0.00         0
               Fuel Gauge       0.00      0.00      0.00         2
     Interior Cabin Light       0.50      1.00      0.67         2

                 accuracy                           0.25         8
                macro avg       0.07      0.14      0.10         8
             weighted avg       0.12      0.25      0.17         8

Overall Accuracy: 0.2500
Precision: 0.1250
Recall: 0.2500
F1-Score: 0.1667
Accuracy: 0.2500




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
pred_completion

['Airbag Light',
 'Interior Cabin Light',
 'Airbag T',
 'Air Suspension Suspension',
 'Interior Cabin Light',
 'Air Suspension Suspension',
 'Interior Cabin Light',
 'Interior Cabin Light']

In [15]:
real_completion

['Airbag',
 'Fuel Gauge',
 'Airbag',
 'Air Suspension',
 'Fuel Gauge',
 'Air Suspension',
 'Interior Cabin Light',
 'Interior Cabin Light']