In [None]:
!pip install sqlalchemy==1.4.22
!pip install arviz==0.16.0
!pip install chex==0.1.7
!pip install cloudpathlib==0.16.0
!pip install flax==0.8.0
!pip install ibis-framework==7.1.0
!pip install librosa==0.10.0
!pip install openai==1.11.1
!pip install pydantic==1.9.0
!pip install python-utils==3.8.2

!pip uninstall typing_extensions
!pip install typing_extensions

!pip install fastapi kaleido python-multipart uvicorn cohere tiktoken
!pip install lida==0.0.10

!pip install typing-extensions==3.10.0.2

!pip install openai

In [38]:
from openai import OpenAI

In [39]:
import pandas as pd

In [40]:
# Load the complaint tickets from the CSV at this absolute path and preview the first rows.
df = pd.read_csv('/content/bank_complaint.csv')
df.head()

Unnamed: 0,ticket,subclass,class,class id
0,I had recently received a check for a settleme...,Problems caused by my funds being low,bank_account,1
1,"At Bank Of America, I have had my personal & b...",Problems caused by my funds being low,bank_account,1
2,"I Bank with Wells fargo, I went in to deposit ...",Problems caused by my funds being low,bank_account,1
3,Charges for my purchases were shown pending on...,Problems caused by my funds being low,bank_account,1
4,I made purchases after my direct deposit was m...,Problems caused by my funds being low,bank_account,1


In [41]:
df['class'].value_counts()

bank_account    100
credit_card     100
loan            100
mortgage        100
Name: class, dtype: int64

In [42]:
df['subclass'].value_counts()

Student loan                               56
Consumer Loan                              43
Conventional fixed mortgage                25
Managing an account                        25
FHA mortgage                               24
Using a debit or ATM card                  22
Problems caused by my funds being low      20
VA mortgage                                20
Customer service                           17
Conventional adjustable mortgage (ARM)     17
Deposits and withdrawals                   15
Billing disputes                           15
Advertising and marketing                  15
Conventional home mortgage                 14
Payoff process                             13
Fraud                                      12
Account opening, closing, or management     7
Company charging the account                7
Cancelling account                          6
Rewards                                     6
Problem caused by your funds being low      4
Credit determination              

The classes have an equal number of samples, but the subclasses do not: they are imbalanced. Let's first proceed with the original dataset. In the second part, we will either remove the samples belonging to the less frequent subclasses or add synthetic data for those subclasses.

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows, stratified on `class`; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.30, stratify = df['class'], random_state = 42)

# Split the 30% hold-out in half into validation and test sets, again stratified on `class`.
val_df, test_df = train_test_split(val_df, test_size= 0.5, stratify = val_df['class'], random_state=42)

# Row counts: full frame, train, validation, test.
print(len(df), len(train_df), len(val_df), len(test_df))

400 280 60 60


In [44]:
# The splits are stratified on `class`, so each split keeps an equal number of samples per class (but not per subclass).

In [45]:
def gpt35_data_format(df):   # it is available in https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
  """Convert labelled tickets into chat-format fine-tuning examples.

  Args:
    df: DataFrame with string columns 'ticket', 'class' and 'subclass'.

  Returns:
    A list with one dict per row, shaped as
    {"messages": [user message holding the ticket text,
                  assistant message holding a JSON string with the
                  "class" and "subclass" labels]}.
  """
  import json  # local import: this cell runs before the notebook's first `import json`

  dataset_json = []
  for _, row in df.iterrows():
    # json.dumps escapes quotes, backslashes and newlines inside the labels, so the
    # assistant target is always valid JSON (hand-built concatenation was not).
    # ensure_ascii=False keeps non-ASCII label text as-is, matching the previous output.
    response = json.dumps({"class": row['class'], "subclass": row['subclass']}, ensure_ascii=False)
    dataset_json.append({"messages": [{"role": "user", "content": row['ticket']},{"role": 'assistant',"content": response}]})
  return dataset_json

# Build chat-format examples for each split.
# Note: test_data is built here, but only the train and validation sets are written to JSONL in the cells shown.
train_data = gpt35_data_format(train_df)
val_data = gpt35_data_format(val_df)
test_data = gpt35_data_format(test_df)


In [46]:
train_data[0]

{'messages': [{'role': 'user',
  {'role': 'assistant',
   'content': '{"class": "bank_account", "subclass": "Managing an account"}'}]}

In [47]:
import json
json.loads(train_data[0]["messages"][-1]["content"])

{'class': 'bank_account', 'subclass': 'Managing an account'}

JSONL (JSON Lines) format: one JSON object per line

In [48]:
import json
def json_new_line(data, file_name):
  """Write every item of `data` to `file_name` as JSONL.

  The file is (re)created, and each item is serialised as one JSON
  document followed by a newline.
  """
  with open(file_name, 'w') as jsonl_out:
    for record in data:
      json.dump(record, jsonl_out)
      jsonl_out.write('\n')

# Output paths (relative to the working directory) for the fine-tuning files.
train_file = 'train.jsonl'
val_file = 'val.jsonl'

# Write the training and validation examples as JSONL.
json_new_line(train_data, train_file)
json_new_line(val_data, val_file)

In [49]:
import openai

Uploading training and validation files

In [None]:
import os
from getpass import getpass

from openai import OpenAI

# Take the key from the OPENAI_API_KEY environment variable, or prompt for it,
# so that no secret is ever typed into (and saved with) the notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: "))  # generate API key from OpenAI website

# Upload inside `with` blocks so each file handle is closed as soon as its upload returns.
with open(train_file, "rb") as train_handle:
  training_file = client.files.create(
    file=train_handle,
    purpose="fine-tune"
  )

with open(val_file, "rb") as val_handle:
  validation_file = client.files.create(
    file=val_handle,
    purpose="fine-tune"
  )

# The file IDs are what the fine-tuning job is created from.
print("Training File ID: ", training_file.id)
print("Validation File ID: ", validation_file.id)

Creating fine-tuned Model

In [None]:
# Suffix passed to the fine-tuning job below.
suffix_name = 'fine-tune-gpt3.5'

# Start a fine-tuning job on gpt-3.5-turbo from the uploaded training and validation files.
response = client.fine_tuning.jobs.create(
  training_file=training_file.id,
  validation_file=validation_file.id,
  model="gpt-3.5-turbo",
  suffix=suffix_name
)
# Display the job object returned by the API.
response

In [None]:
client.fine_tuning.jobs.list(limit=5)

In [None]:
# Fetch a fine-tuning job by a hard-coded ID and display it.
# NOTE(review): this ID looks specific to one earlier run; presumably it should be the ID of
# the job created above — confirm before re-running.
response = client.fine_tuning.jobs.retrieve("ftjob-Lcx2e7jcqBGcCcPlRtpsRzVZ")
response

In [None]:
# Model name taken from the retrieved job; used for all predictions below.
# NOTE(review): presumably only populated once the job has finished — TODO confirm.
fine_tuned_model_id = response.fine_tuned_model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def format_test(row):
    """Wrap the row's 'ticket' text as a single user message in chat format."""
    return [{"role": "user", "content": row['ticket']}]

def predict(test_messages, fine_tuned_model_id):
    """Send `test_messages` to the given model and return the first choice's text.

    Uses the module-level `client`, with temperature 0 and at most 50 output tokens.
    """
    completion = client.chat.completions.create(
        model=fine_tuned_model_id,
        messages=test_messages,
        temperature=0,
        max_tokens=50,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def store_predictions(test_df, fine_tuned_model_id):
    """Predict every ticket in `test_df` and save the result to predictions.csv.

    A 'Prediction' column is added to `test_df` in place and filled row by row;
    the frame is written to "predictions.csv" after the last row.
    """
    test_df['Prediction'] = None
    for row_label, ticket_row in test_df.iterrows():
        test_df.at[row_label, 'Prediction'] = predict(format_test(ticket_row), fine_tuned_model_id)

    test_df.to_csv("predictions.csv")

store_predictions(test_df, fine_tuned_model_id)

In [None]:
# Reload the saved predictions and preview them.
# NOTE(review): store_predictions writes the relative path "predictions.csv", while this reads
# '/content/predictions.csv' — the two only match when the working directory is /content.
preds = pd.read_csv('/content/predictions.csv')
preds.head()

Unnamed: 0.1,Unnamed: 0,ticket,subclass,class,class id,Prediction
0,201,"""I consolidated my loans several years ago for...",Student loan,loan,3,"{""class"": ""loan"", ""subclass"": ""Student loan""}"
1,267,This is in regards to a private student loan f...,Student loan,loan,3,"{""class"": ""loan"", ""subclass"": ""Student loan""}"
2,63,I had recently received a check for a settleme...,Problems caused by my funds being low,bank_account,1,"{""class"": ""bank_account"", ""subclass"": ""Using a..."
3,57,Opted out of overdraft coverage to keep from i...,Problems caused by my funds being low,bank_account,1,"{""class"": ""bank_account"", ""subclass"": ""Problem..."
4,350,My current mortgage company - Greentree Servic...,Conventional fixed mortgage,mortgage,4,"{""class"": ""mortgage"", ""subclass"": ""FHA mortgage""}"


In [None]:
import ast
import json


def _parse_prediction(raw):
    """Return the model reply as a dict, or None when it cannot be parsed.

    The model is trained to answer with a JSON object, so JSON is tried first;
    ast.literal_eval is kept as a fallback for replies that are a Python-style
    dict literal. Missing values (e.g. NaN from the CSV) and truncated replies
    yield None.
    """
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        try:
            parsed = ast.literal_eval(raw)
        except (TypeError, ValueError, SyntaxError):
            return None
    return parsed if isinstance(parsed, dict) else None


correct_class, correct_subclass = 0, 0
for i, row in preds.iterrows():
    # Parse once per row; an unparseable reply counts as wrong for both labels
    # instead of aborting the whole evaluation.
    predicted = _parse_prediction(row['Prediction'])
    if predicted is None:
        continue
    if row['class'] == predicted.get('class'):
        correct_class += 1
    if row['subclass'] == predicted.get('subclass'):
        correct_subclass += 1

# Accuracy as a whole-number percentage of all rows in `preds`.
print("Accuracy of Class: ", round(100 * correct_class/len(preds)))
print("Accuracy of Subclass: ", round(100 * correct_subclass/len(preds)))

Accuracy of Class:  95
Accuracy of Subclass:  58


# Method 2: Removing Rows corresponding to less frequent subclasses

In [52]:
import pandas as pd
df = pd.read_csv('/content/bank_complaint.csv')

In [53]:
# Keep only rows whose subclass occurs more than 20 times; `df` is overwritten with the filtered frame.
df = df.groupby('subclass').filter(lambda x: len(x)>20)

In [54]:
df['subclass'].value_counts()

Student loan                   56
Consumer Loan                  43
Managing an account            25
Conventional fixed mortgage    25
FHA mortgage                   24
Using a debit or ATM card      22
Name: subclass, dtype: int64

In [55]:
df['class'].value_counts()

loan            99
mortgage        49
bank_account    47
Name: class, dtype: int64

In [56]:
df.groupby('class').nunique()

Unnamed: 0_level_0,ticket,subclass,class id
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_account,47,2,1
loan,99,2,1
mortgage,49,2,1


In [57]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the filtered rows, stratified on `class`; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.30, stratify = df['class'], random_state = 42)

# Split the 30% hold-out in half into validation and test sets, again stratified on `class`.
val_df, test_df = train_test_split(val_df, test_size= 0.5, stratify = val_df['class'], random_state=42)

# Row counts: filtered frame, train, validation, test.
print(len(df), len(train_df), len(val_df), len(test_df))

195 136 29 30


In [58]:
def gpt35_data_format(df):   # it is available in https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
  """Convert labelled tickets into chat-format fine-tuning examples.

  Args:
    df: DataFrame with string columns 'ticket', 'class' and 'subclass'.

  Returns:
    A list with one dict per row, shaped as
    {"messages": [user message holding the ticket text,
                  assistant message holding a JSON string with the
                  "class" and "subclass" labels]}.
  """
  import json  # local import keeps this cell runnable on its own

  dataset_json = []
  for _, row in df.iterrows():
    # json.dumps escapes quotes, backslashes and newlines inside the labels, so the
    # assistant target is always valid JSON (hand-built concatenation was not).
    # ensure_ascii=False keeps non-ASCII label text as-is, matching the previous output.
    response = json.dumps({"class": row['class'], "subclass": row['subclass']}, ensure_ascii=False)
    dataset_json.append({"messages": [{"role": "user", "content": row['ticket']},{"role": 'assistant',"content": response}]})
  return dataset_json

# Build chat-format examples for each split of the filtered data.
# Note: test_data is built here, but only the train and validation sets are written to JSONL in the cells shown.
train_data = gpt35_data_format(train_df)
val_data = gpt35_data_format(val_df)
test_data = gpt35_data_format(test_df)


In [59]:
import json
def json_new_line(data, file_name):
  """Write every item of `data` to `file_name` as JSONL.

  The file is (re)created, and each item is serialised as one JSON
  document followed by a newline.
  """
  with open(file_name, 'w') as jsonl_out:
    for record in data:
      json.dump(record, jsonl_out)
      jsonl_out.write('\n')

# Same relative output paths as in the first method, so those files are overwritten here.
train_file = 'train.jsonl'
val_file = 'val.jsonl'

# Write the training and validation examples as JSONL.
json_new_line(train_data, train_file)
json_new_line(val_data, val_file)

In [None]:
import os
from getpass import getpass

from openai import OpenAI

# Take the key from the OPENAI_API_KEY environment variable, or prompt for it,
# so that no secret is ever typed into (and saved with) the notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: "))  # generate API key from OpenAI website

# Upload inside `with` blocks so each file handle is closed as soon as its upload returns.
with open(train_file, "rb") as train_handle:
  training_file = client.files.create(
    file=train_handle,
    purpose="fine-tune"
  )

with open(val_file, "rb") as val_handle:
  validation_file = client.files.create(
    file=val_handle,
    purpose="fine-tune"
  )

# The file IDs are what the fine-tuning job is created from.
print("Training File ID: ", training_file.id)
print("Validation File ID: ", validation_file.id)

In [None]:
# Suffix passed to the fine-tuning job below.
suffix_name = 'ft-gpt3.5-method2'

# Start a fine-tuning job on gpt-3.5-turbo from the uploaded training and validation files.
response = client.fine_tuning.jobs.create(
  training_file=training_file.id,
  validation_file=validation_file.id,
  model="gpt-3.5-turbo",
  suffix=suffix_name
)
# Display the job object returned by the API.
response

In [24]:
# client.fine_tuning.jobs.list(limit=5)  # to see the last 5 jobs details

In [None]:
# Fetch a fine-tuning job by a hard-coded ID and display it.
# NOTE(review): this ID looks specific to one earlier run; replace it with the ID of your own job before re-running.
response = client.fine_tuning.jobs.retrieve("ftjob-VyOYodMzg8dmf5zniVfTvpIJ")  # copy the fine-tuning job id
response

In [84]:
fine_tuned_model_id = response.fine_tuned_model

def format_test(row):
    """Wrap the row's 'ticket' text as a single user message in chat format."""
    return [{"role": "user", "content": row['ticket']}]

def predict(test_messages, fine_tuned_model_id):
    """Send `test_messages` to the given model and return the first choice's text.

    Uses the module-level `client`, with temperature 0 and at most 50 output tokens.
    """
    completion = client.chat.completions.create(
        model=fine_tuned_model_id,
        messages=test_messages,
        temperature=0,
        max_tokens=50,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def store_predictions(test_df, fine_tuned_model_id):
    """Predict every ticket in `test_df` and save the result to predictions.csv.

    A 'Prediction' column is added to `test_df` in place and filled row by row;
    the frame is written to "predictions.csv" after the last row.
    """
    test_df['Prediction'] = None
    for row_label, ticket_row in test_df.iterrows():
        test_df.at[row_label, 'Prediction'] = predict(format_test(ticket_row), fine_tuned_model_id)

    test_df.to_csv("predictions.csv")

store_predictions(test_df, fine_tuned_model_id)

In [73]:
# Reload the saved predictions and preview them.
# NOTE(review): store_predictions writes the relative path "predictions.csv", while this reads
# '/content/predictions.csv' — the two only match when the working directory is /content.
preds = pd.read_csv('/content/predictions.csv')
preds.head()

Unnamed: 0.1,Unnamed: 0,ticket,subclass,class,class id,Prediction
0,99,Paypal has closed my account down & permanentl...,Managing an account,bank_account,1,"{""class"": ""bank_account"", ""subclass"": ""Managin..."
1,31,I have chase premier checking account. When i ...,Managing an account,bank_account,1,"{""class"": ""bank_account"", ""subclass"": ""Managin..."
2,11,I recently filed several transactions with PNC...,Using a debit or ATM card,bank_account,1,"{""class"": ""bank_account"", ""subclass"": ""Managin..."
3,95,I was a customer of a local bank named PNC and...,Managing an account,bank_account,1,"{""class"": ""bank_account"", ""subclass"": ""Managin..."
4,201,"""I consolidated my loans several years ago for...",Student loan,loan,3,"{""class"": ""loan"", ""subclass"": ""Student loan""}"


In [74]:
import ast
import json


def _parse_prediction(raw):
    """Return the model reply as a dict, or None when it cannot be parsed.

    The model is trained to answer with a JSON object, so JSON is tried first;
    ast.literal_eval is kept as a fallback for replies that are a Python-style
    dict literal. Missing values (e.g. NaN from the CSV) and truncated replies
    yield None.
    """
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        try:
            parsed = ast.literal_eval(raw)
        except (TypeError, ValueError, SyntaxError):
            return None
    return parsed if isinstance(parsed, dict) else None


correct_class, correct_subclass = 0, 0
for i, row in preds.iterrows():
    # Parse once per row; an unparseable reply counts as wrong for both labels
    # instead of aborting the whole evaluation.
    predicted = _parse_prediction(row['Prediction'])
    if predicted is None:
        continue
    if row['class'] == predicted.get('class'):
        correct_class += 1
    if row['subclass'] == predicted.get('subclass'):
        correct_subclass += 1

# Accuracy as a whole-number percentage of all rows in `preds`.
print("Accuracy of Class: ", round(100 * correct_class/len(preds)))
print("Accuracy of Subclass: ", round(100 * correct_subclass/len(preds)))

Accuracy of Class:  93
Accuracy of Subclass:  80


# UI

In [None]:
!pip install Gradio

In [None]:
# NOTE(review): this pins openai to 0.28, but the very next cell upgrades openai again —
# confirm whether this pin is still needed.
!pip install openai==0.28

In [None]:
!pip install openai --upgrade

In [103]:
# testing
import ast
# Sample complaint used as a one-off smoke test of the fine-tuned model.
user_prompt = "There have been unauthorized transactions on my debit card, and I'm worried about the security of my account. I haven't shared my card details, so I'm not sure how these transactions occurred. Can you please investigate and help me secure my account?"
formatted_message = [{"role": "user", "content": user_prompt}]
# Same request settings as predict(): temperature 0, at most 50 output tokens.
response = client.chat.completions.create(
    model=fine_tuned_model_id, messages=formatted_message, temperature=0, max_tokens=50
)
# Parse the reply text into a dict; this raises if the reply is not a valid literal.
myd = ast.literal_eval(response.choices[0].message.content)

print('class: \t ', myd['class'], '\nsubclass:', myd['subclass'])

class: 	  bank_account 
subclass: Managing an account


In [None]:
import gradio as gr
import ast

def generate_completion(user_prompt):
    """Classify one complaint with the fine-tuned model.

    Args:
        user_prompt: complaint text typed into the UI.

    Returns:
        A two-item list ``[class, subclass]``, one value per output textbox.
        If the model reply cannot be parsed into a dict, the list holds an
        error message and the raw reply instead, so the UI shows what went
        wrong rather than failing with an exception.
    """
    import json  # local import: this cell only imports gradio and ast

    formatted_message = [{"role": "user", "content": user_prompt}]
    response = client.chat.completions.create(
        model=fine_tuned_model_id, messages=formatted_message, temperature=0, max_tokens=50
    )
    raw_reply = response.choices[0].message.content

    # The model is trained to answer with JSON, so parse it as JSON first and
    # fall back to literal_eval for a Python-style dict literal.
    try:
        myd = json.loads(raw_reply)
    except (TypeError, ValueError):
        try:
            myd = ast.literal_eval(raw_reply)
        except (TypeError, ValueError, SyntaxError):
            myd = None

    if not isinstance(myd, dict):
        # Empty, truncated or otherwise malformed reply.
        return ["Could not parse model output", str(raw_reply)]
    return [myd.get('class', ''), myd.get('subclass', '')]

# One input textbox for the complaint and two output textboxes, filled in order
# from the two-item list returned by generate_completion.
iface = gr.Interface(fn=generate_completion,
                     inputs=gr.Textbox(label="Enter Customer Complaint"),
                     outputs=[gr.Textbox(label="Predicted Class Category"),gr.Textbox(label="Predicted Subclass Category")],
                     title="Bank Customer Complaint Ticket Classifier")

# NOTE(review): share=True presumably exposes the app through a public link, and every
# request is billed to the configured API key — confirm this is intended.
iface.launch(share=True)