### Demonstrate the use of pretrained embedding to classfiy text data with well-described categories

In [54]:
from dotenv import load_dotenv
import openai
import os
from pathlib import Path  # Python 3.6+ only
import json

env_path = Path('../') / 'secrets.env'
load_dotenv(dotenv_path=env_path)
key = os.getenv("AZURE_OPENAI_API_KEY")
openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_type = "azure"
openai.api_version = "2023-05-15" 
openai.api_key= key
openai.api_base=openai_endpoint
engine = "gpt-35-turbo"

## Load data

In [42]:
import os
import json
import pandas as pd
folder_path ="data"
file_name = f"customer_support_transcripts.txt"
# with open(os.path.join(folder_path,file_name), 'w') as file:
#     file.write(json.dumps(contents))

loaded_content= json.load(open(os.path.join(folder_path,file_name)))
loaded_content
cat_sub = pd.unique([(content[1],content[2]) for content in  loaded_content])
categories = pd.unique([item[0] for item in cat_sub])
cat_sub = {item[1]:item[0] for item in cat_sub}
cat_sub


{'Payment plan options': 'Billing inquiries',
 'Bill dispute': 'Billing inquiries',
 'Late payment charges': 'Billing inquiries',
 'Subscription cancellation': 'Billing inquiries',
 'Account balance': 'Billing inquiries',
 'Auto-renewal': 'Billing inquiries',
 'Payment confirmation': 'Billing inquiries',
 'Refund': 'Billing inquiries',
 'Payment methods': 'Billing inquiries',
 'Invoice request': 'Billing inquiries',
 'Credit card update': 'Billing inquiries',
 'Plan upgrade/downgrade': 'Billing inquiries',
 'Change in billing information': 'Billing inquiries',
 'Billing cycle': 'Billing inquiries',
 'Payment history': 'Billing inquiries',
 'Automatic payments': 'Billing inquiries',
 'Failed payment': 'Billing inquiries',
 'Payment due date extension': 'Billing inquiries',
 'Prorated charges': 'Billing inquiries',
 'Charge details': 'Billing inquiries',
 'Installation issues': 'Product usability',
 'Connection problems': 'Mobile app technical issues',
 'Performance concerns': 'Technical

### Approach 1: using instruction prompt 

In [51]:
def classify_cat_gpt(content):
    user_message =f""" 
    <<content>>
    {content}
    <<content>>
    Given the above content, classify it into one of the following categories \n {categories}",\n 
    The category of the content is :
 
"""

    response = openai.ChatCompletion.create(
        engine=engine, # engine = "deployment_name".
        messages=[
            {"role": "system", "content": "You are a customer support data analyst"},
            {"role": "user", "content":user_message },
        ]
    )
    return response['choices'][0]['message']['content']
prediction_ds =[]

for content in loaded_content:
    n=0

    while (n<4):
        predicted_cat=""
        try:

            predicted_cat = classify_cat_gpt(content[0])
            assert predicted_cat in categories        
            break
        except:
            n+=1
    prediction_ds.append((content[1],predicted_cat))


In [52]:
scores = [item[0]==item[1] for item in prediction_ds]
accuracy = sum(scores)/len(scores)
print(accuracy)

0.5467532467532468


## Approach 2: using embedding

In [46]:
from openai.embeddings_utils import get_embedding, cosine_similarity


In [47]:
content_emb = [get_embedding(x[0], engine = 'text-embedding-ada-002') for x in loaded_content]


#### Prepare the category description : embedding map

In [48]:
#get the category descriptions (unlike normal classification, the more detail, more clear, more distinctive description the better)
# produce the embedding for each category description 
import numpy as np
cat_emb = {cat:get_embedding(cat, engine = 'text-embedding-ada-002')for cat in categories}

In [49]:
import numpy as np  
  
import numpy as np  
  
def classify_cat_emb(input_vector):  
    """  
    Given an input vector and a dictionary of label vectors,  
    returns the label with the highest cosine similarity to the input vector.  
    """  
    max_similarity = float('-inf')  
    max_label = None  
      
    # Compute cosine similarity between input vector and each label vector  
    for label, vector in cat_emb.items():  
        cosine_similarity = np.dot(input_vector, vector) / (np.linalg.norm(input_vector) * np.linalg.norm(vector))  
          
        # Update max_similarity and max_label if necessary  
        if cosine_similarity > max_similarity:  
            max_similarity = cosine_similarity  
            max_label = label  
      
    return max_label  


### Prediction or matching

In [50]:
predictions = [classify_cat_emb(vec) for vec in content_emb]
labels =[content[1] for content in loaded_content]
predictions = [item[0]==item[1] for  item in zip(predictions, labels )]
accuracy = sum(predictions)/len(predictions)
accuracy

0.4909090909090909