### Demonstrate the use of pretrained embeddings to classify text data with well-described categories

In [None]:
import pandas as pd
import re
import openai
import re
import requests
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity
from transformers import GPT2TokenizerFast
import tiktoken

# Azure OpenAI credentials. Fill these in before running the notebook; avoid
# committing real values.
API_KEY = ""
RESOURCE_ENDPOINT = ""

# Point the openai client at the Azure resource.
openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"

# Connectivity check against the resource's deployments endpoint. The
# api-version is taken from openai.api_version so the two cannot drift apart.
url = openai.api_base + "/openai/deployments?api-version=" + openai.api_version

# A timeout keeps a wrong or unreachable endpoint from hanging the notebook.
r = requests.get(url, headers={"api-key": API_KEY}, timeout=30)
print(r)



In [None]:
# Normalize text data util function
def normalize_text(s, sep_token = " \n "):
    """Clean up whitespace and stray punctuation in a piece of free text.

    Args:
        s: The text to normalize.
        sep_token: Unused; kept so existing callers that pass it keep working.

    Returns:
        The text with every whitespace run (including newlines) collapsed to a
        single space, literal ". ," sequences removed, ".." and ". ." each
        reduced to ".", and leading/trailing whitespace stripped.
    """
    # Collapse all whitespace runs, newlines included, to a single space.
    s = re.sub(r"\s+", " ", s).strip()
    # Drop the literal ". ," artifact. The dot is escaped: unescaped it matches
    # any character, so ordinary text such as "hello ,world" lost its "o".
    s = re.sub(r"\. ,", "", s)
    # Reduce doubled periods (one left-to-right pass each, as before).
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    # No separate newline removal is needed: the whitespace collapse above
    # already replaced every newline with a space.
    return s.strip()



## Data Loading

In [None]:

# Load the prepared dataset from a relative path.
# Later cells read its "Consumer complaint narrative" and "Issue" columns.
df = pd.read_parquet("../../data/final_df.parquet")
df.head()  # preview the first rows

In [None]:
# Run the text normalizer over every complaint narrative, overwriting the column.
df["Consumer complaint narrative"] = df["Consumer complaint narrative"].apply(normalize_text)




In [None]:
# Optional: drop narratives that are too long. Rows are kept only when they
# encode to fewer than 8192 cl100k_base tokens.
tokenizer = tiktoken.get_encoding("cl100k_base")
df['n_tokens'] = df["Consumer complaint narrative"].apply(lambda x: len(tokenizer.encode(x)))
# .copy() makes the filtered frame independent of the original, so columns
# added to df in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df.n_tokens<8192].copy()
len(df)

In [None]:
# Embed each narrative with the 'text-embedding-ada-002' engine; get_embedding is called once per row.
df['embedding'] = df["Consumer complaint narrative"].apply(lambda x : get_embedding(x, engine = 'text-embedding-ada-002'))


#### Prepare the map from category description to embedding

In [None]:
# Collect the category descriptions. Unlike conventional classification, the
# label texts themselves are embedded, so the more detailed, clear and
# distinctive each description is, the better.
import numpy as np
# The sorted unique values of the Issue column serve as the category labels.
issues = np.unique(df.Issue)
# Map each category label to the embedding of its own text.
issue_emb = {issue:get_embedding(issue, engine = 'text-embedding-ada-002')for issue in issues}

In [None]:
# Display the embedding stored for one category as a sanity check.
issue_emb['APR or interest rate']

In [None]:
import numpy as np  
  
import numpy as np  
  
def categorize(input_vector, label_vectors=None):
    """Return the label whose vector is most cosine-similar to the input.

    Args:
        input_vector: Embedding vector to classify.
        label_vectors: Optional mapping of label -> embedding vector. When
            omitted, the module-level ``issue_emb`` mapping is used, which
            keeps existing ``df["embedding"].apply(categorize)`` calls working.

    Returns:
        The best-matching label, or None when there are no labels to compare
        against. On ties the label that comes first in the mapping wins.
    """
    if label_vectors is None:
        label_vectors = issue_emb

    # The input norm does not change between labels, so compute it once.
    input_norm = np.linalg.norm(input_vector)

    best_score = float('-inf')
    best_label = None
    for label, vector in label_vectors.items():
        # Named `score` so the cosine_similarity helper imported by this file
        # is not shadowed inside the function.
        score = np.dot(input_vector, vector) / (input_norm * np.linalg.norm(vector))
        if score > best_score:
            best_score = score
            best_label = label

    return best_label


### Prediction or matching

In [None]:
# Label every row with the category whose embedding has the highest cosine similarity to the row's embedding.
df["prediction"] = df["embedding"].apply(categorize)

In [None]:
# Accuracy: fraction of rows whose predicted category equals the labelled Issue.
# The mean of the boolean match mask yields a single number; the previous
# df[mask].count() / df.shape[0] produced one ratio per column and
# under-counted any column containing missing values.
(df["prediction"] == df["Issue"]).mean()

In [None]:
# Show narratives, true labels and predictions side by side for the first rows.
df[['Consumer complaint narrative','Issue','prediction']].head()

### Approach using ChatGPT and a prompt

In [None]:
#Note: The openai-python library support for Azure OpenAI is in preview.
import os
import openai
import time
# Switch the client to the preview API version before the chat-completion calls.
openai.api_version = "2023-03-15-preview"
# NOTE(review): classify() builds its own local user_message, so this module-level value is not read by it.
user_message = ""
def classify(text, max_retries=10, retry_delay=3):
    """Ask the chat model to pick a category for ``text``.

    The prompt lists the module-level ``issues`` and asks the model to classify
    the input into one of them. Failed requests are retried.

    Args:
        text: The input text to classify.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait after each failed attempt.

    Returns:
        The content of the first choice's message, or None if every attempt
        failed.
    """
    user_message =f""" 
 classify the following input into one of the following category {issues}
 <<input>>
 {text}
 <<input>>
 The category of the input is:
"""
    for attempt in range(1, max_retries + 1):
        try:
            response = openai.ChatCompletion.create(
                engine="gpt-35-turbo", # engine = "deployment_name".
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content":user_message },
                ]
            )
            return response['choices'][0]['message']['content']
        except Exception as err:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop a long-running loop,
            # and report the failure instead of hiding it.
            print(f"classify attempt {attempt}/{max_retries} failed: {err!r}")
            time.sleep(retry_delay)

    # Best effort: callers receive None when no attempt succeeded.
    return None
            


In [None]:
# Classify every complaint narrative with the chat model, one request at a time.
complaints = df['Consumer complaint narrative'].values
# Results are appended in row order; classify() gives None for a row whose attempts all failed.
chatgpt_predictions =[]
for complaint in complaints:
    result = classify(complaint)
    chatgpt_predictions.append(result)

In [None]:
# Store the chat-model answers as a new column alongside the existing labels.
df['chatgpt_predictions'] =chatgpt_predictions

In [None]:
# Accuracy of the chat-model approach: fraction of rows whose answer equals the
# labelled Issue exactly. The mean of the boolean match mask gives a single
# number; df[mask].count() / df.shape[0] gave one ratio per column and
# under-counted columns with missing values.
(df["chatgpt_predictions"] == df["Issue"]).mean()