# Sentiment analysis

In [20]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

import json

# Load the data from JSON file
with open("sentiment_analysis.json", "r") as f:
    data = json.load(f)

# Define the training data
train_data = []
sub_labels = {}
for top_level_label, sub_level_labels in data.items():
    for sub_level_label_data in sub_level_labels:
        text = sub_level_label_data["text"]
        sub_level_label = sub_level_label_data["sub_level_label"]
        train_data.append((text, top_level_label, sub_level_label))
        if top_level_label not in sub_labels:
            sub_labels[top_level_label] = []
        sub_labels[top_level_label].append(sub_level_label)

# Define the mapping between top-level labels and integers
top_level_label_map = {label: i for i, label in enumerate(set([data[1] for data in train_data]))}

# Define the mapping between sub-level labels and integers
sub_level_label_map = {sub_label: i for i, sub_label in enumerate(set([sub_label for sub_labels_list in sub_labels.values() for sub_label in sub_labels_list]))}

# Convert the training data labels to integers using the label_map and sub_label_map
# A tensor is a multi-dimensional array that looks like a numpy array, it's used for neural networks
top_level_labels = torch.tensor([top_level_label_map[data[1]] for data in train_data])
sub_level_labels = torch.tensor([sub_level_label_map[sub_label] for data in train_data for sub_label in sub_labels[data[1]]])

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(top_level_label_map))
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the training data and convert to tensors
inputs = tokenizer.batch_encode_plus([data[0] for data in train_data], padding=True, truncation=True, return_tensors="pt")

# Fine-tune the model on the training data
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()


print(train_data)
print(sub_labels)


for epoch in range(10):
    optimizer.zero_grad()
    outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=top_level_labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    # Evaluate the model on the training data
    predictions = outputs.logits.argmax(axis=1)
    accuracy = (predictions == top_level_labels).sum()



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

[('Can you summarize the privacy policy for me?', 'privacy_policy', 'summarization'), ('What information does the privacy policy collect?', 'privacy_policy', 'question_answering'), ('How can I opt out of data sharing?', 'privacy_policy', 'question_answering'), ('Is my personal information shared with third parties?', 'privacy_policy', 'question_answering'), ('What information does the privacy policy collect?', 'privacy_policy', 'collection'), ('How does the company use my personal information?', 'privacy_policy', 'usage'), ('Is my personal information shared with third parties?', 'privacy_policy', 'sharing'), ('How is my personal information stored and secured?', 'privacy_policy', 'storage_security'), ('What are my rights regarding my personal information?', 'privacy_policy', 'rights'), ('How can I request access to my personal information?', 'privacy_policy', 'access'), ("Can I delete my personal information from the company's records?", 'privacy_policy', 'deletion'), ("What are the c

## Sentiment analysis query function

In [29]:
# Test the model

def predict_intent(text):
    inputs = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors="pt")
    outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])

    top_level_predicted_label = torch.argmax(outputs.logits).item()
    top_level_predicted_intent = [k for k, v in top_level_label_map.items() if v == top_level_predicted_label]

    sub_level_predicted_label = torch.argmax(outputs.logits).item()
    # sub_level_predicted_intents = [k for k, v in sub_level_label_map.items() if v in sub_level_predicted_labels]
    sub_level_predicted_intent = [k for k, v in sub_labels[top_level_predicted_intent].items() if v == sub_level_predicted_label]
    

    # top_level_inputs = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors="pt")
    # top_level_outputs = model(top_level_inputs["input_ids"], attention_mask=top_level_inputs["attention_mask"])
    # # top_level_predicted_labels = torch.argsort(top_level_outputs.logits, descending=True).tolist()[0]
    # top_level_predicted_label = torch.argmax(top_level_outputs.logits).item()
    # top_level_predicted_intent = [k for k, v in top_level_label_map.items() if v == top_level_predicted_label]
    
    # sub_level_inputs = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors="pt")
    # sub_level_outputs = model(sub_level_inputs["input_ids"], attention_mask=sub_level_inputs["attention_mask"])
    # sub_level_predicted_label = torch.argmax(sub_level_outputs.logits).item()
    # # sub_level_predicted_intents = [k for k, v in sub_level_label_map.items() if v in sub_level_predicted_labels]
    # sub_level_predicted_intent = [k for k, v in sub_level_label_map.items() if v == sub_level_predicted_label]
    
    print('top level:', top_level_predicted_label)
    print(top_level_label_map)
    print('sub level:', sub_level_predicted_label)
    print(sub_level_label_map)
    return top_level_predicted_intent[0] if top_level_predicted_intent else None, sub_level_predicted_intent[0] if sub_level_predicted_intent else None


top_level_intents, sub_level_intents = predict_intent("How do I reset my password?")
print(top_level_intents, sub_level_intents)
# top_level_intents, sub_level_intents = predict_intent("Can you summarize the liability section of the terms of service?")
# print(top_level_intents, sub_level_intents)


TypeError: unhashable type: 'list'

## GPT

In [22]:
# GPT model herel 
%load_ext autoreload
%autoreload 2

from gpt import GPT

gpt_model = GPT()

#gpt_model.answer_question(question='What is the most important thing I need to know about your privacy statement?')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# GPT query function here


In [24]:
import gradio as gr
import time
import random

# load the pre-trained intent analysis model
# nlp = spacy.load("en_trf_bertbaseuncased_lg")

response_map = {
    ("security", "security_relating_to"): ["Our security measures include...", "We take security very seriously and have implemented..."],
    ("security", "security_concerns"): ["We understand your security concerns and have taken steps to address them.", "You can trust that your information is safe with us."],
    ("information", "information_about"): ["Our store offers a variety of products, including...", "We also have a rewards program that allows you to earn points on your purchases."],
    ("information", "information_schedule"): ["We are open from 9am to 10pm, 7 days a week.", "Our business hours are 9am to 5pm, Monday to Friday."],
    ("help", "help_with_finding"): ["Here are some hotels near the airport:...", "I can help you find a hotel that meets your needs."],
    ("help", "help_with_booking"): ["You can book a room on our website or by calling our reservation hotline.", "We also offer a loyalty program that gives you discounts on future bookings."],
    ("information", "ordering"): ["You can place an order on our website or by calling our order hotline.", "We also offer a loyalty program that gives you discounts on future orders."]
}

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history):
        user_message = history[-1][0]

        intent = predict_intent(text=user_message)
        print('intent:', intent)
        # Random choice randomly chooses one of the options that matches the intent
     
        # generating a response with GPT if the main intent was 'privacy_policy' or 'legal_statement'
        use_gpt = intent[0] == 'privacy_policy' or intent[0] == 'legal_statement'

        response = gpt_model.answer_question(question=user_message) if use_gpt else 'No idea, bitch'

        # response = random.choice(response_map[intent])
        history[-1][1] = response
        # The sleep is to simulate a more natural conversation
        time.sleep(1)
        return history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()


Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


