In [None]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer

from evaluate import load

from chroma_db import ChromaDB
from utils import split_into_sentences

In [None]:
# Load the raw tweets and discard exact duplicate rows, keeping the first occurrence of each.
df = pd.read_csv("../assets/cyberbullying_tweets.csv").drop_duplicates(keep='first')
df.head()

In [None]:
# Add a "sentences" column by applying the project helper split_into_sentences to each tweet's text.
# NOTE(review): presumably each value is a list of sentence strings — confirm in utils.
df["sentences"] = df["tweet_text"].apply(split_into_sentences)

In [None]:
# Hold out 15% of the rows as the test split; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.15, random_state=42)

## ChromaDB

In [None]:
# Instantiate the project's ChromaDB wrapper and fetch the tweet collection that get_documents queries.
# NOTE(review): the second argument ("cosine") is presumably the distance metric — confirm in chroma_db.
client = ChromaDB()
collection = client.get_collection("tweets_collection_v2", "cosine")

In [None]:
# Sentence-embedding model used by get_documents to encode search queries.
model_emb = SentenceTransformer('BAAI/bge-large-en-v1.5')

In [None]:
# Remove the column-width cap so long tweet text is not truncated,
# then display the test-split row whose index label is 19035 (empty frame if absent).
pd.set_option("display.max_colwidth", None)
df_test.loc[df_test.index == 19035]

In [None]:
# Hand-written evaluation triples: (location, question, expected answer).
# NOTE(review): the location tuple looks like (DataFrame row index, sentence index) — confirm.
# NOTE(review): this list is not referenced by the later cells visible here, and its first
# expected answer ("actively massacring ...") differs from the first entry of `answers`.
questions_array = [
    ((17920, 0), "What the OLF Qarro group is doing?", "actively massacring ethnic christians"),
    ((19137, 0), "Who does Blumenthal love?", "Palestinian terrorists"),
    ((5755, 0), "Who has several chlorine production plants?", "Daesh"),
    ((17311, 2), "Who strongly believes homosexuality is a sin?", "Brits"),
    ((19035, 0), "Which country sells their sons to terrorists?", "Pakistan")
]

In [None]:
# Evaluation set written as (question, reference answer) pairs.
# Transposing the pairs yields the two parallel lists consumed by the later cells.
questions, answers = (
    list(column)
    for column in zip(
        ("What the OLF Qarro group is doing?", "massacring ethnic christians"),
        ("Who does Blumenthal love?", "Palestinian terrorists"),
        ("Who has several chlorine production plants?", "Daesh"),
        ("Who strongly believes homosexuality is a sin?", "Brits"),
        ("Which country sells their sons to terrorists?", "Pakistan"),
    )
)

In [None]:
def get_documents(q, top_k=5, n_candidates=50):
    """Retrieve the passages most relevant to a question from the vector collection.

    The question is prefixed with the retrieval instruction, embedded with
    ``model_emb`` (normalized embeddings) and sent to ``collection.query``.

    Args:
        q: The question text.
        top_k: Number of leading documents to return. Defaults to 5.
        n_candidates: Value passed as the first positional argument of
            ``collection.query`` (presumably the number of results to fetch —
            confirm against the ChromaDB wrapper). Defaults to 50.

    Returns:
        A list with at most ``top_k`` documents, in the order the collection
        returned them.

    Raises:
        ValueError: If ``top_k`` is negative (a negative slice bound would
            silently drop documents from the end instead of limiting the count).
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    instruction = "Represent this sentence for searching relevant passages:"
    query_embedding = model_emb.encode(f"{instruction} {q}", normalize_embeddings=True)
    results = collection.query(n_candidates, query_embeddings=[query_embedding.tolist()])
    # A single query embedding was sent, so only the first result list is relevant.
    return results["documents"][0][:top_k]

## Chat

In [None]:
# Extractive question-answering pipeline; the same checkpoint name supplies both model and tokenizer.
model_name = "distilbert-base-cased-distilled-squad"
qa_model = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [None]:
# BERTScore metric loaded through `evaluate`; used below to compare predicted and reference answers.
bertscore = load("bertscore")

In [None]:
# Retrieve the supporting passages for every evaluation question, preserving question order.
results = list(map(get_documents, questions))

In [None]:
# Run the QA pipeline on each question and score its answer against the reference with BERTScore.
bs_all = []
for index, (q, a) in enumerate(zip(questions, answers)):
    print()
    # The passages retrieved for this question are merged into a single context string.
    QA_input = {
        'question': q,
        'context': ' '.join(results[index]),
    }
    res = qa_model(QA_input)
    bs = bertscore.compute(
        predictions=[res['answer']],
        references=[a],
        lang="en",
    )
    bs_all.append(bs)

    print(
        f'Question: {q}\n'
        f'Answer: {res["answer"]}\n'
        f'User answer: {a}\n'
        f'Score: {bs["f1"][0]}\n '
    )

In [None]:
# Display the last QA input built by the evaluation loop (inspection/debugging aid).
QA_input

In [None]:
def chat_msg(question, history):
    """Answer a chat message with the retrieval + extractive-QA pipeline.

    Args:
        question: The user's message, used both as the retrieval query and as
            the QA question.
        history: Previous chat turns supplied by the chat UI; not used here.

    Returns:
        The ``'answer'`` field of the QA pipeline's result.
    """
    # Retrieved passages are concatenated into one context string for the QA model.
    context = " ".join(get_documents(question))

    with torch.inference_mode():
        prediction = qa_model({"question": question, "context": context})
    return prediction['answer']

In [None]:
import gradio as gr

# Chat UI over the retrieval + extractive-QA pipeline: every message is answered by chat_msg.
# The example prompts are the evaluation questions, so clicking one exercises the real QA path,
# and the title describes what the bot does (it answers questions; it does not echo input).
demo = gr.ChatInterface(fn=chat_msg, examples=questions, title="Tweet Question Answering Bot")
demo.launch()