In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from question_classifier import QuestionClassifier
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import torch
import pandas as pd
import pickle
import os

class QuestionClassifierWrapper:
    """Route a question to the text collections it should be answered from.

    The wrapper chains three stages:

    1. classification - a main-category classifier (plus an optional
       per-category subcategory classifier) scores the question;
    2. retrieval - semantic search over the pre-computed embeddings of the
       selected categories (`retreive_text`);
    3. reranking - a cross-encoder scores each (question, passage) pair
       (`rerank`).
    """

    # If the best main-category probability is below this value the classifier
    # is considered unsure and every main category is searched.
    MIN_CONFIDENCE = 0.5
    # Labels whose probability is within this margin of the best label are kept.
    PROBABILITY_MARGIN = 0.2

    def __init__(
            self,
            main_categorization_model_dir: str = "model",
            subcategorization_model_dir: str = "subcat_models/",
            device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
        ):
        """
        Load the dataset, the pickled embeddings and every model.

        Args:
            main_categorization_model_dir: directory of the main-category classifier.
            subcategorization_model_dir: directory holding one entry per
                subcategory classifier; each entry name is used as the lookup key.
            device: torch device the reranker model is moved to.
        """
        self.categorized_data = load_dataset("msaad02/categorized-data", split="train").to_pandas()

        # NOTE: unpickling can execute arbitrary code - only load an
        # embeddings.pickle that comes from a trusted source.
        with open("embeddings.pickle", "rb") as embeddings_file:
            embeddings = pickle.load(embeddings_file)
        self.data = embeddings['data']
        self.embeddings = embeddings['embeddings']

        self.main_classifier = QuestionClassifier(model_dir=main_categorization_model_dir)
        self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
        self.device = device
        self.rerank_tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-large')
        self.rerank_model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-large').to(self.device)
        self.rerank_model.eval()

        # os.path.join keeps this working whether or not the directory
        # argument ends with a path separator.
        self.subcategory_classifiers = {
            subcat: QuestionClassifier(os.path.join(subcategorization_model_dir, subcat))
            for subcat in os.listdir(subcategorization_model_dir)
        }

    def _predict(self, question: str, return_probabilities: bool = False):
        """
        Raw interface between the classifiers and the caller.

        Returns a dict that always has 'category' and, when a subcategory
        classifier is registered for that category, 'subcategory'. With
        `return_probabilities=True` it also has 'main_probs' and (when
        applicable) 'sub_probs', whose keys are '<category>-<subcategory>'.
        """
        prediction = {}
        if return_probabilities:
            prediction['category'], prediction['main_probs'] = self.main_classifier.predict(question, True)
        else:
            prediction['category'] = self.main_classifier.predict(question)

        category = prediction['category']
        if category in self.subcategory_classifiers:
            subcategory_classifier = self.subcategory_classifiers[category]

            if return_probabilities:
                prediction['subcategory'], sub_probs = subcategory_classifier.predict(question, True)
                prediction['sub_probs'] = {f'{category}-{subcat}': prob for subcat, prob in sub_probs.items()}
            else:
                prediction['subcategory'] = subcategory_classifier.predict(question)
        return prediction

    def _rank_by_probability(self, probs):
        "Return (label, probability) pairs ordered from most to least probable."
        pairs = [(label, float(prob)) for label, prob in probs.items()]
        return sorted(pairs, key=lambda pair: pair[1], reverse=True)

    def _within_margin_of_best(self, ranked):
        "Keep the labels of `ranked` whose probability is within PROBABILITY_MARGIN of the best one."
        if not ranked:
            return []
        cutoff = ranked[0][1] - self.PROBABILITY_MARGIN
        return [label for label, prob in ranked if prob > cutoff]

    def _get_text_retrieval_places(self, question: str):
        """
        High level interface between the classifiers and the caller: decides
        which collections to run text retrieval on, based on the probability
        output of the categorization models.

        When the best main-category probability is below MIN_CONFIDENCE every
        main category is used. Otherwise only the labels whose probability is
        within PROBABILITY_MARGIN of the best label are used (for both the main
        categories and, when available, the subcategories).

        Returns:
            dict: {
                'main_categories': [str],
                'subcategories': [str]
            }
        """
        prediction = self._predict(question, True)
        ranked_main = self._rank_by_probability(prediction['main_probs'])
        sub_probs = prediction.get('sub_probs')

        if not ranked_main:
            # Nothing was scored, so there is nothing to retrieve from.
            return {'main_categories': [], 'subcategories': []}

        if ranked_main[0][1] < self.MIN_CONFIDENCE:
            # Classifier is not confident enough: search everything.
            main_categories_to_use = [label for label, _ in ranked_main]
            # NOTE(review): these keys are the names the subcategory
            # classifiers are registered under, which `_predict` matches
            # against *main* category names - confirm that this is the
            # intended set of subcategory collections.
            subcategories_to_use = list(self.subcategory_classifiers) if sub_probs is not None else []
        else:
            main_categories_to_use = self._within_margin_of_best(ranked_main)
            subcategories_to_use = (
                self._within_margin_of_best(self._rank_by_probability(sub_probs))
                if sub_probs else []
            )

        return {
            'main_categories': main_categories_to_use,
            'subcategories': subcategories_to_use
        }

    def retreive_text(self, question: str, top_n: int = 10):
        """
        Semantic-search step of retrieval; its output is what `rerank` scores.

        Gathers the texts of every collection returned by
        `_get_text_retrieval_places` and returns the `top_n` most similar to
        the question (dot product with the question embedding).

        Returns:
            pandas.DataFrame with columns 'text' and 'similarity', ordered by
            descending similarity. It is empty when none of the selected
            collections has embeddings.
        """
        text_retrieval_places = self._get_text_retrieval_places(question)

        # De-duplicate the keys (order preserved) so that a collection listed
        # both as a main category and as a subcategory is not searched twice,
        # which would fill the top-n with duplicated passages.
        collection_keys = dict.fromkeys(
            [*text_retrieval_places['main_categories'], *text_retrieval_places['subcategories']]
        )

        text_embedding_for_question = []
        raw_text_for_question = []
        for key in collection_keys:
            if key in self.embeddings:
                text_embedding_for_question.extend(self.embeddings[key])
                raw_text_for_question.extend(self.data[key])

        if not text_embedding_for_question:
            # No candidates: the matrix product below would raise on an empty list.
            return pd.DataFrame(columns=['text', 'similarity'])

        question_embedding = self.embedding_model.encode(question, normalize_embeddings=True)

        similarity = text_embedding_for_question @ question_embedding.T
        # argsort is ascending, so reverse it to get the best matches first.
        top_args = similarity.argsort()[::-1][:top_n]

        return pd.DataFrame({
            'text': [raw_text_for_question[i] for i in top_args],
            'similarity': similarity[top_args],
        })

    def rerank(self, question: str, top_n: int = 10):
        """
        Performs reranking on the top n results of the text retrieval step.

        Returns:
            (scores, matches): `scores` is a 1-D float tensor with the raw
            reranker logit of each row of `matches` (higher is better);
            `matches` is the DataFrame returned by `retreive_text`. `scores`
            is empty when there are no matches.
        """
        matches = self.retreive_text(question, top_n)
        if matches.empty:
            # The tokenizer cannot encode an empty batch.
            return torch.empty(0, device=self.device), matches

        pairs = [[question, text] for text in matches['text'].to_list()]

        with torch.no_grad():
            inputs = self.rerank_tokenizer(
                pairs,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=512
            ).to(self.device)

            scores = self.rerank_model(**inputs, return_dict=True).logits.view(-1, ).float()

        return scores, matches

In [2]:
# Build the wrapper with its default model directories. The constructor loads
# the dataset, the pickled embeddings, the classifiers, the embedding model and
# the reranker.
classifier = QuestionClassifierWrapper()

2023-12-30 20:40:40.498379: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-30 20:40:40.520924: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
# Example query, and how many semantic-search candidates to hand to the reranker.
question = "tell me about the nursing program"
n_top_semantic_search_results = 50

In [12]:
# Inspect which main categories and subcategories are selected for the question.
classifier._get_text_retrieval_places(question)

{'main_categories': ['academics'], 'subcategories': ['academics-nursing']}

In [13]:
# Run semantic search for the top candidates and display the 10 most similar.
classifier.retreive_text(question, top_n=n_top_semantic_search_results).head(10)

Unnamed: 0,text,similarity
0,The following are criteria for eligibility for...,0.693463
1,My name is Dr. Kathleen Peterson and I’m a Pro...,0.692909
2,Questions regarding this program may be direct...,0.684233
3,"For new and prospective students, it is import...",0.68148
4,5 to keep their Direct Admit status in the nur...,0.679146
5,Reviewed 10/2/22. Our graduates will be poised...,0.67347
6,Questions related to this program may be direc...,0.673358
7,All Brockport students who intend to progress ...,0.671576
8,Students applying to and continuing in the Nur...,0.669394
9,The RN-BSN Fast Track completion program can b...,0.66893


In [14]:
# `rerank` is a (scores, matches) tuple: one reranker score per row of the
# semantic-search matches DataFrame.
rerank = classifier.rerank(question, top_n=n_top_semantic_search_results)
# Text of the match with the highest reranker score.
rerank[1].iloc[rerank[0].argmax().cpu().item()]['text']

'Please contact the nursing department administrative assistant and request an advisor as needed. Students are encouraged to meet with their advisor for mentoring and advisement regarding their nursing career as they progress through the program. Students may communicate with faculty by email or by office phone.'

In [7]:
# Row positions ordered by ascending reranker score (the best match is last).
rerank[0].argsort()

tensor([49, 16, 41, 37, 29,  9, 47, 27, 15, 11, 18, 39, 44, 35, 23, 30,  7, 31,
        22,  8, 34, 42, 20,  5, 48, 10, 26, 38, 46,  2, 25, 14, 12, 33, 28, 45,
        17, 13,  1, 32, 40,  3, 43,  4, 21, 36,  0,  6, 24, 19],
       device='cuda:0')

In [9]:
def get_answer(question: str, n_top_semantic_search_results: int = 50):
    """
    Return the single passage the reranker scores highest for `question`.

    Uses the notebook-level `classifier`: it retrieves
    `n_top_semantic_search_results` semantic-search candidates, reranks them
    and returns the text of the top-scoring one.
    """
    scores, candidates = classifier.rerank(question, top_n=n_top_semantic_search_results)
    best_position = scores.argmax().cpu().item()
    best_row = candidates.iloc[best_position]
    return best_row['text']

In [15]:
# Same query as before but capitalised, answered through the helper.
question = "Tell me about the nursing program"
get_answer(question)

'Please contact the nursing department administrative assistant and request an advisor as needed. Students are encouraged to meet with their advisor for mentoring and advisement regarding their nursing career as they progress through the program. Students may communicate with faculty by email or by office phone.'

In [17]:
# Unpack the earlier rerank result: `similarity` holds the reranker scores and
# `ss_sim` the semantic-search matches DataFrame.
similarity, ss_sim = rerank

In [21]:
# Copy the scores to host memory and view them as a NumPy array.
similarity.cpu().numpy()

array([-3.1427033, -1.5799706, -1.830896 , -2.0069695, -3.154744 ,
       -2.1335776, -2.5212982, -3.4353988, -3.3755329, -2.7681444,
       -2.6527863, -1.1456999, -3.7866366, -2.9875748, -3.3234618,
       -3.287079 , -2.4159267, -2.8408773, -8.181602 , -3.5338557,
       -3.6986136, -1.1045159, -2.0176663, -4.813224 , -3.2054005,
       -1.9416322, -2.8401613, -3.5818055, -4.064646 , -3.4296677,
       -1.5681468, -1.5681468, -2.5591252, -4.0109396, -4.246321 ,
       -3.2925527, -5.3028083, -4.131873 , -2.5948722, -3.4582655,
       -3.8869991, -2.8953946, -2.3300664, -3.914299 , -3.084533 ,
       -2.467959 , -2.467959 , -2.8770254, -3.7362592, -4.123514 ],
      dtype=float32)

In [22]:
# Attach the reranker scores (raw model logits) next to the semantic-search similarity.
ss_sim['rerank_similarity'] = similarity.cpu().numpy()

In [29]:
# Texts of the 10 matches with the highest reranker score, best first.
ss_sim.sort_values(by='rerank_similarity', ascending=False).head(10)['text'].to_list()

['Please contact the nursing department administrative assistant and request an advisor as needed. Students are encouraged to meet with their advisor for mentoring and advisement regarding their nursing career as they progress through the program. Students may communicate with faculty by email or by office phone.',
 'Letter to All Prospective Nursing Students for the Traditional Bachelor of Science in Nursing (BSN) Program. (Please note this does not apply to students accepted into the RN-BSN program or the Bachelor’s to BSN: Accelerated Degree Summer Start Program). Please read this letter carefully.',
 'Our 100% online Doctorate of Nursing Program is the first doctorate level nursing program in the SUNY comprehensive sector and serves as the most advanced degree in the field of nursing.',
 'Our 100% online Doctorate of Nursing Program is the first doctorate level nursing program in the SUNY comprehensive sector and serves as the most advanced degree in the field of nursing.',
 'My na