<a href="https://colab.research.google.com/github/nanom/textMining2021/blob/main/final_project/testing_of_final_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initializations

##### Setup

In [None]:
# --- Install libs ---
!pip install -U sentence-transformers --quiet

# --- Import libs ---
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm.notebook import tqdm
from collections import Counter
import sys

import pickle
import torch
from google_drive_downloader import GoogleDriveDownloader as gdd
from sentence_transformers import SentenceTransformer

import seaborn as snb
import matplotlib.pyplot as plt

# --- Config ---
tqdm.pandas()

##### Functions

In [None]:
# --- Main Functions ----

def predictBatchCategories(model, batch_questions):
    """Predict a category for every question in ``batch_questions``.

    ``model`` is a ``(classifier, embedder)`` pair: the embedder's ``encode``
    turns the questions into embeddings and the classifier's ``predict`` maps
    those embeddings to categories. The value returned by ``predict`` is
    passed back unchanged.
    """
    classifier, embedder = model

    # Encode on the first CUDA device when one is visible, otherwise on CPU.
    if torch.cuda.is_available():
        target_device = 'cuda:0'
    else:
        target_device = 'cpu'

    embeddings = embedder.encode(
        batch_questions,
        show_progress_bar=True,
        device=target_device,
    )
    return classifier.predict(embeddings)

def predictQsCategory(model, question, rank=False):
    """Predict the category of a single question.

    ``model`` is a ``(classifier, embedder)`` pair. With ``rank=False``
    (the default) the classifier's prediction for the question is returned.
    With ``rank=True`` nothing is returned; instead the three most probable
    categories are printed, one per line, based on the classifier's
    ``predict_proba`` (so the classifier must provide that method).
    """
    classifier, embedder = model

    # The classifier is fed a single-row 2-D input, hence the reshape.
    features = embedder.encode(question).reshape(1, -1)

    if not rank:
        return classifier.predict(features)[0]

    class_probs = classifier.predict_proba(features)[0]
    scored = sorted(
        ((label, round(prob, 3)) for label, prob in zip(classifier.classes_, class_probs)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for position, (label, prob) in enumerate(scored[:3], start=1):
        print("\t{}] '{}' ({:.1f}%)".format(position, label, prob * 100))


# --- Load model and test samples ---

# Google Drive file ids of the pickled (classifier, embedding model) pairs,
# keyed by '<model>_<class version>' name.
_CLASSIFIER_DRIVE_IDS = {
    "m1_v1": "1--ntaHzKjmbaWDYqt_jPtZdz48moaw-T",
    "m1_v2": "1i7nI1arUOKTar3FrzK_LViLXx1yiMtCA",
    "m2_v1": "1-5_OM8AEkf6g1AI6CWstdXovB7tI2esX",
    "m2_v2": "1-2_ucfCIUm1xVC2wiMSVfMooj6b_2she",
    "m3_v1": "1-1-qmavRtB2gyTKS6Jbaf7q9b9_-TvI1",
    "m3_v2": "1-41pkyXCFcJo43J-BQhIHoqjDzy_wrGz",
    "m4_v1": "1-96FltzeyW__f55L4o1CxLmlL2Dch5Zl",
    "m4_v2": "1-BJJGKA1VTa2LQ43X4lbPysdSEWwPOq8",
}


def loadClassifierModel(model_name):
    """Download a trained model from Google Drive and return it.

    Args:
        model_name: One of 'm1_v1', 'm1_v2', 'm2_v1', 'm2_v2', 'm3_v1',
            'm3_v2', 'm4_v1', 'm4_v2'.

    Returns:
        The ``(classifier_model, embedding_model)`` pair stored in the
        downloaded pickle.

    Raises:
        SystemExit: If ``model_name`` is not one of the known names.
    """
    # Non-string names can never match (and may be unhashable), so treat them
    # as unknown instead of letting the dict lookup fail.
    file_id = _CLASSIFIER_DRIVE_IDS.get(model_name) if isinstance(model_name, str) else None
    if file_id is None:
        sys.exit("Error!. The model '{}' no exists.".format(model_name))

    # Only used to tell the user whether predictions will run on GPU or CPU.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    # Download the pickle, replacing any previously downloaded model.
    dest_path = "./data/classifier_model.pkl"
    gdd.download_file_from_google_drive(file_id=file_id, dest_path=dest_path, overwrite=True)

    # NOTE(security): pickle.load runs arbitrary code contained in the file;
    # this is acceptable only as long as the Drive files above are trusted.
    with open(dest_path, 'rb') as file:
        classifier_model, embedding_model = pickle.load(file)

    print("- The model has been loaded successfully...")
    # Join the class names explicitly: passing them as extra print() arguments
    # with sep=", " also put a stray ", " right after the label.
    print("- Predicted categories:", ", ".join(str(c) for c in classifier_model.classes_))

    if device == 'cpu':
        print("- Note: GPU not available. The predictions process could be very slow!")
    else:
        print("- Note: All predictions will be made using GPU!")

    return classifier_model, embedding_model

def loadVizWizSamples():
    """Download and unpickle the 500-sample VizWiz validation subset.

    The pickle is fetched from Google Drive into ``./data`` (overwriting any
    earlier download) and the unpickled object is returned; the notebook uses
    it as a DataFrame.
    """
    file_id = "1X7bRCRrrLCaUErz5AmU_sLQJcGN7p-I1"
    local_pkl = "./data/vizwiz_500_subsamples_val.pkl"

    # Fetch the pickled samples from Google Drive.
    gdd.download_file_from_google_drive(
        file_id=file_id,
        dest_path=local_pkl,
        overwrite=True,
    )

    # Deserialize the downloaded samples.
    with open(local_pkl, 'rb') as handle:
        samples = pickle.load(handle)

    print("VizWiz validation subset has been loaded successfully ... ")
    return samples

def loadVizWizFull():
    """Download and unpickle the full VizWiz sample set.

    The pickle is fetched from Google Drive into ``./data`` (overwriting any
    earlier download) and the unpickled object is returned; the notebook uses
    it as a DataFrame.
    """
    file_id = "1_pn9qP8k4ZgeM75PNyIUyPWT23zvALPK"
    local_pkl = "./data/vizwiz_full.pkl"

    # Fetch the pickled samples from Google Drive.
    gdd.download_file_from_google_drive(
        file_id=file_id,
        dest_path=local_pkl,
        overwrite=True,
    )

    # Deserialize the downloaded samples.
    with open(local_pkl, 'rb') as handle:
        samples = pickle.load(handle)

    print("VizWiz test and train samples has been loaded successfully ... ")
    return samples


# --- Helpers ---

def printDf(df, n_samples, category=None, answer_type=None):
    """Print a random sample of rows as a readable table.

    Args:
        df: DataFrame with 'answer_type', 'category' and 'question' columns.
        n_samples: Maximum number of rows to print; capped at the number of
            rows left after filtering.
        category: If given, keep only rows with this category.
        answer_type: If given, keep only rows with this answer type. The
            filter is applied after the category filter, so the value must
            occur among the rows that remain.

    Raises:
        SystemExit: If ``category`` or ``answer_type`` does not occur in the
            (already filtered) data.
    """
    if category:
        known_categories = df.category.unique()
        if category not in known_categories:
            sys.exit("Error!. Select one of these 'categories': {}".format(known_categories))
        df = df[df.category == category]

    if answer_type:
        known_answer_types = df.answer_type.unique()
        if answer_type not in known_answer_types:
            sys.exit("Error!. Select one of these 'answer_type': {}".format(known_answer_types))
        df = df[df.answer_type == answer_type]

    df = df[['answer_type', 'category', 'question']]
    n_samples = min(n_samples, len(df))
    # showindex takes a boolean; the previous string 'False' matched none of
    # tabulate's documented values and hid the index only by accident.
    print(tabulate(df.sample(n_samples), headers='keys', tablefmt='presto', showindex=False))

def plotCategoryDist(prediction_df, title=None):
    """Bar plot of question counts per answer type, split by predicted category.

    Args:
        prediction_df: DataFrame with 'answer_type', 'category' and
            'question' columns.
        title: Plot title. When omitted, a title stating the actual number of
            rows in ``prediction_df`` is used, so the label is correct for any
            dataset size (the previous fixed text always claimed 500 samples).
    """
    if title is None:
        title = "Categories distribution (over {} VizWiz samples)".format(len(prediction_df))

    # Number of questions for every (answer_type, category) pair.
    counts = prediction_df.groupby(['answer_type','category'])['question'].count().reset_index()

    fig, ax = plt.subplots(figsize=(10,5))
    g = snb.barplot(data=counts, x='answer_type', y='question', hue='category', ax=ax)
    g.set_xlabel("Answer type")
    g.set_ylabel("Freq")
    plt.xticks(rotation=60)
    plt.title(title)
    # Place the legend outside the axes so it does not cover the bars.
    plt.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.show()

def plotAnswertypeDist(prediction_df, title=None):
    """Bar plot of question counts per predicted category, split by answer type.

    Args:
        prediction_df: DataFrame with 'answer_type', 'category' and
            'question' columns.
        title: Plot title. When omitted, a title stating the actual number of
            rows in ``prediction_df`` is used, so the label is correct for any
            dataset size (the previous fixed text always claimed 500 samples).
    """
    if title is None:
        title = "Answer type distribution (over {} VizWiz samples)".format(len(prediction_df))

    # Number of questions for every (answer_type, category) pair.
    counts = prediction_df.groupby(['answer_type','category'])['question'].count().reset_index()

    fig, ax = plt.subplots(figsize=(10,5))
    g = snb.barplot(data=counts, x='category', y='question', hue='answer_type', ax=ax)
    g.set_xlabel("Category")
    g.set_ylabel("Freq")
    plt.xticks(rotation=60)
    plt.title(title)
    # Place the legend outside the axes so it does not cover the bars.
    plt.legend(title='Answer type', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.show()

def plotAnswerableDist(prediction_df, title=None):
    """Bar plot of question counts per answerable flag, split by predicted category.

    Args:
        prediction_df: DataFrame with 'answerable', 'category' and 'question'
            columns; an 'answerable' value equal to 1 is shown as 'Yes',
            anything else as 'No'.
        title: Plot title. When omitted, a title stating the actual number of
            rows in ``prediction_df`` is used, so the label is correct for any
            dataset size (the previous fixed text always claimed 500 samples).
    """
    if title is None:
        title = "Answerable questions distribution (over {} VizWiz samples)".format(len(prediction_df))

    # Number of questions for every (answerable, category) pair.
    counts = prediction_df.groupby(['answerable','category'])['question'].count().reset_index()
    # Vectorized relabeling: no progress bar for a handful of grouped rows and
    # no dependency on tqdm.pandas() having been called beforehand.
    counts['answerable'] = np.where(counts['answerable'] == 1, 'Yes', 'No')

    fig, ax = plt.subplots(figsize=(10,5))
    g = snb.barplot(data=counts, x='answerable', y='question', hue='category', ax=ax)
    g.set_xlabel("Answerable")
    g.set_ylabel("Freq")
    plt.xticks(rotation=60)
    plt.title(title)
    # Place the legend outside the axes so it does not cover the bars.
    plt.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.show()

def confusionMatrix(prediction_df, save=False):
    """Plot a heatmap of answer types (rows) against predicted categories (columns).

    Each cell is the share of an answer type's questions that received the
    given category, i.e. every row is normalized by its own total. With
    ``save=True`` the user is prompted for a file name and the figure is
    written to '<name>.pdf' before it is shown.
    """
    # How often each (answer_type, category) pair occurs.
    pair_counts = Counter(zip(prediction_df.answer_type.values, prediction_df.category.values))

    categories = prediction_df.category.unique()
    answer_types = prediction_df.answer_type.unique()
    categories.sort()
    answer_types.sort()

    # Dense count table; pairs that never occur stay at 0.0.
    counts = np.array(
        [[pair_counts.get((a_type, cat), 0.0) for cat in categories] for a_type in answer_types],
        dtype=float,
    ).reshape(len(answer_types), len(categories))

    shares = pd.DataFrame(counts, index=answer_types, columns=categories)
    row_totals = shares.sum(axis=1)
    shares = shares.div(row_totals, axis=0)

    fig, ax = plt.subplots(figsize=(15,5))
    heat = snb.heatmap(shares, annot=True, cmap='Reds', fmt='.1%', ax=ax)
    heat.set_xlabel("Categories",fontsize=20)
    heat.set_ylabel("Answer Type",fontsize=20)
    if save:
        out_name = input("Save image as: ") + ".pdf"
        plt.savefig(out_name, format='pdf', bbox_inches='tight')
    plt.show()

## Make predictions 
> #### List of trained models:

* M1 = **Embedding:** [*bert_base_uncased*](https://huggingface.co/bert-base-uncased); **Classifier:** [*Logistic Regression*](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)  
* M2 = **Embedding:** *bert_base_uncased*; **Classifier:** [*Linear SVC*](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html)
* M3 = **Embedding:** [*all-MiniLM-L6-v2*](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2); **Classifier:** *Logistic Regression*
* M4 = **Embedding:** *all-MiniLM-L6-v2*; **Classifier:** *Linear SVC*

> #### Class versions:

* v.1 (Initial) = **choice** -  **class** -  **color** -  **description** -  **explication** -  **main_obj** -  **ocr** -  **relative_obj** -  **yes_no**
* v.2 (Used in report) = **choice** - **color** - **explication** - **ident** - **observation** - **ocr** - **rel_ident** - **yes_no**


##### Load models

In [None]:
# Available models ('<model>_<class version>') and their reported accuracies:
#    'm1_v1' = Train Acc: 1.000000, Test Acc: 0.924912
#    'm1_v2' = Train Acc: 1.000000, Test Acc: 0.951235

#    'm2_v1' = Train Acc: 1.000000, Test Acc: 0.9240617
#    'm2_v2' = Train Acc: 1.000000, Test Acc: 0.984278

#    'm3_v1' = Train Acc: 0.915981, Test Acc: 0.86296
#    'm3_v2' = Train Acc: 0.925781, Test Acc: 0.914062

#    'm4_v1' = Train Acc: 0.957593, Test Acc: 0.857160
#    'm4_v2' = Train Acc: 0.978750, Test Acc: 0.927500

# --- Load model ---
# loadClassifierModel returns a (classifier, embedding model) pair, so despite
# its name `classifier_model` holds both; it is passed as the `model` argument
# of the predict helpers below.
model_name = "m2_v2"
classifier_model = loadClassifierModel(model_name)

##### Test using a single question

In [None]:
# Test prediction
# (Note: with rank=True the three most probable categories are printed and
#  nothing is returned; otherwise the single best category is returned.)
# NOTE(review): rank=True calls the classifier's predict_proba. M2/M4 are
# listed as Linear SVC, which scikit-learn does not give a predict_proba
# method -- confirm the pickled classifier supports it.
qs1 = "What's the date of this paper?"
print("Qs:",qs1)
predictQsCategory(classifier_model, qs1, rank=True)

##### Predict on 500 samples from the VizWiz validation subset

In [None]:
# Load the 500-sample VizWiz validation subset
vizwiz = loadVizWizSamples()
print("- '{}' samples loaded!".format(len(vizwiz)))

# Predict a category for each question and attach it as a new 'category' column
results = vizwiz.assign(category=predictBatchCategories(classifier_model, vizwiz.question.to_list()))

In [None]:
# View random samples
#   * n_samples: Maximum number of samples to view
#   * category: Name of a 'category' to filter by, None to see all.
#   * answer_type: Name of an 'answer_type' to filter by, None to see all.

printDf(results, n_samples=20, category=None, answer_type=None)

In [None]:
# Heatmap of 'answer_type' (rows) vs predicted 'category' (columns);
# each row is normalized by its own total
confusionMatrix(results)

In [None]:
# Bar plot: question counts per answer type, split (hue) by predicted category
plotCategoryDist(results)

In [None]:
# Bar plot: question counts per predicted category, split (hue) by answer type
plotAnswertypeDist(results)

In [None]:
# Bar plot: question counts per answerable flag (Yes/No), split (hue) by predicted category
plotAnswerableDist(results)

##### Predict on the full VizWiz dataset

In [None]:
# Load the full VizWiz dataset
vizwiz = loadVizWizFull()
print("- '{}' samples loaded!".format(len(vizwiz)))

# Predict a category for each question and attach it as a new 'category' column
results = vizwiz.assign(category=predictBatchCategories(classifier_model, vizwiz.question.to_list()))

In [None]:
# View random samples
#   * n_samples: Maximum number of samples to view
#   * category: Name of a 'category' to filter by, None to see all.
#   * answer_type: Name of an 'answer_type' to filter by, None to see all.

printDf(results,n_samples=20, category=None, answer_type=None)

In [None]:
# Heatmap of 'answer_type' (rows) vs predicted 'category' (columns);
# each row is normalized by its own total
confusionMatrix(results)

In [None]:
# Bar plot: question counts per answer type, split (hue) by predicted category
plotCategoryDist(results)

In [None]:
# Bar plot: question counts per predicted category, split (hue) by answer type
plotAnswertypeDist(results)

In [None]:
# Bar plot: question counts per answerable flag (Yes/No), split (hue) by predicted category
plotAnswerableDist(results)