# AVG Model

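In this notebook, we take a first look at combining previously trained models: the class probabilities predicted by the title model (BL0) and the title + description model (BL1) are averaged to obtain a single ranked prediction.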


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
import time
from model.text_normalizer import normalize_corpus, stopword_list
from model import evaluation
from model.utils import decoder
from scripts.build_df import build_df
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scripts import tree_utils
from sklearn.metrics import top_k_accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

%load_ext autoreload
%autoreload 2

  from pandas import MultiIndex, Int64Index
[nltk_data] Downloading package stopwords to /home/app/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 0. Import Pre-Trained Models (BL0 and BL1)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from model.text_normalizer import normalize_corpus, stopword_list

In [3]:
# Load the pre-trained baseline models together with their vectorizers.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
MODEL_DIR = '/home/app/src/model'  # single place to change the artifact location

# BL0: model and vectorizer applied below to the product title.
model_title = joblib.load(f'{MODEL_DIR}/model_BL0')
vect_title = joblib.load(f'{MODEL_DIR}/vect_BL0')

# BL1: model and vectorizer applied below to the title + description text.
model_title_desc = joblib.load(f'{MODEL_DIR}/model_BL1')
vect_title_desc = joblib.load(f'{MODEL_DIR}/vect_BL1')


In [4]:
def normalization(input):
    """Run ``normalize_corpus`` on ``input`` with a fixed set of options.

    The wrapper switches on every ``normalize_corpus`` flag listed below
    except ``text_lemmatization`` and ``remove_digits``, and passes the
    module-level ``stopword_list`` as the stop-word vocabulary.

    Parameters
    ----------
    input : corpus forwarded unchanged as the first positional argument of
        ``normalize_corpus``.

    Returns
    -------
    Whatever ``normalize_corpus`` returns for that corpus.
    """
    # Keyword options collected in one place so the preprocessing recipe is
    # readable at a glance.
    options = dict(
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list,
    )
    return normalize_corpus(input, **options)

In [5]:
# Hand-picked product used to smoke-test the combined model.
true_label_sample = 'Keyboards'
name_sample = "Casio - Portable Keyboard with 61 Touch-Sensitive Keys - Black/Silver "
descr_sample = "CASIO Portable Keyboard with 61 Touch-Sensitive Keys: MIDI and USB connectivity; 600 AHL keyboard voices; 180 rhythms; 152 songs; auto accompaniment"
# No separator is inserted: the trailing space of ``name_sample`` is what
# keeps the title and the description apart.
name_descr_sample = "".join([name_sample, descr_sample])

In [6]:
# Normalize the sample title, then vectorize it with the BL0 (title) vectorizer.
# NOTE(review): normalize_corpus is called here with its default options rather
# than through the normalization() helper defined earlier in this notebook --
# confirm the defaults match the preprocessing used when BL0 was trained.
name_sample_n = normalize_corpus([name_sample])
name_sample_v= vect_title.transform(name_sample_n)

In [7]:
# Normalize the concatenated title + description, then vectorize it with the
# BL1 (title + description) vectorizer.
# NOTE(review): normalize_corpus is called here with its default options rather
# than through the normalization() helper defined earlier in this notebook --
# confirm the defaults match the preprocessing used when BL1 was trained.
name_descr_sample_n = normalize_corpus([name_descr_sample])
name_descr_sample_v= vect_title_desc.transform(name_descr_sample_n)

## 1. Create combined model to get predictions

In [8]:
class Combined_Model():
    """Ensemble that averages the class probabilities of pre-trained models.

    The class keeps no state: the estimators and their (already vectorized)
    inputs are supplied on every call.
    """

    def predict_proba(self, X_list, estimators):
        """
        Average the class probabilities predicted by each estimator.

        Parameters
        ----------
        X_list : list of array_like or sparse matrix of shape (n_samples, n_features)
            One input per estimator, in the same order as ``estimators``,
            e.g. [X_name, X_description, X_image]. Entries beyond
            ``len(estimators)`` are ignored.
        estimators : list
            Pre-trained models exposing ``predict_proba``, e.g.
            [name_model, name_description_model, image_model]. Any positive
            number of estimators is accepted.

        Returns
        -------
        probs : ndarray
            Element-wise mean of the estimators' ``predict_proba`` outputs;
            it has the same shape as each individual output.

        Raises
        ------
        ValueError
            If ``estimators`` is empty, or if the estimators return
            probability arrays of different shapes.
        IndexError
            If ``X_list`` has fewer entries than ``estimators``.
        """
        n_estimators = len(estimators)
        if n_estimators == 0:
            raise ValueError("At least one estimator is required.")

        # Estimators are queried in order, each with the input at its own position.
        per_model = [
            np.asarray(estimator.predict_proba(X_list[position]))
            for position, estimator in enumerate(estimators)
        ]

        # Averaging only makes sense when every model scores the same samples
        # over the same number of classes; fail loudly instead of silently
        # truncating or broadcasting.
        expected_shape = per_model[0].shape
        for position, proba in enumerate(per_model[1:], start=1):
            if proba.shape != expected_shape:
                raise ValueError(
                    "Estimator %d returned probabilities of shape %s, expected %s."
                    % (position, proba.shape, expected_shape)
                )

        # Sum left to right, then scale by 1/n.
        total = per_model[0]
        for proba in per_model[1:]:
            total = total + proba
        return total * (1 / n_estimators)

    def predict_best_five(self, X_list, estimators, max_k_feat):
        """
        Return the ``max_k_feat`` most probable classes for the first sample.

        The probabilities come from ``predict_proba``; only row 0 of the
        result is ranked, so any additional samples in ``X_list`` are ignored.
        Despite the method name, the number of classes returned is controlled
        by ``max_k_feat``.

        Parameters
        ----------
        X_list : list of array_like or sparse matrix of shape (n_samples, n_features)
            [X_name, X_description, X_image], forwarded to ``predict_proba``.
        estimators : list
            Pre-trained models to combine, in the order
            [name_model, name_description_model, image_model]. Class labels
            are read from ``estimators[0].classes_``.
            NOTE(review): this assumes every estimator orders its classes the
            same way as the first one -- confirm for the trained models.
        max_k_feat : int
            Number of top classes to return.

        Returns
        -------
        dict_max_feat : dict
            Maps the rank (0 = highest averaged probability) to
            ``np.array_str(decoder(label))`` for the class at that rank.
        """
        probs = self.predict_proba(X_list, estimators)
        classes = estimators[0].classes_

        # Negate so that argsort's ascending order yields descending probability.
        top_class_idx = np.argsort(-probs[0])[:max_k_feat]

        return {
            rank: np.array_str(decoder(classes[class_idx]))
            for rank, class_idx in enumerate(top_class_idx)
        }

In [9]:
# Combined_Model defines no __init__ and stores no attributes, so one instance
# can be reused for every prediction call.
final_model = Combined_Model()

In [10]:
# Rank the five most probable categories for the sample product by averaging
# the title model and the title + description model. The inputs and the
# estimators are listed in the same order.
prediction = final_model.predict_best_five(X_list=[name_sample_v, name_descr_sample_v], 
                                           estimators=[model_title, model_title_desc], 
                                           max_k_feat=5)

In [11]:
# Display the ranked predictions (rank -> category string) for the sample product.
prediction

{0: 'Keyboards',
 1: 'Computer Keyboards',
 2: 'Musical Instrument Accessories',
 3: 'other',
 4: 'iPad & Tablet Accessories'}