In [1]:
import os
import pickle
import json
import timeit

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import TruncatedSVD

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

import mlflow

import torch
from transformers import RobertaModel, AutoModel, PreTrainedTokenizerFast

In [14]:
# Held-out test split, stored as two line-oriented text files under DATA_FOLDER:
# one with the raw texts and one with the integer tags.
DATA_FOLDER = "data/"

X_TEST_FILE_NAME = "test_set_only_text.txt"
Y_TEST_FILE_NAME = "test_set_only_tags.txt"

# Each line becomes one sample; surrounding whitespace (including the trailing
# newline) is stripped before the value is stored.
with open(os.path.join(DATA_FOLDER, X_TEST_FILE_NAME), "r") as text_file:
    X_test = np.array([line.strip() for line in text_file])

# Tags are parsed as ints; a blank or non-numeric line raises ValueError here.
with open(os.path.join(DATA_FOLDER, Y_TEST_FILE_NAME), "r") as tag_file:
    y_test = np.array([int(line.strip()) for line in tag_file])

In [22]:
# Local directory passed to AutoModel.from_pretrained; it is also expected to
# contain the tokenizer definition file "tokenizer.json".
model_dir = "roberta_base_transformers"

tokenizer_path = os.path.join(model_dir, "tokenizer.json")
TOKENIZER = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
ENCODER: RobertaModel = AutoModel.from_pretrained(model_dir)

# Classifier that consumes the encoder features.
# NOTE: pickle.load can execute arbitrary code, so only load artifacts you trust.
with open("best_models/transformer_top_best_model.pkl", "rb") as model_file:
    transformer_top_best_model = pickle.load(model_file)
    
class TransformerWrapper(object):
    """Expose a tokenizer -> encoder -> classifier stack through one ``predict``.

    This lets the stack be called on raw texts the same way as any other
    estimator that has a ``predict(X)`` method.
    """

    def __init__(self, tokenizer, encoder, model):
        """Store the three pipeline stages.

        Args:
            tokenizer: Object whose ``encode(text)`` returns a sequence of
                token ids for one text.
            encoder: Callable that accepts a ``torch`` tensor holding a single
                id sequence (batch of one) and returns an indexable output;
                ``output[0][0][1]`` must be a tensor used as the feature vector.
            model: Fitted estimator whose ``predict(features)`` returns the
                predictions for a 2-D feature array.
        """
        self.tokenizer = tokenizer
        self.encoder = encoder
        self.model = model

    def predict(self, X):
        """Predict labels for an iterable of raw texts.

        Args:
            X: Iterable of strings (list, numpy array, ...).

        Returns:
            Whatever ``self.model.predict`` returns for the encoded features.
        """
        token_ids = [self.tokenizer.encode(text) for text in np.asarray(X)]

        # Use the encoder held by this instance. The previous code called a
        # global named MODEL that is not defined anywhere in the notebook, so
        # it only worked when a stale kernel variable happened to exist.
        # Gradients are never needed for inference, so skip building the graph.
        with torch.no_grad():
            # Texts are encoded one at a time (batch of one), so no padding is
            # needed. [0] -> first encoder output, [0] -> the only batch item,
            # [1] -> the vector at token position 1.
            # NOTE(review): position 1 is presumably the first token after the
            # start-of-sequence token rather than the start token itself;
            # confirm this matches how the classifier's training features
            # were extracted before changing it.
            features = np.array([
                self.encoder(torch.tensor([ids]))[0][0][1].detach().numpy()
                for ids in token_ids
            ])

        return self.model.predict(features)
    
# Bundle tokenizer, encoder and the pickled classifier behind a single
# predict() so it can be evaluated like any other model.
transformer_encoder_based_best_model = TransformerWrapper(
    tokenizer=TOKENIZER,
    encoder=ENCODER,
    model=transformer_top_best_model,
)

Some weights of the model checkpoint at roberta_base_transformers were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta_base_transformers and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be ab

In [20]:
# Load the pickled classic-ML model; it is later called directly on raw texts.
# NOTE: pickle.load can execute arbitrary code, so only load artifacts you trust.
with open("best_models/classic_ml_best_model.pkl", "rb") as model_file:
    classic_ml_best_model = pickle.load(model_file)

In [37]:
# Score every candidate on the test set and measure its per-sample latency.
candidates = {
    "classic_ml": classic_ml_best_model,
    "transformer_encoder_based": transformer_encoder_based_best_model,
}
n_samples = X_test.shape[0]

results = {"name": [], "score": [], "prediction_speed": []}

for name, model in candidates.items():
    # First pass: predictions used for the micro-averaged F1 score.
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred, average="micro")
    results["name"].append(name)
    results["score"].append(score)

    # Second pass: one timed run over the full test set, reported as
    # seconds per sample. The scoring pass above ran first, so this
    # measurement is taken after the model has already been exercised once.
    elapsed = timeit.timeit(lambda: model.predict(X_test), number=1)
    results["prediction_speed"].append(elapsed / n_samples)

In [38]:
# One row per model: name, micro-F1 score, prediction time per sample (seconds).
pd.DataFrame(data=results)

Unnamed: 0,name,score,prediction_speed
0,classic_ml,0.879,1.1e-05
1,transformer_encoder_based,0.865,0.029091
