In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import pipeline
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from utils import get_reviews, results
import pandas as pd

class BertFeatures(BaseEstimator, TransformerMixin):
    """Transformer that wraps a Hugging Face feature-extraction pipeline.

    The wrapped pipeline is created with ``pipeline('feature-extraction')``,
    so the underlying model is whatever ``transformers`` selects by default
    for that task.
    """

    def __init__(self):
        # NOTE: the pipeline is built eagerly, so the model is set up every
        # time an instance is constructed.
        self.nlp = pipeline('feature-extraction')

    def fit(self, X, y=None):
        """Do nothing, as there is no state to learn; return ``self``."""
        return self

    def transform(self, X: np.array):
        """Run every sequence in ``X`` through the pipeline in a single call.

        ``X`` is materialised into a list first, so any iterable of inputs
        accepted by the pipeline can be passed.  The pipeline output is
        converted with ``np.array`` and returned.
        """
        sequences = list(X)
        print(f'Extracts bert features for:{len(sequences)} sequences')
        raw_output = self.nlp(sequences, pad_to_max_length=True)
        embeddings = np.array(raw_output)
        print('Done extracting bert features')
        return embeddings
    
class PooledOutput(BaseEstimator, TransformerMixin):
    """Reduce per-token features to the first token's vector per sequence."""

    def fit(self, X, y=None):
        """Stateless step: nothing is fitted; return ``self``."""
        return self

    def transform(self, X: np.array, y=None):
        """Return ``X[:, 0, :]``: the first token's embedding per sequence.

        ``X`` is expected to be laid out as
        (n_data_points, n_tokens + 2, model_dimension); the ``+ 2`` accounts
        for the CLS token BERT adds in front and the SEP token at the end,
        which makes position 0 the CLS embedding.
        """
        cls_position = 0
        return X[:, cls_position, :]
    
def make_pooled_bert():
    """Build a fresh pipeline: feature extraction, pooling, classifier.

    The steps are named ``'bert_features'``, ``'pooling'`` and
    ``'classifier'``; the classifier is a ``LogisticRegression`` using the
    ``saga`` solver with up to 5000 iterations.
    """
    extractor = BertFeatures()
    pooler = PooledOutput()
    classifier = LogisticRegression(solver='saga', max_iter=5000)
    steps = [
        ('bert_features', extractor),
        ('pooling', pooler),
        ('classifier', classifier),
    ]
    return Pipeline(steps)


# Load the reviews; the commented-out line swaps in a 100-row sample
# (presumably for quick experiments).
df = get_reviews()
#df = df.sample(100)

# Training-set sizes to evaluate, ending with the full data set.
train_sizes = [1000, 10000, 20000, len(df)]
#train_sizes = [50, 100]

scores = results(df, make_model=make_pooled_bert, n_data_points=train_sizes)
accuracy_by_size = pd.DataFrame(scores).set_index('n_data_points')

# The trailing semicolon keeps the notebook from echoing the plot's repr.
accuracy_by_size.plot(title='Number of data points and train set accuracy');

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…


Fits for number of data points:1000
Extracts bert features for:999 sequences
Done extracting bert features
Predicts and computes accuracy for the entire data set
Extracts bert features for:50000 sequences
