In [1]:
import google.auth

In [2]:
import sys
import numpy as np

import scipy
from scipy.sparse import hstack
import pandas as pd
from tqdm import tqdm
import re

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

In [3]:
from pecos.xmc.xlinear.model import XLinearModel
from pecos.xmc import Indexer, LabelEmbeddingFactory
from pecos.utils import smat_util

from pecos.utils import logging_util
logging_util.setup_logging_config(level=1) # LOGGING CONFIG

In [4]:
import nltk
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Tf-idf vectorizer class

In [5]:
stemmer = SnowballStemmer(language='english')

simple_tokenizer = lambda t: re.findall('\w+', t)

# Replace all punctuation with spaces for 
# character-level tf-idf
def character_preprocessor(text):
    clear_text = re.sub('\W+', ' ', text)
    if clear_text[-1] != ' ':
        clear_text = clear_text + ' '
    if clear_text[0] != ' ':
        clear_text = ' ' + clear_text
    return clear_text


# Tf-idf vectorizer with:
#   - unigram word tf-idf
#   - bigram word tf-idf
#   - thrigram character tf-idf
# Concatecated together
class MultipleVectorizer:
    
    def __init__(self,
                 tokenizer,
                 lowercase=True,
                 stop_words=None,
                 min_df=1,
                 max_df=1.0,
                 max_features=None,
                 do_stemm=False):

        # Create vectorizers
        word_unigram_tfidf_vect = TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 1),
            tokenizer=tokenizer,
            lowercase=lowercase,
            min_df=min_df,
            max_df=max_df,
            max_features=max_features
        )

        word_bigram_tfidf_vect = TfidfVectorizer(
            analyzer='word',
            ngram_range=(2, 2),
            tokenizer=tokenizer,
            lowercase=lowercase,
            min_df=min_df,
            max_df=max_df,
            max_features=max_features
        )

        char_trigram_tfidf_vect = TfidfVectorizer(
            analyzer='char_wb',
            ngram_range=(3, 3),
            lowercase=lowercase,
            preprocessor=character_preprocessor,
            min_df=min_df,
            max_df=max_df,
            max_features=max_features
        )
        
        self.vectorizers = [word_unigram_tfidf_vect, 
                            word_bigram_tfidf_vect, 
                            char_trigram_tfidf_vect]
        
        self.do_stemm = do_stemm
        self.tokenizer = tokenizer

    

    
    def simple_stemmer(self, texts_list):
        result = []
        print('Stemming:')
        for text in tqdm(texts_list):
            words=self.tokenizer(text)
            words=[stemmer.stem(word) for word in words]
            stemmed_text = ' '.join(words)
            result.append(stemmed_text)
        return result


    def fit(self, X):

        if self.do_stemm:
            X = self.simple_stemmer(X)

        print("Fitting vectorizers:")
        for vectorizer in tqdm(self.vectorizers):
            vectorizer.fit(X)
        
    def transform(self, X):
        
        if self.do_stemm:
            X = self.simple_stemmer(X)

        print('Transforming:')
        output = []
        for vectorizer in tqdm(self.vectorizers):
            output.append(vectorizer.transform(X))
            
        stack = hstack(output)
        csr_matrix = scipy.sparse.csr_matrix(stack, dtype=np.float32)

        return csr_matrix
    
    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)
    

    def get_feature_names_out(self):
        output = []
        for vectorizer in self.vectorizers:
            output.append(vectorizer.get_feature_names_out())
        return np.hstack(output)

In [6]:
# Test tf-idf vectorizer

# Create vectorizer from multiple vectorizers
multiple_vect = MultipleVectorizer(
  lowercase=True,
  min_df=1,
  max_df=1.0,
  max_features=None,
  tokenizer=simple_tokenizer,
  do_stemm=False
)

X = ['''ab,    abb ab!!... ba baba 
aba aba abab! bab bab
  abab ab ab ab ab ab abb b bab ab ab ab ab ab''',
     'ab ab, abab ab, ab: abab ba baba baba ab aba. bab aba',
     'a a bab ab   abab ab ab abab, ab ab ab ab abab ab aba b',
     'ab, ab ab ab, ab ab ab   ba bab ab a, bab ab! ab ab ab a ba']
result = multiple_vect.fit_transform(X)
multiple_vect.get_feature_names_out()

Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:00<00:00, 606.08it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 854.06it/s]


array(['a', 'ab', 'aba', 'abab', 'abb', 'b', 'ba', 'bab', 'baba', 'a a',
       'a ba', 'a bab', 'ab a', 'ab ab', 'ab aba', 'ab abab', 'ab abb',
       'ab ba', 'aba aba', 'aba abab', 'aba b', 'aba bab', 'abab ab',
       'abab ba', 'abab bab', 'abb ab', 'abb b', 'b bab', 'ba bab',
       'ba baba', 'bab ab', 'bab aba', 'bab abab', 'bab bab', 'baba ab',
       'baba aba', 'baba baba', ' a ', ' ab', ' b ', ' ba', 'ab ', 'aba',
       'abb', 'ba ', 'bab', 'bb '], dtype=object)

### Load dataset

In [7]:
dataset_filename = None
try:
    credentials, project = google.auth.default()
    dataset_filename = dataset_file_name # GCP
except:
    if 'google.colab' in sys.modules:
        dataset_filename = '/content/drive/MyDrive/qa_ncbi_dataset.tsv' # google colab
    else:
        dataset_filename = 'data/qa_ncbi_dataset.tsv' # local run
        
df = pd.read_csv(dataset_filename, sep='\t')
df.shape

(29319, 15)

In [8]:
df.question_id.nunique()

4040

### Split into dataframes

In [9]:
# Paper DF # Z
papers_df = df[['snippet_url', 'title', 'abstract']] \
    .drop_duplicates() \
    .reset_index(drop=True).reset_index() \
    .rename(columns={'index': 'paper_id'})
papers_df['title_abstract'] = papers_df['title'] + '. ' + papers_df['abstract']

df = df.merge(papers_df[['paper_id', 'snippet_url']])
papers_df = papers_df.set_index('paper_id')

# # Create paper id from url
# df['paper_id'] = df.snippet_id.str.split('_')
# df['paper_id'] = df['paper_id'].apply(lambda a: a[0]).astype(int)
papers_df.sample()

Unnamed: 0_level_0,snippet_url,title,abstract,title_abstract
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
19942,http://www.ncbi.nlm.nih.gov/pubmed/25218447,Uncovering global SUMOylation signaling networ...,SUMOylation is a reversible post-translational...,Uncovering global SUMOylation signaling networ...


In [10]:
# question-paper matrix # Y
question_paper_df = pd.crosstab(df['question_id'], df['paper_id']).astype(int)
question_paper_df = question_paper_df[sorted(question_paper_df.columns)]
question_paper_df.sample()

paper_id,0,1,2,3,4,5,6,7,8,9,...,27794,27795,27796,27797,27798,27799,27800,27801,27802,27803
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533aaab6d6d3ac6a34000062,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Question df # X
questions_df = df[['question_id', 'question_text']] \
    .drop_duplicates('question_id') \
    .set_index('question_id').loc[question_paper_df.index]
questions_df.sample()

Unnamed: 0_level_0,question_text
question_id,Unnamed: 1_level_1
56c71cb65795f9a73e00000b,Can DNA intercalators function as topoisomeras...


### Split into dataframes [with splitting into sentences] (not finished)

### Train-test splitting

In [12]:
train_question_ids, valid_question_ids = train_test_split(questions_df.index, 
                                                          test_size=0.1,
                                                          random_state=5)
print(f'Train: {len(train_question_ids)}')
print(f'Valid: {len(valid_question_ids)}')

Train: 3636
Valid: 404


In [13]:
# Questions splitting
train_questions_df = questions_df.loc[train_question_ids]
valid_questions_df = questions_df.loc[valid_question_ids]

# Question-Paper matrix splitting
train_question_paper_df = question_paper_df.loc[train_question_ids]
train_question_paper_csr = scipy.sparse.csr_matrix(train_question_paper_df.values, dtype=np.float32)
valid_question_paper_df = question_paper_df.loc[valid_question_ids]
valid_question_paper_csr = scipy.sparse.csr_matrix(valid_question_paper_df.values, dtype=np.float32)

### Tf-idf vectorization

In [14]:
paper_column = 'title_abstract' # title_abstract / title / abstract

# Concatenate all texts
all_texts = np.concatenate([papers_df[paper_column].values, 
                            train_questions_df.question_text.values])

vectorizer = MultipleVectorizer(
    tokenizer=simple_tokenizer,
    lowercase=True,
    min_df=1,
    max_df=1.0,
    do_stemm=False,
    max_features=300_000
)

vectorizer.fit(all_texts)
print(f'Vector size: {len(vectorizer.get_feature_names_out())}')

Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:43<00:00, 14.49s/it]


Vector size: 434546


In [15]:
train_questions_csr = vectorizer.transform(train_questions_df.question_text.values)
valid_questions_csr = vectorizer.transform(valid_questions_df.question_text.values)
papers_csr = vectorizer.transform(papers_df[paper_column].values)

Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.43it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 79.24it/s]


Transforming:


100%|██████████| 3/3 [00:37<00:00, 12.63s/it]


### Hyperparameter tuning with Bayesian Optimization

In [16]:
from bayes_opt import BayesianOptimization

In [17]:
paper_column = 'title_abstract' # title_abstract / title / abstract

# Concatenate all texts
all_texts = np.concatenate([papers_df[paper_column].values, 
                            train_questions_df.question_text.values])

In [18]:
def black_box_function(min_df,
                       max_df,
                       max_features,
                       threshold, 
                       nr_splits, 
                       Cp, 
                       Cn, 
                       max_iter,
                       eps,
                       bias):
    
    max_features = int(max_features)
    min_df = int(min_df)
    nr_splits = int(nr_splits)
    max_iter = int(max_iter)
    
    # Vectorization
    vectorizer = MultipleVectorizer(
        tokenizer=simple_tokenizer,
        lowercase=True,
        min_df=min_df,
        max_df=max_df,
        do_stemm=False,
        max_features=max_features
    )

    vectorizer.fit(all_texts)
    print(f'Vector size: {len(vectorizer.get_feature_names_out())}')
    train_questions_csr = vectorizer.transform(train_questions_df.question_text.values)
    valid_questions_csr = vectorizer.transform(valid_questions_df.question_text.values)
    papers_csr = vectorizer.transform(papers_df[paper_column].values)
    
    # Train
    hlm_args_dict = {
        'neg_mining_chain': "tfn",
        'model_chain': {
            'threshold': threshold,
            'max_nonzeros_per_label': None,
            'solver_type': "L2R_L2LOSS_SVC_DUAL",
            'Cp': Cp,
            'Cn': Cn,
            'max_iter': max_iter,
            'eps': eps,
            'bias': bias,
            'threads': 8,
            'verbose': 0,
        }
    }
    
    train_params_dict = {
        'mode': "full-model",
        'ranker_level': 1,
        'nr_splits': nr_splits,
        'min_codes': None,
        'shallow': False,
        'rel_mode': "disable",
        'rel_norm': "no-norm",
        'hlm_args': hlm_args_dict
    }
    
    predict_params_dict = {'hlm_args': hlm_args_dict}
    
    train_params = XLinearModel.TrainParams.from_dict(train_params_dict, recursive=True)
    pred_params = XLinearModel.PredParams.from_dict(predict_params_dict, recursive=True)

    pifa_concat = LabelEmbeddingFactory.create(train_question_paper_csr, 
                                           train_questions_csr, 
                                           Z=papers_csr, 
                                           method="pifa_lf_concat")

    cluster_chain = Indexer.gen(pifa_concat, 
                                indexer_type="hierarchicalkmeans",
                                nr_splits=nr_splits)
    
    xlm = XLinearModel.train(train_questions_csr, 
                             train_question_paper_csr, 
                             C=cluster_chain,
                             train_params=train_params)
    
    predicted_question_paper_csr = xlm.predict(valid_questions_csr,
                                               beam_size=10,
                                               only_topk=10)
    
    metric = smat_util.Metrics.generate(valid_question_paper_csr, 
                                        predicted_question_paper_csr, 
                                        topk=10)    
    prec = metric.prec[0] # Take first precision
    
    return prec

In [19]:
pbounds = {
    'min_df':(1, 10),
    'max_df': (0, 1),
    'max_features': (10_000, 500_000),
    'threshold': (0.1, 1),
    'nr_splits': (2, 1024),
    'Cp': (0, 1),
    'Cn': (0, 1),
    'max_iter': (5, 500),
    'eps': (0, 1),
    'bias': (0, 0.5)
}

optimizer = BayesianOptimization(f=black_box_function,
                                 pbounds=pbounds, 
                                 verbose=2,
                                 random_state=5)

In [None]:
optimizer.maximize(init_points = 10, n_iter = 100)
print("Best result: {}; f(x) = {}.".format(optimizer.max["params"], optimizer.max["target"]))

|   iter    |  target   |    Cn     |    Cp     |   bias    |    eps    |  max_df   | max_fe... | max_iter  |  min_df   | nr_splits | threshold |
-------------------------------------------------------------------------------------------------------------------------------------------------
Fitting vectorizers:


100%|██████████| 3/3 [00:51<00:00, 17.14s/it]


Vector size: 189128
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.95it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 83.78it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.70s/it]


| [0m 1       [0m | [0m 0.01733 [0m | [0m 0.222   [0m | [0m 0.8707  [0m | [0m 0.1034  [0m | [0m 0.9186  [0m | [0m 0.4884  [0m | [0m 3.098e+0[0m | [0m 384.1   [0m | [0m 5.666   [0m | [0m 305.3   [0m | [0m 0.2689  [0m |
Fitting vectorizers:


100%|██████████| 3/3 [00:52<00:00, 17.41s/it]


Vector size: 203932
Transforming:


100%|██████████| 3/3 [00:00<00:00,  9.93it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 71.41it/s]


Transforming:


100%|██████████| 3/3 [00:42<00:00, 14.01s/it]


| [0m 2       [0m | [0m 0.0     [0m | [0m 0.08074 [0m | [0m 0.7384  [0m | [0m 0.2207  [0m | [0m 0.1583  [0m | [0m 0.8799  [0m | [0m 1.443e+0[0m | [0m 210.0   [0m | [0m 3.665   [0m | [0m 644.6   [0m | [0m 0.6219  [0m |
Fitting vectorizers:


100%|██████████| 3/3 [00:51<00:00, 17.26s/it]


Vector size: 110231
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.92it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 72.68it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.37s/it]


| [95m 3       [0m | [95m 0.02723 [0m | [95m 0.5999  [0m | [95m 0.2658  [0m | [95m 0.1423  [0m | [95m 0.2536  [0m | [95m 0.3276  [0m | [95m 8.064e+0[0m | [95m 86.98   [0m | [95m 9.675   [0m | [95m 983.4   [0m | [95m 0.2696  [0m |
Fitting vectorizers:


100%|██████████| 3/3 [00:50<00:00, 16.85s/it]


Vector size: 185552
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.99it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 87.03it/s]


Transforming:


100%|██████████| 3/3 [00:37<00:00, 12.41s/it]


| [0m 4       [0m | [0m 0.0     [0m | [0m 0.02431 [0m | [0m 0.2046  [0m | [0m 0.3499  [0m | [0m 0.7795  [0m | [0m 0.02293 [0m | [0m 2.931e+0[0m | [0m 5.813   [0m | [0m 5.639   [0m | [0m 655.9   [0m | [0m 0.9871  [0m |
Fitting vectorizers:


100%|██████████| 3/3 [00:52<00:00, 17.55s/it]


Vector size: 216127
Transforming:


100%|██████████| 3/3 [00:00<00:00, 13.05it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 95.31it/s]


Transforming:


100%|██████████| 3/3 [00:35<00:00, 11.94s/it]


| [0m 5       [0m | [0m 0.02228 [0m | [0m 0.2591  [0m | [0m 0.8025  [0m | [0m 0.4352  [0m | [0m 0.9227  [0m | [0m 0.002214[0m | [0m 2.4e+05 [0m | [0m 490.8   [0m | [0m 4.591   [0m | [0m 833.6   [0m | [0m 0.5918  [0m |
Fitting vectorizers:


100%|██████████| 3/3 [00:52<00:00, 17.43s/it]


Vector size: 158226
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.29it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 81.21it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.96s/it]


| [0m 6       [0m | [0m 0.002475[0m | [0m 0.7709  [0m | [0m 0.4849  [0m | [0m 0.01456 [0m | [0m 0.08653 [0m | [0m 0.1115  [0m | [0m 1.331e+0[0m | [0m 482.6   [0m | [0m 6.686   [0m | [0m 836.6   [0m | [0m 0.6095  [0m |
Fitting vectorizers:


100%|██████████| 3/3 [00:50<00:00, 16.95s/it]


Vector size: 157179
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.11it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 80.60it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.68s/it]


| [0m 7       [0m | [0m 0.002475[0m | [0m 0.6354  [0m | [0m 0.8119  [0m | [0m 0.4633  [0m | [0m 0.9126  [0m | [0m 0.8248  [0m | [0m 5.616e+0[0m | [0m 183.7   [0m | [0m 1.32    [0m | [0m 560.4   [0m | [0m 0.8165  [0m |
Fitting vectorizers:


100%|██████████| 3/3 [00:51<00:00, 17.09s/it]


Vector size: 189294
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.48it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 81.08it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.82s/it]


| [0m 8       [0m | [0m 0.002475[0m | [0m 0.05114 [0m | [0m 0.1887  [0m | [0m 0.1827  [0m | [0m 0.2443  [0m | [0m 0.7951  [0m | [0m 1.825e+0[0m | [0m 321.2   [0m | [0m 5.441   [0m | [0m 598.3   [0m | [0m 0.9454  [0m |
Fitting vectorizers:


100%|██████████| 3/3 [00:51<00:00, 17.19s/it]


Vector size: 109208
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.67it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 78.42it/s]


Transforming:


100%|██████████| 3/3 [00:37<00:00, 12.62s/it]


| [95m 9       [0m | [95m 0.05941 [0m | [95m 0.9435  [0m | [95m 0.1117  [0m | [95m 0.4218  [0m | [95m 0.346   [0m | [95m 0.1008  [0m | [95m 1.979e+0[0m | [95m 257.6   [0m | [95m 9.65    [0m | [95m 381.7   [0m | [95m 0.1111  [0m |
Fitting vectorizers:


100%|██████████| 3/3 [00:51<00:00, 17.23s/it]


Vector size: 363198
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.14it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 71.77it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.90s/it]


| [0m 10      [0m | [0m 0.002475[0m | [0m 0.8597  [0m | [0m 0.1111  [0m | [0m 0.2392  [0m | [0m 0.85    [0m | [0m 0.5147  [0m | [0m 2.288e+0[0m | [0m 401.2   [0m | [0m 1.184   [0m | [0m 587.2   [0m | [0m 0.4702  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.46s/it]


Vector size: 258254
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.71it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 88.45it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.83s/it]


| [0m 11      [0m | [0m 0.00495 [0m | [0m 0.9307  [0m | [0m 0.8253  [0m | [0m 0.1876  [0m | [0m 0.5961  [0m | [0m 0.105   [0m | [0m 2e+05   [0m | [0m 431.2   [0m | [0m 3.097   [0m | [0m 531.2   [0m | [0m 0.6247  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.77s/it]


Vector size: 317212
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.78it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 78.86it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.17s/it]


| [0m 12      [0m | [0m 0.02228 [0m | [0m 0.9036  [0m | [0m 0.8835  [0m | [0m 0.3999  [0m | [0m 0.2919  [0m | [0m 0.2191  [0m | [0m 4.998e+0[0m | [0m 34.15   [0m | [0m 3.219   [0m | [0m 751.0   [0m | [0m 0.3634  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.52s/it]


Vector size: 138388
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.81it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 77.85it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.41s/it]


| [0m 13      [0m | [0m 0.0     [0m | [0m 0.7812  [0m | [0m 0.5141  [0m | [0m 0.463   [0m | [0m 0.4526  [0m | [0m 0.7233  [0m | [0m 1.055e+0[0m | [0m 410.9   [0m | [0m 7.779   [0m | [0m 412.3   [0m | [0m 0.6288  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.96s/it]


Vector size: 488158
Transforming:


100%|██████████| 3/3 [00:00<00:00,  9.50it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 70.39it/s]


Transforming:


100%|██████████| 3/3 [00:43<00:00, 14.42s/it]


| [0m 14      [0m | [0m 0.009901[0m | [0m 0.194   [0m | [0m 0.6897  [0m | [0m 0.03151 [0m | [0m 0.1883  [0m | [0m 0.7286  [0m | [0m 4.125e+0[0m | [0m 106.2   [0m | [0m 2.211   [0m | [0m 831.8   [0m | [0m 0.3967  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.94s/it]


Vector size: 138182
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.74it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 82.42it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.45s/it]


| [0m 15      [0m | [0m 0.05198 [0m | [0m 0.1123  [0m | [0m 0.4253  [0m | [0m 0.3941  [0m | [0m 0.363   [0m | [0m 0.4275  [0m | [0m 3.137e+0[0m | [0m 159.3   [0m | [0m 7.075   [0m | [0m 764.9   [0m | [0m 0.152   [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.99s/it]


Vector size: 159549
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.26it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 80.52it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.54s/it]


| [0m 16      [0m | [0m 0.007426[0m | [0m 0.8647  [0m | [0m 0.635   [0m | [0m 0.02159 [0m | [0m 0.05    [0m | [0m 0.8914  [0m | [0m 4.14e+05[0m | [0m 51.33   [0m | [0m 6.378   [0m | [0m 927.6   [0m | [0m 0.5692  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.56s/it]


Vector size: 189328
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.17it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 79.03it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.67s/it]


| [0m 17      [0m | [0m 0.00495 [0m | [0m 0.219   [0m | [0m 0.6041  [0m | [0m 0.2122  [0m | [0m 0.391   [0m | [0m 0.9039  [0m | [0m 1.979e+0[0m | [0m 446.2   [0m | [0m 5.388   [0m | [0m 473.8   [0m | [0m 0.4846  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.55s/it]


Vector size: 413827
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.95it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 74.21it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.36s/it]


| [0m 18      [0m | [0m 0.0     [0m | [0m 0.9777  [0m | [0m 0.3306  [0m | [0m 0.3274  [0m | [0m 0.4235  [0m | [0m 0.3183  [0m | [0m 3.386e+0[0m | [0m 204.3   [0m | [0m 2.897   [0m | [0m 725.6   [0m | [0m 0.8106  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.65s/it]


Vector size: 120465
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.65it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 78.01it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.47s/it]


| [0m 19      [0m | [0m 0.01238 [0m | [0m 0.9495  [0m | [0m 0.8253  [0m | [0m 0.127   [0m | [0m 0.01165 [0m | [0m 0.6251  [0m | [0m 8.204e+0[0m | [0m 208.6   [0m | [0m 7.155   [0m | [0m 32.89   [0m | [0m 0.4197  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.74s/it]


Vector size: 166294
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.32it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 70.84it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.36s/it]


| [0m 20      [0m | [0m 0.0     [0m | [0m 0.4553  [0m | [0m 0.9105  [0m | [0m 0.281   [0m | [0m 0.9788  [0m | [0m 0.4408  [0m | [0m 9.083e+0[0m | [0m 417.3   [0m | [0m 2.79    [0m | [0m 492.9   [0m | [0m 0.9828  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.90s/it]


Vector size: 235700
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.25it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 76.48it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.77s/it]


| [0m 21      [0m | [0m 0.02723 [0m | [0m 0.5686  [0m | [0m 0.7096  [0m | [0m 0.4922  [0m | [0m 0.877   [0m | [0m 0.6659  [0m | [0m 1.891e+0[0m | [0m 495.7   [0m | [0m 4.975   [0m | [0m 177.5   [0m | [0m 0.3097  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 17.00s/it]


Vector size: 317352
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.94it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 75.37it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.39s/it]


| [0m 22      [0m | [0m 0.009901[0m | [0m 0.1708  [0m | [0m 0.9102  [0m | [0m 0.02456 [0m | [0m 0.2075  [0m | [0m 0.2601  [0m | [0m 3.843e+0[0m | [0m 329.5   [0m | [0m 3.123   [0m | [0m 737.4   [0m | [0m 0.4909  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.87s/it]


Vector size: 317730
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.82it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 80.84it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.58s/it]


| [0m 23      [0m | [0m 0.0     [0m | [0m 0.655   [0m | [0m 0.9383  [0m | [0m 0.1345  [0m | [0m 0.7522  [0m | [0m 0.4754  [0m | [0m 4.759e+0[0m | [0m 245.0   [0m | [0m 3.743   [0m | [0m 916.7   [0m | [0m 0.5791  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.59s/it]


Vector size: 109793
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.63it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 85.99it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.68s/it]


| [0m 24      [0m | [0m 0.00495 [0m | [0m 0.866   [0m | [0m 0.9781  [0m | [0m 0.02728 [0m | [0m 0.5877  [0m | [0m 0.1871  [0m | [0m 3.203e+0[0m | [0m 88.07   [0m | [0m 9.202   [0m | [0m 703.7   [0m | [0m 0.5992  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.93s/it]


Vector size: 76464
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.42it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 83.36it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.92s/it]


| [0m 25      [0m | [0m 0.01485 [0m | [0m 0.3418  [0m | [0m 0.5134  [0m | [0m 0.09615 [0m | [0m 0.8192  [0m | [0m 0.1946  [0m | [0m 3.564e+0[0m | [0m 191.8   [0m | [0m 6.114   [0m | [0m 584.0   [0m | [0m 0.4331  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.89s/it]


Vector size: 138391
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.41it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 77.57it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.55s/it]


| [0m 26      [0m | [0m 0.0     [0m | [0m 0.4689  [0m | [0m 0.4065  [0m | [0m 0.2511  [0m | [0m 0.8352  [0m | [0m 0.7363  [0m | [0m 1.288e+0[0m | [0m 241.9   [0m | [0m 7.712   [0m | [0m 564.1   [0m | [0m 0.8173  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:51<00:00, 17.16s/it]


Vector size: 502433
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.59it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 70.15it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.64s/it]


| [0m 27      [0m | [0m 0.0     [0m | [0m 0.2241  [0m | [0m 0.1985  [0m | [0m 0.04302 [0m | [0m 0.6978  [0m | [0m 0.3989  [0m | [0m 3.682e+0[0m | [0m 322.1   [0m | [0m 1.19    [0m | [0m 233.2   [0m | [0m 0.6935  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.81s/it]


Vector size: 235416
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.53it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 67.50it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.43s/it]


| [0m 28      [0m | [0m 0.0     [0m | [0m 0.2638  [0m | [0m 0.3416  [0m | [0m 0.2104  [0m | [0m 0.8158  [0m | [0m 0.3657  [0m | [0m 2.34e+05[0m | [0m 199.6   [0m | [0m 4.693   [0m | [0m 734.5   [0m | [0m 0.86    [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.98s/it]


Vector size: 138427
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.32it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 85.91it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.66s/it]


| [0m 29      [0m | [0m 0.0495  [0m | [0m 0.6248  [0m | [0m 0.5432  [0m | [0m 0.124   [0m | [0m 0.1297  [0m | [0m 0.8449  [0m | [0m 3.365e+0[0m | [0m 31.32   [0m | [0m 7.376   [0m | [0m 848.3   [0m | [0m 0.2056  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.91s/it]


Vector size: 392460
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.36it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 70.67it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.92s/it]


| [0m 30      [0m | [0m 0.0     [0m | [0m 0.6216  [0m | [0m 0.9883  [0m | [0m 0.1214  [0m | [0m 0.3285  [0m | [0m 0.7079  [0m | [0m 3.168e+0[0m | [0m 239.7   [0m | [0m 2.707   [0m | [0m 428.9   [0m | [0m 0.7382  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.98s/it]


Vector size: 570055
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.10it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 72.53it/s]


Transforming:


100%|██████████| 3/3 [00:42<00:00, 14.14s/it]


| [0m 31      [0m | [0m 0.0     [0m | [0m 0.1557  [0m | [0m 0.0277  [0m | [0m 0.1311  [0m | [0m 0.8977  [0m | [0m 0.4818  [0m | [0m 4.357e+0[0m | [0m 29.86   [0m | [0m 1.625   [0m | [0m 349.8   [0m | [0m 0.3607  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.84s/it]


Vector size: 159474
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.34it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 76.24it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.52s/it]


| [0m 32      [0m | [0m 0.00495 [0m | [0m 0.6628  [0m | [0m 0.7135  [0m | [0m 0.1962  [0m | [0m 0.4965  [0m | [0m 0.6729  [0m | [0m 4.631e+0[0m | [0m 135.6   [0m | [0m 6.021   [0m | [0m 547.3   [0m | [0m 0.7127  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.86s/it]


Vector size: 189159
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.47it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 78.08it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.48s/it]


| [0m 33      [0m | [0m 0.0     [0m | [0m 0.1538  [0m | [0m 0.1666  [0m | [0m 0.4393  [0m | [0m 0.4137  [0m | [0m 0.5208  [0m | [0m 1.657e+0[0m | [0m 470.2   [0m | [0m 5.143   [0m | [0m 55.49   [0m | [0m 0.9341  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.56s/it]


Vector size: 84295
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.64it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 83.78it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.18s/it]


| [0m 34      [0m | [0m 0.002475[0m | [0m 0.7902  [0m | [0m 0.5966  [0m | [0m 0.06417 [0m | [0m 0.109   [0m | [0m 0.9224  [0m | [0m 4.576e+0[0m | [0m 370.8   [0m | [0m 7.481   [0m | [0m 160.8   [0m | [0m 0.8326  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.44s/it]


Vector size: 107107
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.35it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 83.45it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 13.00s/it]


| [0m 35      [0m | [0m 0.0     [0m | [0m 0.8882  [0m | [0m 0.1699  [0m | [0m 0.3794  [0m | [0m 0.6916  [0m | [0m 0.3344  [0m | [0m 4.788e+0[0m | [0m 478.1   [0m | [0m 3.979   [0m | [0m 318.1   [0m | [0m 0.7158  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.81s/it]


Vector size: 110666
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.70it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 81.67it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.19s/it]


| [0m 36      [0m | [0m 0.0     [0m | [0m 0.6746  [0m | [0m 0.2052  [0m | [0m 0.1079  [0m | [0m 0.7933  [0m | [0m 0.9685  [0m | [0m 2.963e+0[0m | [0m 173.5   [0m | [0m 9.554   [0m | [0m 19.18   [0m | [0m 0.9854  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:51<00:00, 17.21s/it]


Vector size: 129324
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.10it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 77.34it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.46s/it]


| [0m 37      [0m | [0m 0.05198 [0m | [0m 0.8067  [0m | [0m 0.7821  [0m | [0m 0.458   [0m | [0m 0.4187  [0m | [0m 0.9071  [0m | [0m 5.361e+0[0m | [0m 427.0   [0m | [0m 2.358   [0m | [0m 888.8   [0m | [0m 0.2965  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.90s/it]


Vector size: 234145
Transforming:


100%|██████████| 3/3 [00:00<00:00, 12.27it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 93.31it/s]


Transforming:


100%|██████████| 3/3 [00:37<00:00, 12.42s/it]


| [0m 38      [0m | [0m 0.02475 [0m | [0m 0.08825 [0m | [0m 0.9301  [0m | [0m 0.1988  [0m | [0m 0.2616  [0m | [0m 0.08231 [0m | [0m 1.906e+0[0m | [0m 272.8   [0m | [0m 4.253   [0m | [0m 292.7   [0m | [0m 0.2534  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:48<00:00, 16.19s/it]


Vector size: 138074
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.83it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 92.83it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.81s/it]


| [0m 39      [0m | [0m 0.0     [0m | [0m 0.1981  [0m | [0m 0.45    [0m | [0m 0.3096  [0m | [0m 0.2172  [0m | [0m 0.3631  [0m | [0m 3.789e+0[0m | [0m 11.04   [0m | [0m 7.341   [0m | [0m 602.8   [0m | [0m 0.8329  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.63s/it]


Vector size: 159522
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.81it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 78.33it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.83s/it]


| [0m 40      [0m | [0m 0.002475[0m | [0m 0.2146  [0m | [0m 0.8475  [0m | [0m 0.136   [0m | [0m 0.8435  [0m | [0m 0.8077  [0m | [0m 3.112e+0[0m | [0m 399.2   [0m | [0m 6.117   [0m | [0m 557.8   [0m | [0m 0.749   [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:51<00:00, 17.06s/it]


Vector size: 316958
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.80it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 77.68it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.24s/it]


| [0m 41      [0m | [0m 0.0     [0m | [0m 0.2954  [0m | [0m 0.1559  [0m | [0m 0.2124  [0m | [0m 0.3207  [0m | [0m 0.164   [0m | [0m 3.172e+0[0m | [0m 473.7   [0m | [0m 3.028   [0m | [0m 967.4   [0m | [0m 0.6826  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:51<00:00, 17.06s/it]


Vector size: 471390
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.58it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 81.91it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.73s/it]


| [0m 42      [0m | [0m 0.04455 [0m | [0m 0.7641  [0m | [0m 0.9773  [0m | [0m 0.0623  [0m | [0m 0.3334  [0m | [0m 0.02887 [0m | [0m 3.987e+0[0m | [0m 187.0   [0m | [0m 2.003   [0m | [0m 481.8   [0m | [0m 0.3117  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.93s/it]


Vector size: 319077
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.56it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 68.69it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.82s/it]


| [0m 43      [0m | [0m 0.0396  [0m | [0m 0.8617  [0m | [0m 0.8548  [0m | [0m 0.1384  [0m | [0m 0.5048  [0m | [0m 0.6808  [0m | [0m 1.846e+0[0m | [0m 197.5   [0m | [0m 1.251   [0m | [0m 317.5   [0m | [0m 0.2633  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.56s/it]


Vector size: 76275
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.65it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 80.18it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.00s/it]


| [0m 44      [0m | [0m 0.0     [0m | [0m 0.6741  [0m | [0m 0.003397[0m | [0m 0.4969  [0m | [0m 0.6389  [0m | [0m 0.8443  [0m | [0m 2.543e+0[0m | [0m 348.1   [0m | [0m 1.918   [0m | [0m 421.1   [0m | [0m 0.2871  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.74s/it]


Vector size: 478969
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.23it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 75.97it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.78s/it]


| [0m 45      [0m | [0m 0.0     [0m | [0m 0.1394  [0m | [0m 0.7647  [0m | [0m 0.4374  [0m | [0m 0.669   [0m | [0m 0.07443 [0m | [0m 3.461e+0[0m | [0m 40.89   [0m | [0m 1.896   [0m | [0m 810.3   [0m | [0m 0.945   [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.41s/it]


Vector size: 110621
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.47it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 85.30it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.26s/it]


| [0m 46      [0m | [0m 0.02475 [0m | [0m 0.6203  [0m | [0m 0.04108 [0m | [0m 0.3582  [0m | [0m 0.3993  [0m | [0m 0.8041  [0m | [0m 2.696e+0[0m | [0m 170.5   [0m | [0m 9.991   [0m | [0m 181.3   [0m | [0m 0.1334  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.79s/it]


Vector size: 465704
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.18it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 76.43it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.01s/it]


| [0m 47      [0m | [0m 0.002475[0m | [0m 0.2649  [0m | [0m 0.8234  [0m | [0m 0.3501  [0m | [0m 0.3221  [0m | [0m 0.1402  [0m | [0m 3.323e+0[0m | [0m 245.0   [0m | [0m 1.222   [0m | [0m 49.62   [0m | [0m 0.9507  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.57s/it]


Vector size: 230611
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.82it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 80.49it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.37s/it]


| [0m 48      [0m | [0m 0.02723 [0m | [0m 0.1782  [0m | [0m 0.1706  [0m | [0m 0.2208  [0m | [0m 0.9025  [0m | [0m 0.4369  [0m | [0m 1.712e+0[0m | [0m 196.4   [0m | [0m 3.794   [0m | [0m 627.2   [0m | [0m 0.2098  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.58s/it]


Vector size: 475403
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.04it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 71.59it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.60s/it]


| [0m 49      [0m | [0m 0.0     [0m | [0m 0.03927 [0m | [0m 0.03917 [0m | [0m 0.1455  [0m | [0m 0.7006  [0m | [0m 0.4297  [0m | [0m 4e+05   [0m | [0m 311.2   [0m | [0m 2.579   [0m | [0m 128.5   [0m | [0m 0.6799  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.39s/it]


Vector size: 321193
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.75it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 75.70it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.40s/it]


| [0m 50      [0m | [0m 0.002475[0m | [0m 0.7927  [0m | [0m 0.5291  [0m | [0m 0.2224  [0m | [0m 0.1856  [0m | [0m 0.5609  [0m | [0m 1.868e+0[0m | [0m 236.8   [0m | [0m 1.454   [0m | [0m 396.4   [0m | [0m 0.8897  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.36s/it]


Vector size: 168275
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.62it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 74.46it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.22s/it]


| [0m 51      [0m | [0m 0.0     [0m | [0m 0.1661  [0m | [0m 0.5151  [0m | [0m 0.2667  [0m | [0m 0.6265  [0m | [0m 0.8121  [0m | [0m 1.087e+0[0m | [0m 124.1   [0m | [0m 3.179   [0m | [0m 371.9   [0m | [0m 0.9561  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:48<00:00, 16.23s/it]


Vector size: 141010
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.26it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 82.41it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.16s/it]


| [0m 52      [0m | [0m 0.002475[0m | [0m 0.8632  [0m | [0m 0.9208  [0m | [0m 0.4081  [0m | [0m 0.8871  [0m | [0m 0.776   [0m | [0m 9.548e+0[0m | [0m 10.92   [0m | [0m 5.631   [0m | [0m 843.1   [0m | [0m 0.8757  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:48<00:00, 16.06s/it]


Vector size: 158117
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.86it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 90.22it/s]


Transforming:


100%|██████████| 3/3 [00:37<00:00, 12.37s/it]


| [0m 53      [0m | [0m 0.002475[0m | [0m 0.09281 [0m | [0m 0.3145  [0m | [0m 0.4876  [0m | [0m 0.7952  [0m | [0m 0.1015  [0m | [0m 2.648e+0[0m | [0m 91.01   [0m | [0m 6.371   [0m | [0m 762.6   [0m | [0m 0.4506  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:48<00:00, 16.18s/it]


Vector size: 110374
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.14it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 89.73it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.82s/it]


| [0m 54      [0m | [0m 0.0     [0m | [0m 0.1601  [0m | [0m 0.3176  [0m | [0m 0.4936  [0m | [0m 0.9909  [0m | [0m 0.4133  [0m | [0m 4.546e+0[0m | [0m 170.6   [0m | [0m 9.721   [0m | [0m 989.3   [0m | [0m 0.9143  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:48<00:00, 16.27s/it]


Vector size: 206448
Transforming:


100%|██████████| 3/3 [00:00<00:00, 12.53it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 92.80it/s]


Transforming:


100%|██████████| 3/3 [00:36<00:00, 12.19s/it]


| [0m 55      [0m | [0m 0.02228 [0m | [0m 0.7373  [0m | [0m 0.09588 [0m | [0m 0.2027  [0m | [0m 0.6909  [0m | [0m 0.04423 [0m | [0m 1.491e+0[0m | [0m 375.6   [0m | [0m 3.033   [0m | [0m 240.9   [0m | [0m 0.1485  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.67s/it]


Vector size: 109841
Transforming:


100%|██████████| 3/3 [00:00<00:00, 12.33it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 91.59it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.75s/it]


| [0m 56      [0m | [0m 0.04208 [0m | [0m 0.08419 [0m | [0m 0.764   [0m | [0m 0.4633  [0m | [0m 0.8293  [0m | [0m 0.1968  [0m | [0m 1.139e+0[0m | [0m 339.2   [0m | [0m 9.754   [0m | [0m 21.19   [0m | [0m 0.1167  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.78s/it]


Vector size: 263113
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.39it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 71.91it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.89s/it]


| [0m 57      [0m | [0m 0.007426[0m | [0m 0.3552  [0m | [0m 0.1485  [0m | [0m 0.3696  [0m | [0m 0.4321  [0m | [0m 0.7391  [0m | [0m 2.035e+0[0m | [0m 18.29   [0m | [0m 3.987   [0m | [0m 211.9   [0m | [0m 0.3539  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.74s/it]


Vector size: 188726
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.16it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 82.89it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.89s/it]


| [0m 58      [0m | [0m 0.0     [0m | [0m 0.3264  [0m | [0m 0.3174  [0m | [0m 0.2146  [0m | [0m 0.8411  [0m | [0m 0.2534  [0m | [0m 1.461e+0[0m | [0m 450.7   [0m | [0m 5.79    [0m | [0m 59.1    [0m | [0m 0.4154  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.54s/it]


Vector size: 573370
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.18it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 66.74it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.95s/it]


| [0m 59      [0m | [0m 0.009901[0m | [0m 0.4473  [0m | [0m 0.4131  [0m | [0m 0.1799  [0m | [0m 0.9116  [0m | [0m 0.6209  [0m | [0m 4.389e+0[0m | [0m 308.7   [0m | [0m 1.97    [0m | [0m 653.7   [0m | [0m 0.3336  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.61s/it]


Vector size: 159027
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.10it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 79.29it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.92s/it]


| [0m 60      [0m | [0m 0.007426[0m | [0m 0.1796  [0m | [0m 0.8726  [0m | [0m 0.3895  [0m | [0m 0.5188  [0m | [0m 0.2826  [0m | [0m 1.928e+0[0m | [0m 376.9   [0m | [0m 6.617   [0m | [0m 895.5   [0m | [0m 0.6431  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:48<00:00, 16.06s/it]


Vector size: 110636
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.47it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 75.89it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.25s/it]


| [0m 61      [0m | [0m 0.0     [0m | [0m 0.6208  [0m | [0m 0.7429  [0m | [0m 0.1442  [0m | [0m 0.09122 [0m | [0m 0.8526  [0m | [0m 2.491e+0[0m | [0m 461.3   [0m | [0m 9.017   [0m | [0m 647.7   [0m | [0m 0.975   [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.67s/it]


Vector size: 40746
Transforming:


100%|██████████| 3/3 [00:00<00:00, 12.03it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 91.45it/s]


Transforming:


100%|██████████| 3/3 [00:37<00:00, 12.56s/it]


| [0m 62      [0m | [0m 0.00495 [0m | [0m 0.01786 [0m | [0m 0.1247  [0m | [0m 0.3706  [0m | [0m 0.1153  [0m | [0m 0.3064  [0m | [0m 1.358e+0[0m | [0m 17.24   [0m | [0m 5.309   [0m | [0m 186.8   [0m | [0m 0.1863  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.79s/it]


Vector size: 159100
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.21it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 80.85it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.04s/it]


| [0m 63      [0m | [0m 0.01733 [0m | [0m 0.3722  [0m | [0m 0.3996  [0m | [0m 0.2777  [0m | [0m 0.4973  [0m | [0m 0.3127  [0m | [0m 1.837e+0[0m | [0m 389.0   [0m | [0m 6.065   [0m | [0m 754.9   [0m | [0m 0.3064  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.66s/it]


Vector size: 122259
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.10it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 78.75it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.96s/it]


| [0m 64      [0m | [0m 0.0     [0m | [0m 0.6718  [0m | [0m 0.6237  [0m | [0m 0.3085  [0m | [0m 0.3622  [0m | [0m 0.2712  [0m | [0m 1.302e+0[0m | [0m 468.9   [0m | [0m 8.643   [0m | [0m 1.014e+0[0m | [0m 0.9802  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.73s/it]


Vector size: 159345
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.80it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 81.83it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.40s/it]


| [0m 65      [0m | [0m 0.0     [0m | [0m 0.8243  [0m | [0m 0.8727  [0m | [0m 0.2024  [0m | [0m 0.1704  [0m | [0m 0.4748  [0m | [0m 3.046e+0[0m | [0m 488.1   [0m | [0m 6.923   [0m | [0m 296.8   [0m | [0m 0.8762  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.39s/it]


Vector size: 99492
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.08it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 75.44it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.94s/it]


| [0m 66      [0m | [0m 0.0     [0m | [0m 0.7691  [0m | [0m 0.05714 [0m | [0m 0.01803 [0m | [0m 0.4008  [0m | [0m 0.4224  [0m | [0m 5.418e+0[0m | [0m 157.5   [0m | [0m 5.342   [0m | [0m 930.7   [0m | [0m 0.8257  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:47<00:00, 15.94s/it]


Vector size: 138298
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.54it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 79.91it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.30s/it]


| [0m 67      [0m | [0m 0.0     [0m | [0m 0.4083  [0m | [0m 0.619   [0m | [0m 0.3114  [0m | [0m 0.3908  [0m | [0m 0.5529  [0m | [0m 1.66e+05[0m | [0m 45.37   [0m | [0m 7.676   [0m | [0m 680.0   [0m | [0m 0.8261  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.57s/it]


Vector size: 122310
Transforming:


100%|██████████| 3/3 [00:00<00:00, 12.11it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 91.58it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.73s/it]


| [0m 68      [0m | [0m 0.02723 [0m | [0m 0.1816  [0m | [0m 0.572   [0m | [0m 0.04061 [0m | [0m 0.1067  [0m | [0m 0.2911  [0m | [0m 2.115e+0[0m | [0m 227.7   [0m | [0m 8.381   [0m | [0m 254.4   [0m | [0m 0.2546  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:48<00:00, 16.19s/it]


Vector size: 317934
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.42it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 77.21it/s]


Transforming:


100%|██████████| 3/3 [00:41<00:00, 13.82s/it]


| [0m 69      [0m | [0m 0.04703 [0m | [0m 0.5935  [0m | [0m 0.2127  [0m | [0m 0.1782  [0m | [0m 0.3913  [0m | [0m 0.8924  [0m | [0m 3.46e+05[0m | [0m 415.5   [0m | [0m 3.568   [0m | [0m 146.7   [0m | [0m 0.1656  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:51<00:00, 17.03s/it]


Vector size: 235633
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.75it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 73.73it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.63s/it]


| [0m 70      [0m | [0m 0.002475[0m | [0m 0.5439  [0m | [0m 0.2183  [0m | [0m 0.4356  [0m | [0m 0.3447  [0m | [0m 0.5501  [0m | [0m 2.398e+0[0m | [0m 307.9   [0m | [0m 4.328   [0m | [0m 635.9   [0m | [0m 0.5434  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.81s/it]


Vector size: 189063
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.22it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 80.29it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.35s/it]


| [0m 71      [0m | [0m 0.009901[0m | [0m 0.1368  [0m | [0m 0.512   [0m | [0m 0.04574 [0m | [0m 0.3439  [0m | [0m 0.4284  [0m | [0m 3.919e+0[0m | [0m 231.1   [0m | [0m 5.491   [0m | [0m 941.7   [0m | [0m 0.3282  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:50<00:00, 16.80s/it]


Vector size: 317744
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.91it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 78.97it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.45s/it]


| [95m 72      [0m | [95m 0.07178 [0m | [95m 0.603   [0m | [95m 0.8462  [0m | [95m 0.4238  [0m | [95m 0.963   [0m | [95m 0.3563  [0m | [95m 2.424e+0[0m | [95m 6.153   [0m | [95m 2.856   [0m | [95m 431.7   [0m | [95m 0.131   [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.56s/it]


Vector size: 168684
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.25it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 82.79it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.28s/it]


| [0m 73      [0m | [0m 0.0     [0m | [0m 0.9559  [0m | [0m 0.9824  [0m | [0m 0.2533  [0m | [0m 0.5811  [0m | [0m 0.9323  [0m | [0m 1.174e+0[0m | [0m 170.5   [0m | [0m 4.899   [0m | [0m 224.6   [0m | [0m 0.9569  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.54s/it]


Vector size: 189256
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.66it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 71.98it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.60s/it]


| [0m 74      [0m | [0m 0.0     [0m | [0m 0.5296  [0m | [0m 0.1281  [0m | [0m 0.3233  [0m | [0m 0.9105  [0m | [0m 0.6901  [0m | [0m 4.204e+0[0m | [0m 75.85   [0m | [0m 5.614   [0m | [0m 415.2   [0m | [0m 0.6169  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.47s/it]


Vector size: 198150
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.45it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 70.86it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.43s/it]


| [0m 75      [0m | [0m 0.007426[0m | [0m 0.5316  [0m | [0m 0.4871  [0m | [0m 0.1554  [0m | [0m 0.1681  [0m | [0m 0.8707  [0m | [0m 1.385e+0[0m | [0m 110.6   [0m | [0m 3.298   [0m | [0m 993.5   [0m | [0m 0.4419  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.55s/it]


Vector size: 264474
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.48it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 75.88it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.59s/it]


| [0m 76      [0m | [0m 0.0     [0m | [0m 0.9736  [0m | [0m 0.3016  [0m | [0m 0.3128  [0m | [0m 0.4889  [0m | [0m 0.7939  [0m | [0m 2.049e+0[0m | [0m 125.0   [0m | [0m 3.422   [0m | [0m 416.9   [0m | [0m 0.6387  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.38s/it]


Vector size: 137988
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.05it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 87.57it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.92s/it]


| [0m 77      [0m | [0m 0.0     [0m | [0m 0.5787  [0m | [0m 0.02431 [0m | [0m 0.1555  [0m | [0m 0.9569  [0m | [0m 0.3102  [0m | [0m 1.853e+0[0m | [0m 409.6   [0m | [0m 7.909   [0m | [0m 628.0   [0m | [0m 0.5564  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.37s/it]


Vector size: 138349
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.63it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 82.12it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.23s/it]


| [0m 78      [0m | [0m 0.0     [0m | [0m 0.07075 [0m | [0m 0.2582  [0m | [0m 0.4281  [0m | [0m 0.1925  [0m | [0m 0.6416  [0m | [0m 4.993e+0[0m | [0m 322.1   [0m | [0m 7.174   [0m | [0m 324.9   [0m | [0m 0.8769  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.50s/it]


Vector size: 431674
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.17it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 74.83it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.82s/it]


| [0m 79      [0m | [0m 0.0198  [0m | [0m 0.4337  [0m | [0m 0.7745  [0m | [0m 0.3136  [0m | [0m 0.5936  [0m | [0m 0.1095  [0m | [0m 3.573e+0[0m | [0m 318.8   [0m | [0m 2.49    [0m | [0m 427.5   [0m | [0m 0.3271  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.43s/it]


Vector size: 470568
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.30it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 79.01it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.78s/it]


| [0m 80      [0m | [0m 0.002475[0m | [0m 0.7486  [0m | [0m 0.3043  [0m | [0m 0.3527  [0m | [0m 0.02006 [0m | [0m 0.1318  [0m | [0m 3.96e+05[0m | [0m 23.98   [0m | [0m 2.588   [0m | [0m 1.021e+0[0m | [0m 0.5897  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:48<00:00, 16.28s/it]


Vector size: 107875
Transforming:


100%|██████████| 3/3 [00:00<00:00, 12.25it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 88.80it/s]


Transforming:


100%|██████████| 3/3 [00:35<00:00, 11.99s/it]


| [0m 81      [0m | [0m 0.0     [0m | [0m 0.1207  [0m | [0m 0.1955  [0m | [0m 0.07372 [0m | [0m 0.1927  [0m | [0m 0.03631 [0m | [0m 1.295e+0[0m | [0m 461.5   [0m | [0m 9.989   [0m | [0m 43.65   [0m | [0m 0.5526  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.40s/it]


Vector size: 122828
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.70it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 77.46it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.95s/it]


| [0m 82      [0m | [0m 0.04208 [0m | [0m 0.4501  [0m | [0m 0.5975  [0m | [0m 0.2442  [0m | [0m 0.5249  [0m | [0m 0.9538  [0m | [0m 4.484e+0[0m | [0m 486.9   [0m | [0m 8.003   [0m | [0m 734.8   [0m | [0m 0.2898  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.54s/it]


Vector size: 245118
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.47it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 74.45it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.49s/it]


| [0m 83      [0m | [0m 0.0     [0m | [0m 0.4115  [0m | [0m 0.1216  [0m | [0m 0.3614  [0m | [0m 0.911   [0m | [0m 0.69    [0m | [0m 1.695e+0[0m | [0m 109.2   [0m | [0m 2.783   [0m | [0m 514.5   [0m | [0m 0.9592  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.58s/it]


Vector size: 120897
Transforming:


100%|██████████| 3/3 [00:00<00:00, 12.52it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 92.78it/s]


Transforming:


100%|██████████| 3/3 [00:36<00:00, 12.07s/it]


| [0m 84      [0m | [0m 0.0     [0m | [0m 0.1725  [0m | [0m 0.07458 [0m | [0m 0.0874  [0m | [0m 0.07784 [0m | [0m 0.06502 [0m | [0m 1.712e+0[0m | [0m 205.3   [0m | [0m 8.855   [0m | [0m 615.2   [0m | [0m 0.2677  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:47<00:00, 15.92s/it]


Vector size: 377382
Transforming:


100%|██████████| 3/3 [00:00<00:00, 10.20it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 72.34it/s]


Transforming:


100%|██████████| 3/3 [00:40<00:00, 13.52s/it]


| [0m 85      [0m | [0m 0.0     [0m | [0m 0.8289  [0m | [0m 0.1422  [0m | [0m 0.3289  [0m | [0m 0.2713  [0m | [0m 0.9429  [0m | [0m 2.428e+0[0m | [0m 257.4   [0m | [0m 1.026   [0m | [0m 217.6   [0m | [0m 0.6509  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:49<00:00, 16.54s/it]


Vector size: 394465
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.01it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 74.51it/s]


Transforming:


100%|██████████| 3/3 [00:39<00:00, 13.14s/it]


| [0m 86      [0m | [0m 0.01733 [0m | [0m 0.04424 [0m | [0m 0.7572  [0m | [0m 0.03205 [0m | [0m 0.3565  [0m | [0m 0.4114  [0m | [0m 2.602e+0[0m | [0m 86.22   [0m | [0m 1.273   [0m | [0m 1.016e+0[0m | [0m 0.2043  [0m |
Fitting vectorizers:


  "The parameter 'token_pattern' will not be used"
100%|██████████| 3/3 [00:48<00:00, 16.12s/it]


Vector size: 111186
Transforming:


100%|██████████| 3/3 [00:00<00:00, 11.38it/s]


Transforming:


100%|██████████| 3/3 [00:00<00:00, 83.39it/s]


Transforming:


100%|██████████| 3/3 [00:38<00:00, 12.71s/it]


### Train

In [16]:
pifa_concat = LabelEmbeddingFactory.create(train_question_paper_csr, 
                                           train_questions_csr, 
                                           Z=papers_csr, 
                                           method="pifa_lf_concat")

In [17]:
cluster_chain = Indexer.gen(pifa_concat, 
                            indexer_type="hierarchicalkmeans",
                            nr_splits=64)

In [18]:
xlm = XLinearModel.train(train_questions_csr, 
                         train_question_paper_csr, 
                         C=cluster_chain)

### Validation

In [19]:
predicted_question_paper_csr = xlm.predict(valid_questions_csr,
                                           beam_size=100,
                                           only_topk=10)

In [20]:
metric = smat_util.Metrics.generate(valid_question_paper_csr, 
                                    predicted_question_paper_csr, 
                                    topk=10)
print(metric)

prec   = 8.42 6.19 5.12 4.46 3.96 3.59 3.22 3.03 2.89 2.82
recall = 3.06 3.78 4.10 4.48 4.64 4.86 4.97 5.13 5.33 5.61


### Checking

In [21]:
def get_papers_by_text(text, 
                       vectorizer,
                       number_of_papers=10, 
                       check_correctness=False):
      
    print('Qestion: ', text)
    vector_text_csr = vectorizer.transform([text])

    papers_csr = xlm.predict(vector_text_csr,
                             beam_size=4,
                             only_topk=number_of_papers)

    prediction = papers_csr.toarray()
    paper_labels = np.nonzero(prediction)[1].tolist()

    # print()
    # print('Found papers: ', len(paper_labels))
    # print(paper_labels)

    paper_logists = np.take(prediction, paper_labels).tolist()

    result = papers_df.loc[paper_labels, ['title', 'abstract', 'snippet_url']]
    result['score'] = paper_logists
    result = result.sort_values('score', ascending=False)

    if check_correctness:
        correct_papers = df.loc[df.question_text == text, 'paper_id'].unique()
        print(f'\nTrue number of papers: {len(correct_papers)}')
        result['correctness'] = result.index.isin(correct_papers)

    return result

In [22]:
correct_examples = [
    'Which genome browser database for DNA shape annotations is available?',
    'Has strimvelis been approved by the European Medicines Agency?',
    'What is the target of Volanesorsen?',
    'Is eptinezumab a small molecule?'
]

interesting_examples = [
    'For which type of diabetes can empagliflozin be used?',
    "Is Semagacestat effective for Alzheimer's Disease?",
    'Which gene is associated with Muenke syndrome?'     
]

In [None]:
get_papers_by_text('Which gene is associated with Muenke syndrome?',
                   vectorizer=vectorizer, 
                   number_of_papers=30,
                   check_correctness=True)

In [None]:
# Random example
temp = valid_questions_df.question_text.sample().values[0]
print(temp)

get_papers_by_text(temp,
                   vectorizer=vectorizer,
                   number_of_papers=30,
                   check_correctness=True)

### Highlight answers

In [None]:
question_text = 'Which genome browser database for DNA shape annotations is available?'

In [None]:
result = get_papers_by_text(
    question_text,
    vectorizer=vectorizer, 
    number_of_papers=30,
    check_correctness=True
)

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("ozcangundes/T5-base-for-BioQA")
model = T5ForConditionalGeneration.from_pretrained("ozcangundes/T5-base-for-BioQA")

def get_answer(question, context):
    source_encoding=tokenizer(
        question,
        context,
        max_length=512,
        padding="max_length",
        truncation="only_second",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt")

    generated_ids=model.generate(
        input_ids=source_encoding["input_ids"],
        attention_mask=source_encoding["attention_mask"])

    preds=[tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for gen_id in generated_ids]
    
    return "".join(preds)

# To highlight the answers in the results
def highlight_selected_text(row):
    abstract = row["abstract"]
    title = row["title"]
    ext = []
    ext.append(row["abstract_answer"])
    ext.append(row['title_answer'])
    # for k, v in color.items():
    for k in ext:
      if len(k) > 2:
        abstract = abstract.replace(k, f'<span style="color: red; background-color: yellow; font-weight: bold">{k}</span>')
        title = title.replace(k, f'<span style="color: red; background-color: yellow; font-weight: bold">{k}</span>')
    return abstract, title

In [None]:
result['abstract_answer'] = result['abstract'].apply(lambda x: get_answer(question_text, x))
result['title_answer'] = result['title'].apply(lambda x: get_answer(question_text, x))

In [None]:
from IPython.core.display import HTML
result['abstract'], result['title'] = zip(*result.apply(highlight_selected_text, axis=1))
display(HTML(result.sample(10).to_html(escape=False)))