In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import transformers as tf
from tqdm import tqdm
from transformers_interpret import SequenceClassificationExplainer

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def load_model_pipeline(username, prefix, model_name, device='cpu'):
    """Create a text-classification pipeline for one hosted model.

    The model identifier handed to ``tf.pipeline`` is assembled as
    ``{username}/{prefix}-{model_name}``.

    Args:
        username: First path component of the model identifier.
        prefix: Text placed before the hyphen in the second component.
        model_name: Text placed after the hyphen in the second component.
        device: Forwarded unchanged to ``tf.pipeline``; defaults to ``'cpu'``.

    Returns:
        The object returned by ``tf.pipeline``, created with
        ``return_all_scores=True``.
    """
    model_id = f'{username}/{prefix}-{model_name}'
    return tf.pipeline(
        task='text-classification',
        model=model_id,
        return_all_scores=True,
        device=device,
    )

def load_data(data_folder):
    """Read the pickled train and test frames and stack them into one.

    Args:
        data_folder: Directory containing ``train.pkl`` and ``test.pkl``.

    Returns:
        A DataFrame with the train rows followed by the test rows. The
        original row labels are kept, so index values may repeat.

    Note:
        Unpickling can execute arbitrary code; only point this at files
        from a trusted source.
    """
    splits = [pd.read_pickle(f'{data_folder}/{name}.pkl') for name in ('train', 'test')]
    return pd.concat(splits)

def clean_data(df : pd.DataFrame, p : 'tf.Pipeline', na_repl='blank', max_tokens=256, min_tokens=5):
    """Fill missing comments and keep only rows whose token count is in range.

    The input frame is left untouched: the filled ``comment`` column and the
    new ``tf_toklen`` column are written to a fresh frame, so re-running a
    cell that calls this function on the same object gives the same result.

    Args:
        df: Frame with a ``comment`` column.
        p: Object exposing a ``tokenizer`` callable that accepts a list of
            strings and returns a mapping with an ``'input_ids'`` entry
            (one sequence per input string).
        na_repl: Replacement text for missing comments.
        max_tokens: Rows with more than this many token ids are dropped.
        min_tokens: Rows with this many token ids or fewer are dropped
            (the comparison is strict: ``tf_toklen > min_tokens``).

    Returns:
        A new DataFrame containing the surviving rows, with ``comment``
        filled and an added ``tf_toklen`` column holding the length of each
        comment's ``input_ids`` as returned by the tokenizer.
    """
    print(f'Original df size: {df.shape}')

    print(f'Filling NAN in comment with "{na_repl}"')
    comments = df['comment'].fillna(na_repl)

    if len(comments):
        encoded = p.tokenizer(comments.tolist())
        tok_lens = [len(ids) for ids in encoded['input_ids']]
    else:
        # Nothing to tokenize for an empty frame; skip the tokenizer call.
        tok_lens = []

    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    cleaned = df.assign(comment=comments, tf_toklen=tok_lens)

    cleaned = cleaned[cleaned['tf_toklen'] <= max_tokens]
    print(f'Size after filtering comments longer than {max_tokens} tokens: {cleaned.shape}')

    cleaned = cleaned[cleaned['tf_toklen'] > min_tokens]
    print(f'Size after filtering comments shorter than {min_tokens} tokens: {cleaned.shape}')

    return cleaned

def get_word_importances(comments : list[str], p : tf.Pipeline, relative_to: str):
    """Collect per-word attribution scores across a list of comments.

    One ``SequenceClassificationExplainer`` is built from ``p.model`` and
    ``p.tokenizer`` and called on every comment with
    ``class_name=relative_to``. Each item it yields is unpacked as a
    ``(word, score)`` pair.

    Args:
        comments: Texts to explain, processed in order with a tqdm bar.
        p: Object providing the ``model`` and ``tokenizer`` attributes.
        relative_to: Class name passed to the explainer as ``class_name``.

    Returns:
        A dict mapping each word to the list of scores it received, in the
        order they were encountered.
    """
    explainer = SequenceClassificationExplainer(p.model, p.tokenizer)

    scores_by_word = {}
    for text in tqdm(comments):
        for word, attribution in explainer(text, class_name=relative_to):
            # Start a new list the first time a word is seen, then append.
            scores_by_word.setdefault(word, []).append(attribution)
    return scores_by_word

def save_word_importances(word_imports : dict, path : str):
    """Pickle a word-importance mapping to ``path``.

    The parent directory of ``path`` is created if it does not exist, so a
    missing output folder does not throw away results that were expensive
    to compute.

    Args:
        word_imports: Mapping to serialise.
        path: Destination file; an existing file is overwritten.
    """
    print(f'Saving word importances to {path}')
    parent = os.path.dirname(path)
    if parent:
        # A bare file name has no parent component; os.makedirs('') would raise.
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(word_imports, f)


In [9]:
# Build the 'qual' pipeline first, then load the data and filter it by the
# token counts produced by that pipeline's tokenizer.
p = load_model_pipeline(username='maxspad', prefix='nlp-qual', model_name='qual')
df = clean_data(load_data('../data/processed/'), p, max_tokens=256, min_tokens=5)



Original df size: (2500, 46)
Filling NAN in comment with "blank"
Size after filtering comments longer than 256 tokens: (2453, 47)
Size after filtering comments longer than 5 tokens: (2296, 47)


In [7]:
# Number of token ids per comment under the current pipeline's tokenizer;
# the trailing expression displays the summary statistics.
tok_lens = pd.Series(
    list(map(len, p.tokenizer(df['comment'].tolist())['input_ids']))
)
tok_lens.describe()

count    2453.000000
mean       42.207501
std        45.608177
min         3.000000
25%        14.000000
50%        25.000000
75%        50.000000
max       254.000000
dtype: float64

In [3]:
# Configuration for the run. `models` and `reference_labels` are parallel
# lists: entry i of `reference_labels` holds the class label(s) explained for
# entry i of `models`.
models = ['q1','q2i','q3i','qual']
reference_labels = [['LABEL_2'],['LABEL_0'],['LABEL_0'],['LABEL_1','LABEL_3','LABEL_5']]
data_dir = '../data/processed/'
# 256 is the limit the loop has always passed to clean_data (previously as a
# literal, while this variable said 255 and was never read).
max_tokens = 256
save_dir = '../results/word_importances'

# zip() silently truncates to the shorter list, so catch a mismatch up front.
assert len(models) == len(reference_labels), 'models and reference_labels must pair up one-to-one'

# Make sure the output folder exists before starting the long computation.
os.makedirs(save_dir, exist_ok=True)

for model, ref_labs in zip(models, reference_labels):
    print('#'*80)
    print(f'Calculating word importances for model {model}')

    p = load_model_pipeline('maxspad','nlp-qual',model)

    # Reload and re-filter per model: token counts come from this model's tokenizer.
    df = load_data(data_dir)
    df = clean_data(df, p, max_tokens=max_tokens)

    for ref_lab in ref_labs:
        print(f'Reference label {ref_lab}')
        word_imports = get_word_importances(df.comment.tolist(), p, ref_lab)
        save_word_importances(word_imports, f'{save_dir}/{model}_{ref_lab}.pkl')

    print('\n')


################################################################################
Calculating word importances for model q1




Original df size: (2500, 46)
Filling NAN in comment with "blank"
Size after filtering comments longer than 256 tokens: (2453, 47)
Reference label LABEL_2


  0%|          | 0/2453 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 2453/2453 [1:01:56<00:00,  1.51s/it]


Saving word importances to ../results/word_importances/q1_LABEL_2.pkl


################################################################################
Calculating word importances for model q2i




Original df size: (2500, 46)
Filling NAN in comment with "blank"
Size after filtering comments longer than 256 tokens: (2453, 47)
Reference label LABEL_0


100%|██████████| 2453/2453 [1:01:12<00:00,  1.50s/it]


Saving word importances to ../results/word_importances/q2i_LABEL_0.pkl


################################################################################
Calculating word importances for model q3i




Original df size: (2500, 46)
Filling NAN in comment with "blank"
Size after filtering comments longer than 256 tokens: (2453, 47)
Reference label LABEL_0


100%|██████████| 2453/2453 [57:44<00:00,  1.41s/it]  


Saving word importances to ../results/word_importances/q3i_LABEL_0.pkl


################################################################################
Calculating word importances for model qual




Original df size: (2500, 46)
Filling NAN in comment with "blank"
Size after filtering comments longer than 256 tokens: (2453, 47)
Reference label LABEL_1


100%|██████████| 2453/2453 [59:32<00:00,  1.46s/it]  


Saving word importances to ../results/word_importances/qual_LABEL_1.pkl
Reference label LABEL_3


100%|██████████| 2453/2453 [59:26<00:00,  1.45s/it]  


Saving word importances to ../results/word_importances/qual_LABEL_3.pkl
Reference label LABEL_5


100%|██████████| 2453/2453 [1:00:39<00:00,  1.48s/it]

Saving word importances to ../results/word_importances/qual_LABEL_5.pkl





