In [1]:

import torch
torch.manual_seed(0)

<torch._C.Generator at 0x7f03d021dc10>

In [2]:
import pandas as pd
from os.path import dirname, join
import numpy as np

In [3]:
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [4]:
from transformers import AutoModelForMaskedLM,  AutoTokenizer

In [5]:

# model_name = '/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/checkpoint-40000'
# bert_model = AutoModelForMaskedLM.from_pretrained(model_name)

In [16]:
train_filename = 'train_text_imaging_only.txt'
valid_filename = 'valid_text_imaging_only.txt'
input_dir = 'gs://profile-notes/geekfest_files/unlabeled_data/'


In [17]:
train_data =  pd.read_csv(join(input_dir,train_filename) )
valid_data =  pd.read_csv(join(input_dir,valid_filename) )

In [18]:
train_data = train_data[train_data.text.str.len() > 50]
valid_data = valid_data[valid_data.text.str.len() > 50]

In [19]:
train_data.shape, valid_data.shape

((661524, 1), (79354, 1))

In [20]:
max_len = 512
bert_model_base = 'google/bert_uncased_L-12_H-768_A-12'
tokenizer = AutoTokenizer.from_pretrained(bert_model_base, truncation=False, padding=True, max_len=max_len)

In [21]:
def truncate(x, n):
    ret = [x[0]] + x[-n:]
    return ret

class TorchDataset(torch.utils.data.Dataset):
    def __init__(self, text, tokenizer, max_length):
        self.text = text
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __getitem__(self, idx):
        item = self.text[idx]
        # item_tokenized = self.tokenizer(item, truncation=True, padding=True, return_special_tokens_mask=True)
        item_tokenized = self.tokenizer.batch_encode_plus([item], max_length= self.max_length, padding='max_length',
                                                  truncation=False, return_special_tokens_mask=True, pad_to_max_length = True)

        trucated_tokens = {key: truncate(val[0], self.max_length - 1) for key, val in item_tokenized.items()}
        ret = {key: torch.tensor(val) for key, val in trucated_tokens.items()}
        
#         item_tokenized = self.tokenizer(item, truncation=True, padding='max_length', return_special_tokens_mask=True)
#         ret = {key: torch.tensor(val) for key, val in item_tokenized.items()}
        
#         print (idx, len(ret['input_ids']),len(ret['token_type_ids']), len(ret['attention_mask']), len(ret['special_tokens_mask']))
        return ret

    def __len__(self):
        return len(self.text)

In [22]:
input_dataset = TorchDataset(train_data['text'].values, tokenizer, max_length=max_len)
valid_dataset = TorchDataset(valid_data['text'].values, tokenizer, max_length=max_len)
small_dataset = TorchDataset(valid_data['text'].values[0:1000], tokenizer, max_length=max_len)

In [23]:
data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

In [73]:
def get_trainer(model_name):
    bert_model = AutoModelForMaskedLM.from_pretrained(model_name)
    
    output_dir = '.'
    logging_dir = '.'
    per_device_train_batch_size = 8
    per_device_eval_batch_size = 8
    epoch_steps = round(train_data.shape[0]/per_device_train_batch_size) #roughly 1 epoch

    training_args = TrainingArguments(
        output_dir= output_dir,
        logging_dir =logging_dir,  
        overwrite_output_dir=False,
        num_train_epochs=10,
        per_device_train_batch_size= per_device_train_batch_size,
        per_device_eval_batch_size =per_device_eval_batch_size, 
        save_steps = epoch_steps , 
    #         save_steps=40_000,

        do_eval = False,
        do_train = False,
        evaluation_strategy='steps',
        eval_steps = epoch_steps
    #         save_strategy = 'epoch'
    #         save_total_limit=2,
    #         max_steps=100
    )

    trainer = Trainer(
        model=bert_model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=input_dataset,
        eval_dataset = valid_dataset
    #     prediction_loss_only=True,
    )
    return trainer

In [60]:
# output_dir = '.'
# logging_dir = '.'
# per_device_train_batch_size = 8
# epoch_steps = round(train_data.shape[0]/per_device_train_batch_size) #roughly 1 epoch

# training_args = TrainingArguments(
#     output_dir= output_dir,
#     logging_dir =logging_dir,  
#     overwrite_output_dir=False,
#     num_train_epochs=10,
#     per_device_train_batch_size= per_device_train_batch_size,
#     save_steps = epoch_steps , 
# #         save_steps=40_000,

#     do_eval = True,
#     evaluation_strategy='steps',
#     eval_steps = epoch_steps
# #         save_strategy = 'epoch'
# #         save_total_limit=2,
# #         max_steps=100
# )

# trainer = Trainer(
#     model=bert_model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=input_dataset,
#     eval_dataset = valid_dataset
# #     prediction_loss_only=True,
# )

In [26]:
len(input_dataset)

661524

In [27]:
len(valid_dataset)

79354

In [None]:
# input_dataset[10,]

In [None]:
%%time
train_metrics = trainer.evaluate(valid_dataset)

In [23]:
train_metrics

{'eval_loss': 0.5041759610176086}

In [28]:
import os

In [70]:
# rootdir = '/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output'
rootdir = '/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output'

In [71]:
def get_mode_names():
    model_names = []
    for file in os.listdir(rootdir):
        d = os.path.join(rootdir, file)
        if os.path.isdir(d):
            print(d)
            model_names.append(d)
    
    return sorted(model_names)

In [72]:
model_names= get_mode_names()

/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output/checkpoint-80000
/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output/checkpoint-40000
/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output/checkpoint-360000
/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output/checkpoint-160000
/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output/checkpoint-760000
/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output/checkpoint-640000
/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output/checkpoint-280000
/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output/checkpoint-720000
/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output/checkpoint-800000
/home/haithamelmarakeby/pretrained_models_truncated/base/train/output/output/checkpoint-240000
/home/haithamelmarakeby/pretrained_models_truncated/

In [74]:
%%time
train_metrics = []
for model_path in model_names:
    model_name = os.path.basename(model_path)
    trainer = get_trainer(model_path)
    train_metric = trainer.evaluate(small_dataset)
    train_metric['model'] = model_name
    print(model_name, train_metric)
    train_metrics.append(train_metric)

checkpoint-120000 {'eval_loss': 0.4237876534461975, 'model': 'checkpoint-120000'}


checkpoint-160000 {'eval_loss': 0.40515416860580444, 'model': 'checkpoint-160000'}


checkpoint-200000 {'eval_loss': 0.39496347308158875, 'model': 'checkpoint-200000'}


checkpoint-240000 {'eval_loss': 0.38409170508384705, 'model': 'checkpoint-240000'}


checkpoint-280000 {'eval_loss': 0.3742303252220154, 'model': 'checkpoint-280000'}


checkpoint-320000 {'eval_loss': 0.3672785758972168, 'model': 'checkpoint-320000'}


checkpoint-360000 {'eval_loss': 0.35976532101631165, 'model': 'checkpoint-360000'}


checkpoint-40000 {'eval_loss': 0.5019106268882751, 'model': 'checkpoint-40000'}


checkpoint-400000 {'eval_loss': 0.3530645966529846, 'model': 'checkpoint-400000'}


checkpoint-440000 {'eval_loss': 0.34659722447395325, 'model': 'checkpoint-440000'}


checkpoint-480000 {'eval_loss': 0.3409278094768524, 'model': 'checkpoint-480000'}


checkpoint-520000 {'eval_loss': 0.3328251838684082, 'model': 'checkpoint-520000'}


checkpoint-560000 {'eval_loss': 0.3300130367279053, 'model': 'checkpoint-560000'}


checkpoint-600000 {'eval_loss': 0.32388362288475037, 'model': 'checkpoint-600000'}


checkpoint-640000 {'eval_loss': 0.318400114774704, 'model': 'checkpoint-640000'}


checkpoint-680000 {'eval_loss': 0.3134744167327881, 'model': 'checkpoint-680000'}


checkpoint-720000 {'eval_loss': 0.3109282851219177, 'model': 'checkpoint-720000'}


checkpoint-760000 {'eval_loss': 0.3076251447200775, 'model': 'checkpoint-760000'}


checkpoint-80000 {'eval_loss': 0.45577412843704224, 'model': 'checkpoint-80000'}


checkpoint-800000 {'eval_loss': 0.3041076064109802, 'model': 'checkpoint-800000'}
CPU times: user 13min 53s, sys: 4min 33s, total: 18min 27s
Wall time: 20min 7s


In [46]:
df = pd.DataFrame(train_metrics)

In [53]:
df.to_csv('base_scores_small.csv')

In [7]:
for file in os.listdir(rootdir):
    d = os.path.join(rootdir, file)
    if os.path.isdir(d):
        print(d)

/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output/checkpoint-79352
/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output/checkpoint-89271
/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output/checkpoint-59514
/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output/checkpoint-99190
/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output/checkpoint-29757
/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output/checkpoint-9919
/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output/checkpoint-39676
/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output/checkpoint-49595
/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output/checkpoint-69433
/home/haithamelmarakeby/pretrained_models_truncated_revision/tiny/validation/output/checkpoi