In [None]:
__author__ = 'Nick Sarris (ngs5st)'

import gc
import os
import pickle
import random
import shutil
import sys
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertAdam, BertConfig, BertForSequenceClassification, BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel, BertPreTrainedModel
from tqdm._tqdm_notebook import tqdm_notebook as tqdm

# Hook tqdm into pandas so pandas operations can report progress.
tqdm.pandas()
# Sanity check: show what is available in the data directory. This raises
# if ./data/ does not exist relative to the notebook's working directory.
print(os.listdir("./data/"))

In [None]:
def seed_everything(seed=1235):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and asks cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Recorded for child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one. This is a silent
    # no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, which would
    # undermine the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything(1235)

In [None]:
start_time = time.time()
print("Establishing Global Variables ...")

# Data Directory
directory = './data/'

# Torch Device: prefer the GPU, but fall back to the CPU so the notebook can
# still be executed on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model Parameters
max_length = 220
batch_size = 64
n_epochs = 2
accumulation_steps = 1

# Model/Split Seed/Parameters
# Change model_seed with every new/different model
# NOTE(review): an earlier comment here referred to a `split_seed`, but no
# such variable is defined in this cell; only `current_split` exists.
model_seed = 1234
current_split = 0

# Model File Paths
TRAIN_FILE = directory + 'train.csv'
TEST_FILE  = directory + 'test.csv'
PROCESSED_FILE = 'train_seq.pickle'

# Directory/BERT Paths
WORK_DIR = directory
BERT_MODEL_PATH = directory + 'uncased_L-12_H-768_A-12/'
# NOTE(review): the conversion below writes 'pytorch_model.bin', not
# 'bert_pytorch_model.bin' -- confirm which file later cells expect to load.
BERT_WEIGHT_PATH = 'bert_pytorch_model.bin'

# Convert the TensorFlow BERT checkpoint into a PyTorch weight file inside
# WORK_DIR. This runs on every execution of the cell.
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    BERT_MODEL_PATH + 'bert_model.ckpt',
    BERT_MODEL_PATH + 'bert_config.json',
    WORK_DIR + 'pytorch_model.bin')

# Keep the config next to the converted weights, and load it for later use.
shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')
bert_config = BertConfig(BERT_MODEL_PATH + 'bert_config.json')

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
def convert_lines(example, max_seq_length,tokenizer):
    """Tokenize texts and encode them as fixed-length rows of token ids.

    Each text is tokenized, truncated to ``max_seq_length - 2`` tokens,
    wrapped in ``[CLS]`` / ``[SEP]`` markers, converted to ids and
    right-padded with ``0`` so that every row has ``max_seq_length`` entries.

    Args:
        example: Iterable of texts to encode.
        max_seq_length: Length of every output row, including the two
            marker tokens.
        tokenizer: Object providing ``tokenize(text)`` (returning a list of
            tokens) and ``convert_tokens_to_ids(tokens)``.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(example), max_seq_length)``.
        An empty ``example`` gives an array of shape ``(0, max_seq_length)``.

    Raises:
        ValueError: If ``max_seq_length`` is too small to hold the markers.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to fit [CLS] and [SEP]")

    # Two positions are reserved for the [CLS] and [SEP] markers.
    max_tokens = max_seq_length - 2
    all_tokens = []

    for text in tqdm(example):
        # Slicing is a no-op for texts that already fit.
        tokens_a = tokenizer.tokenize(text)[:max_tokens]
        token_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"])
        padding = [0] * (max_tokens - len(tokens_a))
        all_tokens.append(token_ids + padding)

    # The explicit dtype and reshape keep the result a 2-D integer array even
    # when no texts were supplied.
    return np.array(all_tokens, dtype=int).reshape(-1, max_seq_length)

In [None]:
start_time = time.time()
print("Processing Data ...")

bert_train = pd.read_csv(TRAIN_FILE)
# Fill missing comments *before* casting to str: once cast, NaN has already
# become the literal string 'nan' and fillna no longer has anything to fill.
bert_train['comment_text'] = bert_train['comment_text'].fillna("DUMMY_VALUE").astype(str)
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)
# Encode every comment as a fixed-length row of BERT token ids.
sequences = convert_lines(bert_train["comment_text"], max_length, tokenizer)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data ...")

# Persist the encoded sequences so later runs can skip tokenization. The
# file name matches the PROCESSED_FILE constant from the configuration cell.
# The context manager closes the file even if pickling fails part-way.
with open("train_seq.pickle", "wb") as pickle_out:
    pickle.dump(sequences, pickle_out)