In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import time
import datetime
import gc
import random
import re
import operator
import pickle
from tqdm import tqdm
import pkg_resources
import scipy.stats as stats
import sys


import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,TensorDataset,Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.optim.optimizer import Optimizer

# Stemmers: import the three NLTK stemmer classes, then build one shared
# instance of each (Porter, Lancaster, Snowball/English).
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer("english")

%load_ext autoreload
%autoreload 2
%matplotlib inline

import shutil

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing at the first
# transfer to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Register tqdm's pandas integration (the `progress_apply` call kept as a
# commented-out alternative in the tokenisation cell depends on it).
tqdm.pandas()
# Timestamp taken when this setup cell runs.
# NOTE(review): presumably used later for elapsed-time reporting — confirm against later cells.
t1 = datetime.datetime.now()

In [2]:
# --- Filesystem layout ---
Input_dir = "../input"
Data_dir = "../input/jigsaw-unintended-bias-in-toxicity-classification"
WORK_DIR = "../working/"

# --- Tokenisation / training settings ---
MAX_SEQUENCE_LENGTH = 200    # word-piece tokens kept per comment, before [CLS]/[SEP] are added
SEED = 42
EPOCHS = 1

# --- Dataset sizing ---
total_train_size = 1804874   # number of rows in train.csv
num_to_load = 1000000        # train size, chosen to fit the time limit
valid_size = 97320           # validation size

# --- Column names ---
TOXICITY_COLUMN = 'target'

In [3]:
# Make the bundled BERT PyTorch package importable.
# Files come from: https://github.com/huggingface/pytorch-pretrained-BERT
package_dir_a = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
sys.path.insert(0, package_dir_a)  # front of sys.path so this copy is searched first

# Directory of the pre-trained checkpoint: BERT base, uncased.
BERT_MODEL_PATH = "../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/"

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam

In [4]:
%%time
# Read the full training CSV from Data_dir into memory.
train = pd.read_csv(os.path.join(Data_dir,"train.csv"))
print(train.shape)
# head() stays the final expression so the notebook renders a preview of the frame.
train.head()

(1804874, 45)
CPU times: user 15.5 s, sys: 3.56 s, total: 19.1 s
Wall time: 19.1 s


In [5]:
# BERT_MODEL_PATH points at the *uncased* BERT base checkpoint, so the
# tokenizer is told to lower-case its input.
tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL_PATH,
    cache_dir=None,
    do_lower_case=True,
)


# Converting the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
def convert_lines(x, max_len=None, tok=None):
    """Encode one comment as a fixed-length array of BERT token ids.

    The text is tokenised, truncated to ``max_len`` tokens, wrapped in
    ``[CLS]`` ... ``[SEP]`` and right-padded with zeros, so every result has
    exactly ``max_len + 2`` entries.

    Parameters
    ----------
    x : str
        Raw comment text.
    max_len : int, optional
        Maximum number of tokens to keep, not counting the two special tokens.
        Defaults to the global ``MAX_SEQUENCE_LENGTH``.
    tok : optional
        Object providing ``tokenize(text)`` (returning a list) and
        ``convert_tokens_to_ids(tokens)``. Defaults to the global ``tokenizer``.

    Returns
    -------
    numpy.ndarray
        1-D array holding ``max_len + 2`` token ids.
    """
    # Fall back to the notebook-level globals so existing one-argument calls
    # (including the parallel tokenisation cell) behave exactly as before.
    if max_len is None:
        max_len = MAX_SEQUENCE_LENGTH
    if tok is None:
        tok = tokenizer

    # Slicing is a no-op for short inputs, so no explicit length check is needed.
    tokens_a = tok.tokenize(x)[:max_len]

    token_ids = tok.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"])
    # Pad with id 0 up to the fixed width (presumably the [PAD] id in the BERT
    # vocabulary — confirm against the vocab file).
    padding = [0] * (max_len - len(tokens_a))

    return np.array(token_ids + padding)

In [6]:
# Make sure all comment_text values are strings, because convert_lines hands
# each value straight to the tokenizer.
# NOTE(review): astype(str) should turn any missing values into the literal
# text "nan" — confirm that is acceptable for this dataset.
train['comment_text'] = train['comment_text'].astype(str) 

# takes about 35 minutes for the total train data. 
# sequences = np.array(train['comment_text'].progress_apply(convert_lines).tolist())

from joblib import Parallel, delayed

In [7]:
%%time
# Tokenise every comment with convert_lines across 4 worker processes and
# stack the per-comment id arrays into one 2-D matrix.
# Takes roughly 14-16 min of wall time on the full training set.
sequences = np.array(
    Parallel(n_jobs=4, backend="multiprocessing")(
        delayed(convert_lines)(comment) for comment in train['comment_text']
    )
)
print(sequences.shape)

(1804874, 202)
CPU times: user 43.2 s, sys: 17.9 s, total: 1min 1s
Wall time: 16min 9s


In [8]:
# Persist the token-id matrix. The +2 in the file name accounts for the
# [CLS] and [SEP] tokens that convert_lines adds to every row.
print(sequences.shape)
file_name = "first_lines_array_max_length_{}".format(MAX_SEQUENCE_LENGTH + 2)
np.save(file_name, sequences)

(1804874, 202)
