In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import time
import datetime
import gc
import random
import re
import operator
import pickle
from tqdm import tqdm
import pkg_resources
import scipy.stats as stats
import sys


import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,TensorDataset,Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.optim.optimizer import Optimizer

# Stemmers: import the three NLTK stemmer classes first, then build one
# shared instance of each (Porter, Lancaster, and Snowball for English).
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer("english")

# IPython setup: load the autoreload extension and switch it to mode 2
# (imported modules are re-imported before each cell runs), and render
# matplotlib figures inline in the notebook output.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import shutil

# Prefer the GPU, but fall back to the CPU when CUDA is unavailable so that
# cells which move tensors/models to `device` still run on a CPU-only kernel.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()
# Timestamp taken when this cell runs
# (presumably used later to measure total run time — confirm where t1 is read).
t1 = datetime.datetime.now()

In [2]:
# --- Run settings -----------------------------------------------------------
# NOTE(review): SEED and EPOCHS are only defined here; confirm where they are applied.
SEED = 42
EPOCHS = 1

# --- Data description -------------------------------------------------------
# Every comment is truncated/padded to this many tokens by convert_lines.
MAX_SEQUENCE_LENGTH = 200
# Name of the label column in train.csv.
TOXICITY_COLUMN = 'target'
# Row count of train.csv (matches the shape printed after loading).
total_train_size = 1804874

# --- Paths ------------------------------------------------------------------
Input_dir = "../input"
Data_dir="../input/jigsaw-unintended-bias-in-toxicity-classification"
WORK_DIR = "../working/"

In [3]:
# Make the bundled copy of the pytorch-pretrained-BERT repo importable.
# Upstream project: https://github.com/huggingface/pytorch-pretrained-BERT
package_dir_a = "../input/gpt2-pytorch/pytorch-pretrained-bert-master/pytorch-pretrained-BERT-master/"
# Prepend (index 0) so this copy is found before any other installed version.
sys.path.insert(0, package_dir_a)

# Import the GPT-2 tokenizer and classification-head model from that package.
# NOTE(review): GPT2ClassificationHeadModel is presumably specific to the copy
# at package_dir_a — confirm it exists there before relying on it.
from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
from pytorch_pretrained_bert.modeling_gpt2 import GPT2ClassificationHeadModel

In [4]:
%%time
# Load the competition data. Both CSVs are resolved from Data_dir so the
# dataset location is configured in exactly one place.
train = pd.read_csv(os.path.join(Data_dir, "train.csv"))
test = pd.read_csv(os.path.join(Data_dir, "test.csv"))
# Quick sanity check: frame size plus the first few rows.
print(train.shape)
display(train.head())

(1804874, 45)


Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:45.222647+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:47.601894+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-29 10:50:48.488476+00,2,,2006,rejected,0,0,0,1,0,0.0,4,47


CPU times: user 18.6 s, sys: 6.22 s, total: 24.8 s
Wall time: 25 s


In [5]:
# Local directory the GPT-2 tokenizer is loaded from.
GPT_MODEL_PATH = "../input/gpt2-models/"
# Build the GPT-2 tokenizer from that directory; cache_dir=None is passed explicitly.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=GPT_MODEL_PATH,cache_dir=None)


# Converting the comment texts to fixed-length GPT-2 token-id sequences.
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
# By default, convert_lines reads `tokenizer` and `MAX_SEQUENCE_LENGTH` as global variables.
def convert_lines(x, max_len=None, pad_id=0, tok=None):
    """Encode one text as a fixed-length array of token ids.

    The text is tokenized, truncated to its LAST ``max_len`` tokens when it
    is too long, converted to ids, and right-padded with ``pad_id``.

    Parameters
    ----------
    x : str
        Text to encode.
    max_len : int, optional
        Output length. Defaults to the module-level ``MAX_SEQUENCE_LENGTH``.
    pad_id : int, optional
        Id used for right-padding. Defaults to 0, as before.
        NOTE(review): 0 is presumably an ordinary vocabulary id for GPT-2
        rather than a dedicated padding id — confirm that downstream code
        masks or tolerates it.
    tok : object, optional
        Tokenizer exposing ``tokenize`` and ``convert_tokens_to_ids``.
        Defaults to the module-level ``tokenizer``.

    Returns
    -------
    numpy.ndarray
        1-D array with exactly ``max_len`` entries.

    Raises
    ------
    ValueError
        If ``max_len`` is negative.
    """
    # Resolve defaults lazily so calling with only `x` behaves as before.
    if max_len is None:
        max_len = MAX_SEQUENCE_LENGTH
    if tok is None:
        tok = tokenizer
    if max_len < 0:
        raise ValueError("max_len must be non-negative, got %r" % (max_len,))

    tokens = tok.tokenize(x)
    overflow = len(tokens) - max_len
    if overflow > 0:
        # Keep the tail of the text. Slicing from `overflow` rather than
        # `-max_len` also behaves correctly when max_len is 0.
        tokens = tokens[overflow:]

    ids = list(tok.convert_tokens_to_ids(tokens))
    ids.extend([pad_id] * (max_len - len(tokens)))
    return np.array(ids)

In [6]:
# Coerce comment_text to str in both frames so the tokenizer always gets text.
for frame in (train, test):
    frame['comment_text'] = frame['comment_text'].astype(str)
# Serial alternative (about 35 minutes for the full training data):
# sequences = np.array(train['comment_text'].progress_apply(convert_lines).tolist())

from joblib import Parallel, delayed

In [7]:
%%time
# Encode every training comment across 4 worker processes.
# An earlier note said ~14 min; the run recorded below took ~5 min wall time.
sequences = np.array(
    Parallel(n_jobs=4, backend="multiprocessing")(
        map(delayed(convert_lines), train['comment_text'])
    )
)
print(sequences.shape)

(1804874, 200)
CPU times: user 42.3 s, sys: 16.9 s, total: 59.2 s
Wall time: 5min 16s


In [8]:
# Peek at the first encoded comment: token ids followed by zero padding.
sequences[0]

array([ 1212,   318,   523,  3608,    13,   632,   338,   588,    11,
         705, 19188,   345,   765,   534,  2802,   284,  1100,   428,
        3548,     6, 16123,  1049,  2126,    11,   880,  1760,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [9]:
# Persist the encoded training matrix; the sequence length is part of the
# file name so arrays built with different lengths do not overwrite each other.
# np.save appends ".npy" when the name has no such extension.
file_name = "".join(["lines_array_max_length_", str(MAX_SEQUENCE_LENGTH)])
print(sequences.shape)
np.save(file_name, sequences)

(1804874, 200)
