## Load Data

In [6]:
import pandas as pd

In [12]:
import numpy as np

In [7]:
# Root folder (with trailing slash) under which the input data lives.
PATH = 'data/'

In [8]:
# Read the three competition files from the Kaggle folder under PATH,
# instead of repeating the hard-coded 'data/' prefix in every call.
KAGGLE_DIR = PATH + 'kaggle/'
train = pd.read_csv(KAGGLE_DIR + 'train.csv')
test = pd.read_csv(KAGGLE_DIR + 'test.csv')
sample = pd.read_csv(KAGGLE_DIR + 'sample_submission.csv')

In [9]:
# Mean of every numeric column of the training frame.
# `select_dtypes` is the public pandas API; the previous `_get_numeric_data`
# is a private helper that may change between pandas releases.
train.select_dtypes(include='number').mean()

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64

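The training classes are highly imbalanced (for example, only about 0.3% of the comments are labelled `threat`), so we need to keep this in mind when training and evaluating the model.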

## Data processing

### CSV File Preprocessing

In [10]:
# Need to replace the newlines in the comments with spaces so that torchtext works properly

In [11]:
# Put every training comment on a single line: newlines become spaces.
train['comment_text'] = train['comment_text'].str.replace('\n', ' ')

In [14]:
# One position per training row (0 .. n_rows - 1).
idx = np.arange(len(train))

In [16]:
# Seed NumPy's global random generator so the shuffle of `idx` is reproducible.
np.random.seed(999)

In [17]:
# Shuffle the row positions in place; `idx` itself is modified.
np.random.shuffle(idx)

In [19]:
# Number of rows held out for validation: 20% of the data, rounded down.
val_size = int(0.2 * len(idx))

In [22]:
import os

# Create the cache folder for the preprocessed CSV files.
# `exist_ok=True` keeps this cell re-runnable: unlike `mkdir`, it does not
# complain when the folder is already there, and it needs no shell.
os.makedirs('cache', exist_ok=True)

In [23]:
# Rows after the first `val_size` shuffled positions form the training split.
# `idx` holds row positions, so select with `.iloc`; `.loc` would treat them as
# index labels and only works while `train` has a default 0..n-1 index.
train.iloc[idx[val_size:]].to_csv('cache/train.csv', index=False)

In [24]:
# The first `val_size` shuffled positions form the validation split.
# `idx` holds row positions, so select with `.iloc`; `.loc` would treat them as
# index labels and only works while `train` has a default 0..n-1 index.
train.iloc[idx[:val_size]].to_csv('cache/validation.csv', index=False)

In [25]:
# Same clean-up as for the training data: newlines in test comments become spaces.
test['comment_text'] = test.comment_text.str.replace('\n', ' ')

In [26]:
# Save the cleaned test set next to the train/validation files (row index not written).
test.to_csv('cache/test.csv',index=False)

### Tokenization

In [70]:
import re
import spacy
# spaCy pipeline; only its tokenizer is used by `tokenizer` below.
NLP = spacy.load('en')
# Cleaned comments longer than this many characters are truncated before tokenization.
MAX_CHARS = 10000


def tokenizer(comment):
    """Clean a raw comment and split it into a list of token strings.

    Steps:
      1. Replace quotes, brackets, operators, newlines and similar symbols
         (including ``!``) with a space.
      2. Collapse runs of spaces, commas and question marks to one character.
      3. Truncate the cleaned text to ``MAX_CHARS`` characters.
      4. Tokenize with spaCy and drop whitespace-only tokens.

    Args:
        comment: The raw comment. It is passed through ``str()`` first, so
            non-string values are tokenized by their string form.

    Returns:
        list of str: The token texts, in order of appearance.
    """
    comment = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!\<\>;]", " ",
        str(comment))
    comment = re.sub(r"[ ]+", " ", comment)
    # No separate step collapses "!!!": every "!" was already replaced by a
    # space in the first substitution, so such a step could never match.
    comment = re.sub(r"\,+", ",", comment)
    comment = re.sub(r"\?+", "?", comment)
    if len(comment) > MAX_CHARS:
        comment = comment[:MAX_CHARS]
    # spaCy never yields empty tokens but does yield whitespace-only ones
    # (e.g. for a leading space or a tab), so filter on stripped text rather
    # than comparing against ''.
    return [tok.text for tok in NLP.tokenizer(comment) if tok.text.strip()]

### Load the dataset
Here we will load data from cache files and process all the data.

In [71]:
import torch
from torchtext import data

In [72]:
from torchtext import *

In [73]:
??data.TabularDataset

In [106]:
# Settings for the text field and vocabulary, one per line.
fix_length = 100   # passed to the comment Field as `fix_length`
lower = False      # passed to the comment Field as `lower`
vectors = True

In [107]:
# Text field for the `comment_text` column: tokenized with the custom
# `tokenizer`, using the `fix_length` / `lower` settings defined earlier,
# padded at the front, and materialised as CUDA long tensors.
comment = data.Field(
    sequential=True,
    tokenize=tokenizer,
    lower=lower,
    fix_length=fix_length,
    pad_first=True,
    tensor_type=torch.cuda.LongTensor,
)

In [108]:
# Field shared by all label columns: non-sequential, no vocabulary,
# materialised as CUDA byte tensors.
labels = data.Field(
    sequential=False,
    use_vocab=False,
    tensor_type=torch.cuda.ByteTensor,
)

In [109]:
# Load the cached train/validation CSV files as torchtext datasets.
#
# With a list of fields, torchtext assigns fields to CSV columns by POSITION,
# so the list must follow the file's column order exactly:
#   id, comment_text, toxic, severe_toxic, obscene, threat, insult, identity_hate
# The `toxic` entry was previously missing, which shifted every label by one
# column (e.g. `severe_toxic` received the `toxic` values).
train,val = data.TabularDataset.splits(
    path='cache/', format='csv', skip_header=True,
    train='train.csv', validation='validation.csv',
    fields=[
        ('id', None),                 # column is ignored
        ('comment_text', comment),
        ('toxic', labels),
        ('severe_toxic', labels),
        ('obscene', labels),
        ('threat', labels),
        ('insult', labels),
        ('identity_hate', labels)
    ]
)

In [163]:
# Load the cached test CSV: the `id` column is ignored and only the comment
# text is kept, processed by the same `comment` field as the training data.
test = data.TabularDataset(
    path='cache/test.csv',
    format='csv',
    skip_header=True,
    fields=[('id', None), ('comment_text', comment)],
)

### Build vocabulary

In [111]:
# Build the vocabulary of the comment field from the train, validation and test datasets.
# NOTE(review): a later cell calls build_vocab again with the same arguments plus
# GloVe vectors, which appears to supersede this call — confirm whether this one is still needed.
comment.build_vocab(train,val,test,max_size=10000,min_freq=50)

## Create Batches and Iterate Through dataset

In [115]:
# Build the vocabulary from all three datasets and attach the pretrained
# "glove.6B.100d" vectors to it.
comment.build_vocab(
    train, val, test,
    max_size=10000,
    min_freq=50,
    vectors="glove.6B.100d",
)

.vector_cache/glove.6B.zip: 862MB [02:27, 5.85MB/s]                             
100%|██████████| 400000/400000 [00:14<00:00, 27018.53it/s]


In [None]:
# Number of entries in the vocabulary built above.
# (The cell previously ended in a dangling `.`, which is a SyntaxError.)
len(comment.vocab)

In [None]:
# We can also randomly generate embedding and build the vocabulary

In [None]:
# Iterator over the training dataset: batches of 64 on device 0,
# shuffled, unsorted, and not repeating.
dataset_iter = data.Iterator(
    train,
    batch_size=64,
    device=0,
    train=True,
    shuffle=True,
    sort=False,
    repeat=False,
)

In [None]:
# Label attributes stacked into the target tensor, in this order.
# NOTE(review): `toxic` is not among them — confirm whether that is intentional.
target_columns = ('severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')

# Walk over the training batches, extracting the input and the targets.
for examples in dataset_iter:
    x = examples.comment_text
    # Stack the per-label tensors along dim 1.
    y = torch.stack(
        [getattr(examples, column) for column in target_columns],
        dim=1)

In [175]:
# Iterator over the test dataset: one batch holding all of `test`
# (batch_size=len(test)), on device 0, in file order (no shuffle, no sort).
test_iter = data.Iterator(
    test,
    batch_size=len(test),
    device=0,
    train=False,
    shuffle=False,
    sort=False,
    repeat=False,
)

In [None]:
# Walk over the test batches; after the loop `x` holds the comment tensor of the last batch.
for examples in test_iter:
    x=examples.comment_text

## Build Model

In [152]:
dataset_iter

<torchtext.data.iterator.Iterator at 0x7f222f724cf8>

## Train model