In [1]:
import tensorflow as tf

# Ask TensorFlow which GPU device (if any) it can see.
device_name = tf.test.gpu_device_name()

# Fail fast: anything other than the first GPU's name is treated as "no GPU".
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [3]:
import torch

# Pick the device used by PyTorch for the rest of the notebook.
if torch.cuda.is_available():

  # Tell PyTorch to use the GPU
  device = torch.device('cuda')

  print('There are %d GPU(s) available.' % torch.cuda.device_count())
  print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
  # Fall back to the CPU so that `device` is always defined; without this
  # branch any later use of `device` raises NameError on a machine without CUDA.
  print('No GPU available, using the CPU instead.')
  device = torch.device('cpu')

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
!pip install transformers

In [10]:
# `import urllib` alone does not load the `request` submodule; import it
# explicitly so `urllib.request.urlretrieve` works on a fresh kernel.
import urllib.request
import os

# Create the download directory; exist_ok makes the cell safe to re-run.
os.makedirs('./data/', exist_ok=True)

# (local path, source URL) pairs for the two dataset files.
files = [
         ('./data/attack_annotated_comments.tsv', 'https://ndownloader.figshare.com/files/7554634'),
         ('./data/attack_annotations.tsv',        'https://ndownloader.figshare.com/files/7554637')
]

for (filename, url) in files:
  # Only download files that are not already present locally.
  if not os.path.exists(filename):
    print('Downloading:', filename)
    urllib.request.urlretrieve(url, filename)
    print('Done!')

In [11]:
import pandas as pd

# Load the two tab-separated files. The comments table uses its first column
# as the index; the annotations table keeps the default integer index.
print('Parsing the dataset .tsv file...')
comments = pd.read_csv(
    './data/attack_annotated_comments.tsv',
    sep='\t',
    index_col=0,
)
annotations = pd.read_csv(
    './data/attack_annotations.tsv',
    sep='\t',
)
print('Done!')

Parsing the dataset .tsv file...
Done!


In [12]:
# Preview the first rows of the comments table.
comments.head()

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train
44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,False,article,random,train
49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,False,article,random,train
89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
93890,This page will need disambiguation.,2002,True,article,random,train


In [13]:
# Number of non-null comments in each value of the 'split' column.
comments.groupby('split')[['comment']].count()

Unnamed: 0_level_0,comment
split,Unnamed: 1_level_1
dev,23160
test,23178
train,69526


In [14]:
# A revision is labelled True when the mean of its 'attack' annotations
# exceeds 0.5 (i.e. strictly more than half, assuming 0/1 annotations).
labels = annotations.groupby('rev_id')['attack'].mean().gt(0.5)

# Attach the labels as a new column; pandas aligns the Series on the index.
comments['attack'] = labels

In [15]:
# Replace the dataset's NEWLINE_TOKEN / TAB_TOKEN placeholders with a space.
# The vectorised .str accessor (regex=False => literal match) replaces the
# per-row lambda and leaves missing values as-is instead of raising
# AttributeError on a non-string entry.
comments['comment'] = (
    comments['comment']
    .str.replace('NEWLINE_TOKEN', ' ', regex=False)
    .str.replace('TAB_TOKEN', ' ', regex=False)
)

In [17]:
# Select the training and test rows using the 'split' column.
train_comments = comments[comments['split'] == 'train']
test_comments = comments[comments['split'] == 'test']

In [20]:
import textwrap
import random

# Wrap long comments to 80 columns when printing.
wrapper = textwrap.TextWrapper(width=80)

# Text of the training comments whose 'attack' label is True.
attack_examples = train_comments.query('attack')['comment']

# Print ten randomly chosen attack comments (drawn with replacement),
# each surrounded by a blank line.
for sample_no in range(10):
  rev_id = random.choice(attack_examples.index)
  print('\n' + wrapper.fill(attack_examples[rev_id]) + '\n')


 ::::JUDENSCWEIN TO THE OVENS !  LIARS ! ALL OF YOU !


  :: Go Fuck yourself Shell, I couldn't care less what a fucked up waste of
space like you thinks. Since when could an editor not edit articles on
Wikipedia? It seems you need to kiss a lot of arseholes to make a difference
here now.


  who died and made you god


  == Don't hide the incriminating photographs you Nazi Americans!!!!!! ==  If
the photos are graphic its because the horrible deeds committed by the
deranges,sadistic perverted soldiers are also equally disturing.I think the
Yanks should stop trying to cover up these issues.


  go fuck yourself you fucking bitch


 Anal leakage ban hammer?


`== Wonderfull... this lesbian feminist ``nontheist christian`` ``fisherqueen``
still moderates wikipedia... == They should ban such feeble brain vertebrates
like her from public life. Or she should get a husband and listen to him
carefuly  it would mitigate the damage which she does in society. The less you
do, the more are you u

In [21]:
from transformers import BertTokenizer

# Load the tokenizer for 'bert-base-uncased', lower-casing its input.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [25]:
# Truncation length used for this preview.
max_tokens = 512

# Tokenize the first training comment.
text = train_comments.iloc[0].comment
tokens = tokenizer.tokenize(text)

# Split into the part that would be kept and the part that would be cut off.
# Slicing (instead of subtracting) keeps the "remaining" count at 0 rather
# than going negative when the comment has fewer than max_tokens tokens.
kept_tokens = tokens[:max_tokens]
dropped_tokens = tokens[max_tokens:]

# print out the list of tokens and see what we truncate
print('==== First {:,} tokens: ====\n'.format(max_tokens))
print(wrapper.fill(' '.join(kept_tokens)))
print('')
print('\n==== Remaining {:,} tokens: ====\n'.format(len(dropped_tokens)))
print(wrapper.fill(' '.join(dropped_tokens)))

==== First 512 tokens: ====

` - this is not ` ` creative ` ` . those are the dictionary definitions of the
terms ` ` insurance ` ` and ` ` en ##sur ##ance ` ` as properly applied to ` `
destruction ` ` . if you don ' t understand that , fine , legitimate criticism ,
i ' ll write up ` ` three man cell ` ` and ` ` bounty hunter ` ` and then it
will be easy to understand why ` ` ensured ` ` and ` ` ins ##ured ` ` are
different - and why both differ from ` ` assured ` ` . the sentence you quote is
absolutely neutral . you just aren ' t familiar with the underlying theory of
strike - back ( e . g . submarines as employed in nuclear warfare ) guiding the
insurance , nor likely the three man cell structure that kept the ira from being
broken by the british . if that ' s my fault , fine , i can fix that to explain
. but the ##r ' es nothing ` ` personal ` ` or ` ` creative ` ` about it . i ' m
tired of arguing with you . re : the other article , ` ` multi - party ` ` turns
up plenty , and the

In [None]:
# A few ways to deal with this problem are truncation and chunking

import numpy as np

# Token-ID list for every training comment, in dataset order.
input_ids = []

print('Tokenizing comments...')
for sent in train_comments.comment:
  # Report progress before every 20,000th comment (including the first).
  if len(input_ids) % 20000 == 0:
    print('Read {:,} comments.'.format(len(input_ids)))

  # 'encode' will:
  # (1) tokenize the sentence
  # (2) prepend the [CLS] token to the start
  # (3) append the [SEP] token to the end
  # (4) map tokens to their IDs
  # max_length and return_tensors are deliberately not passed here.
  input_ids.append(tokenizer.encode(sent, add_special_tokens=True))

# Number of token IDs in each encoded comment, aligned with input_ids.
lengths = [len(ids) for ids in input_ids]

print('Done!')
print('{:>10,} comments'.format(len(input_ids)))