
## Installing transformers and tokenizers libraries

In [1]:
!pip install tokenizers
!pip install transformers

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 2.8MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.7.0
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 2.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 13.5MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/

## Importing dependencies

In [2]:
from transformers import *
import numpy as np
import tensorflow as tf
import keras
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import random
from google.colab import drive
import tensorflow.keras.backend as K
import sentencepiece as spm
from tensorflow.keras.optimizers import Adam
import pickle
import math

Using TensorFlow backend.


## Mounting gdrive (if you are using colab)

In [0]:
drive.mount('/content/gdrive')

## Importing tokenizer and xlnet

In [0]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased' ,
                                           do_lower_case = True)
xlnet = TFXLNetModel.from_pretrained('xlnet-base-cased')

In [0]:
%cd '/content/gdrive'

## Reading training and test data

In [0]:
train_dir = '/content/gdrive/My Drive/tweet sentiment/train.csv'
test_dir = '/content/gdrive/My Drive/tweet sentiment/test.csv'
train = pd.read_csv(train_dir).fillna(' ')
test = pd.read_csv(test_dir).fillna(' ')
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [0]:
test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


## priniting random tokenized sentence

In [0]:
k = random.randrange(train.shape[0])
example = train.loc[k ,'text']
enc = tokenizer.encode(example)
print('statement is \'{}\''.format(example))
print('encoding is {}'.format(enc))
sentence = ''
for en in enc:
    token = tokenizer._convert_id_to_token(en)
    print('{} : {}'.format(token ,en))
    if token != '<sep>' and token != '<cls>':
        sentence = sentence + token
sentence = sentence.replace('▁' ," ").strip()
print(sentence)

statement is ' Ryaaaaaaaaaaaaan  http://bit.ly/SnjEn'
encoding is [17, 844, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 262, 4538, 4315, 2802, 9, 111, 167, 23, 16603, 254, 4, 3]
▁ : 17
ry : 844
a : 101
a : 101
a : 101
a : 101
a : 101
a : 101
a : 101
a : 101
a : 101
a : 101
a : 101
a : 101
an : 262
▁http : 4538
:// : 4315
bit : 2802
. : 9
ly : 111
/ : 167
s : 23
nj : 16603
en : 254
<sep> : 4
<cls> : 3
ryaaaaaaaaaaaaan http://bit.ly/snjen


In [0]:
tokenizer.encode('<cls>')

[3, 4, 3]

In [0]:
PAD_ID = 5
SEED = 88888
LABEL_SMOOTHING = 0.1
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [0]:
MAX_LEN = 100
special_token = {'<sep>':4 ,'<cls>':3 ,'<pad>':5}
positive = tokenizer.encode('positive')[:-2]
negative = tokenizer.encode('negative')[:-2]
neutral = tokenizer.encode('neutral')[:-2]
sent_tokens  ={'positive': positive ,'negative':negative ,'neutral' : neutral}
print(sent_tokens['positive'])

[1654]


## Making up training data

In [0]:
ct = train.shape[0]
input_ids = np.zeros((ct ,MAX_LEN))
attention_mask = np.zeros((ct ,MAX_LEN))
token_type_ids = np.zeros((ct ,MAX_LEN))
start_tokens = np.zeros((ct ,MAX_LEN))
end_tokens = np.zeros((ct ,MAX_LEN))
counter = 0
for k in range(ct):
    text = train.loc[k ,'text']
    text = " " + " ".join(text.split())
    selected_text = train.loc[k ,'selected_text']
    selected_text = " ".join(selected_text.split())
    sent = train.loc[k ,'sentiment']
    text_enc = tokenizer.encode(text)[:-2]
    selected_enc = tokenizer.encode(selected_text)[:-2]
    idx = text.find(selected_text)
    chars = np.zeros((len(text)))
    chars[idx:idx+len(selected_text)] = 1
    if text[idx-1] == ' ':chars[idx-1] =1
    
    offsets = []
    idx = 0
    for en in text_enc:
        token = tokenizer._convert_id_to_token(en)
        offsets.append((idx ,idx+len(token)))
        idx += len(token)
    toks = []
    for i ,(a ,b) in enumerate(offsets):
        if np.sum(chars[a : b]) >0 :
            toks.append(i)
    sp = special_token['<sep>']
    cl = special_token['<cls>']
    pad = special_token['<pad>']
    enc_final =  [cl] + sent_tokens[sent] + [sp ,sp] + text_enc + [sp] 
    input_ids[k ,:] = enc_final + (MAX_LEN - len(enc_final))*[PAD_ID]
    attention_mask[k ,:] = len(enc_final)*[1] + (MAX_LEN-len(enc_final))*[0]
    token_type_ids[k ,3:len(enc_final)] = 1
    
    if len(toks)>0:
        start_tokens[k ,toks[0]+4] = 1
        end_tokens[k ,toks[-1]+4] = 1 
    if k == 2:
        print(start_tokens[k])
        print(end_tokens[k])
        print(text)
        print(input_ids[k])
        print(selected_text)
        a = np.argmax(start_tokens[k])
        b = np.argmax(end_tokens[k])
        man = tokenizer.encode(text)[:-2]
        print(tokenizer.decode(man[a-4:b-3]))

[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
 my boss is bullying me...
[3.0000e+00 2.9810e+03 4.0000e+00 4.0000e+00 9.4000e+01 5.6430e+03
 2.7000e+01 2.3175e+04 1.1000e+02 9.0000e+00 9.0000e+00 9.0000e+00
 4.0000e+00 5.0000e+00 5.0000e+00 5.0000e+00 5.0000e+00 5.0000e+00
 5.0000e+00 5.0000e+00 5.0000e+00 5.0000e+00 5.0000e+00 5.0000e+00
 5.0000e+00 5.0000e+00 5.0000e+00 5.0000e+00 5.0000e+00 5.0000e+00
 5.0000e+00 5.0000e+00 5.0

In [0]:
print(train.loc[314 ,'text'])
print(input_ids[314])

 
[3.000e+00 9.201e+03 4.000e+00 4.000e+00 4.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00 5.000e+00
 5.0

## Making up test data

In [0]:
ct_t = test.shape[0]
input_ids_t = np.zeros((ct_t ,MAX_LEN))
attention_mask_t = np.zeros((ct_t ,MAX_LEN))
token_type_ids_t = np.zeros((ct_t ,MAX_LEN))
for k in range(ct_t):
    text = test.loc[k ,'text']
    text = " " + " ".join(text.split())
    sent = test.loc[k ,'sentiment']
    text_enc = tokenizer.encode(text)[:-2]
    sp = special_token['<sep>']
    cl = special_token['<cls>']
    pad = special_token['<pad>']
    enc_final = [cl] + sent_tokens[sent] + [sp ,sp] + text_enc + [sp] 
    input_ids_t[k ,:] = enc_final + (MAX_LEN - len(enc_final))*[PAD_ID]
    attention_mask_t[k ,:] = len(enc_final)*[1] + (MAX_LEN-len(enc_final))*[0]
    token_type_ids_t[k ,3:len(enc_final)] = 1

## Building Model , loss ,metric and some helper functions

In [0]:
def save_weights(model, dst_fn):
    weights = model.get_weights()
    with open(dst_fn, 'wb') as f:
        pickle.dump(weights, f)


def load_weights(model, weight_fn):
    with open(weight_fn, 'rb') as f:
        weights = pickle.load(f)
    model.set_weights(weights)
    return model

def loss_fn(y_true, y_pred):
    # adjust the targets for sequence bucketing
    ll = tf.shape(y_pred)[1]
    y_true = y_true[:, :ll]
    loss = tf.keras.losses.binary_crossentropy(y_true, y_pred,
        from_logits=False, label_smoothing=LABEL_SMOOTHING)
    loss = tf.reduce_mean(loss)
    return loss
def loss(gamma):
    def myloss(y_true ,y_pred):
        ll = tf.shape(y_pred)[1]
        y_true = y_true[:, :ll]
        loss = ((1-y_pred)**gamma)*y_true*K.log(y_pred) + (y_pred**gamma)*(1-y_true)*K.log(1-y_pred)
        return -tf.reduce_mean(loss)
    return myloss
def build_model():
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased' ,do_lower_case = True)
    xlnet = TFXLNetModel.from_pretrained('xlnet-base-cased')
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    padding = tf.cast(tf.equal(ids, PAD_ID), tf.int32)
    lens = MAX_LEN - tf.reduce_sum(padding, -1)
    max_len = tf.reduce_max(lens)
    ids_ = ids[:, :max_len]
    att_ = att[:, :max_len]
    tok_ = tok[:, :max_len]
    x = xlnet(ids_,attention_mask=att_,token_type_ids=tok_)
    
    x1 = tf.keras.layers.Dropout(0.1)(x[0])
    x1 = tf.keras.layers.Conv1D(768, 2,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Conv1D(768, 2,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Activation('softmax')(x1)
    
    x2 = tf.keras.layers.Dropout(0.1)(x[0]) 
    x2 = tf.keras.layers.Conv1D(768, 2,padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Conv1D(768, 2,padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)
    model = tf.keras.models.Model(inputs = [ids ,att ,tok] ,outputs = [x1 ,x2])
    print('here1')
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5) 
    model.compile(loss=loss(1.5), optimizer=optimizer)
    print('here2')
    # this is required as `model.predict` needs a fixed size!
    x1_padded = tf.pad(x1, [[0, 0], [0, MAX_LEN - max_len]], constant_values=0.0)
    x2_padded = tf.pad(x2, [[0, 0], [0, MAX_LEN - max_len]], constant_values=0.0)
    print('here3')
    padded_model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1_padded,x2_padded])
    return model ,padded_model

In [0]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    if (len(a)==0) & (len(b)==0): return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

## Training Phase

In [0]:
jac = []; VER='v0'; DISPLAY=1 # USE display=1 FOR INTERACTIVE
oof_start = np.zeros((input_ids.shape[0],MAX_LEN))
oof_end = np.zeros((input_ids.shape[0],MAX_LEN))
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))
EPOCHS = 4
LABEL_SMOOTHING = 0.1
BATCH_SIZE = 32
skf = StratifiedKFold(n_splits=5,shuffle=True ,random_state=SEED) #originally 5 splits
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)
    
    K.clear_session()
    model, padded_model = build_model()
        
    #sv = tf.keras.callbacks.ModelCheckpoint(
    #    '%s-roberta-%i.h5'%(VER,fold), monitor='val_loss', verbose=1, save_best_only=True,
    #    save_weights_only=True, mode='auto', save_freq='epoch')
    inpT = [input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]]
    targetT = [start_tokens[idxT,], end_tokens[idxT,]]
    inpV = [input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]]
    targetV = [start_tokens[idxV,], end_tokens[idxV,]]
    # sort the validation data
    shuffleV = np.int32(sorted(range(len(inpV[0])), key=lambda k: (inpV[0][k] == PAD_ID).sum(), reverse=True))
    inpV = [arr[shuffleV] for arr in inpV]
    targetV = [arr[shuffleV] for arr in targetV]
    weight_fn = '%s-XLNet-%i.h5'%(VER,fold)
    for epoch in range(1, EPOCHS + 1):
        # sort and shuffle: We add random numbers to not have the same order in each epoch
        shuffleT = np.int32(sorted(range(len(inpT[0])), key=lambda k: (inpT[0][k] == 5.0).sum() + np.random.randint(-3, 3), reverse=True))
        # shuffle in batches, otherwise short batches will always come in the beginning of each epoch
        num_batches = math.ceil(len(shuffleT) / BATCH_SIZE)
        batch_inds = np.random.permutation(num_batches)
        shuffleT_ = []
        for batch_ind in batch_inds:
            shuffleT_.append(shuffleT[batch_ind * BATCH_SIZE: (batch_ind + 1) * BATCH_SIZE])
        shuffleT = np.concatenate(shuffleT_)
        # reorder the input data
        inpT = [arr[shuffleT] for arr in inpT]
        targetT = [arr[shuffleT] for arr in targetT]
        model.fit(inpT, targetT, 
            epochs=epoch, initial_epoch=epoch - 1, batch_size=BATCH_SIZE, verbose=DISPLAY, callbacks=[],
            validation_data=(inpV, targetV), shuffle=False)  # don't shuffle in `fit`
        save_weights(model, '/content/gdrive/My Drive/' + weight_fn)

    print('Loading model...')
    # model.load_weights('%s-roberta-%i.h5'%(VER,fold))
    load_weights(model, '/content/gdrive/My Drive/' + weight_fn)

    print('Predicting OOF...')
    oof_start[idxV,],oof_end[idxV,] = padded_model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],verbose=DISPLAY)
    
    print('Predicting Test...')
    preds = padded_model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    preds_start += preds[0]/skf.n_splits
    preds_end += preds[1]/skf.n_splits
    
    # DISPLAY FOLD JACCARD
    all = []
    for k in idxV:
        a = np.argmax(oof_start[k,])
        b = np.argmax(oof_end[k,])
        if a>b: 
            st = train.loc[k,'text'] # IMPROVE CV/LB with better choice here
        else:
            text1 = " "+" ".join(train.loc[k,'text'].split())
            enc = tokenizer.encode(text1)[:-2]
            st = tokenizer.decode(enc[a-4:b-3])
        all.append(jaccard(st,train.loc[k,'selected_text']))
    jac.append(np.mean(all))
    print('>>>> FOLD %i Jaccard ='%(fold+1),np.mean(all))
    print()

#########################
### FOLD 1
#########################
here1
here2
here3
Epoch 2/2
Epoch 3/3
Epoch 4/4
Loading model...
Predicting OOF...
Predicting Test...
>>>> FOLD 1 Jaccard = 0.7018650389643601

#########################
### FOLD 2
#########################
here1
here2
here3
118/688 [====>.........................] - ETA: 1:30 - loss: 0.1445 - activation_loss: 0.0771 - activation_1_loss: 0.0673

In [0]:
print(jac)
print('mean jac {}'.format(sum(jac)/len(jac)))

[0.6985422242177061, 0.7037288150573154, 0.708220014204697, 0.6951575595107675, 0.6999276029364913]
mean jac 0.7011152431853954


## Evaluation

In [0]:
all = []
for k in range(input_ids_t.shape[0]):
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    if a>b: 
        st = test.loc[k,'text']
    else:
        text1 = " "+" ".join(test.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        st = tokenizer.decode(enc[a-2:b-1])
    all.append(st)

In [0]:
test['selected_text'] = all
test[['textID','selected_text']].to_csv('/content/gdrive/My Drive/tweet sentiment/submission.csv',index=False)
pd.set_option('max_colwidth', 60)
test.sample(25)

Unnamed: 0,textID,text,sentiment,selected_text
866,41a96b7954,me because I might not have enough money for college!!!,negative,i might not have enough money for college!!!
2123,3da6270b02,I hide my berry like a slave REGULARLY only today I was...,neutral,i hide my berry like a slave regularly only today i was ...
3355,32cde6dbc5,in school w. linda doing nothing ;i miss you,negative,miss you
2599,12ceb7565a,Outlook not so good,negative,not so good
1169,7240ccb4b2,"True to form, Bank Holiday Monday looks like it might be...",positive,hope
2857,16fd300910,Ive been passed out drunk for the passed couple of hours...,neutral,ive been passed out drunk for the passed couple of hours...
2071,e0264b2a5f,I miss you,negative,i miss you
415,52ad93b3fe,Oh No!!!! I must be gettin old!!!! My mom use to watch t...,negative,miss
1768,6c18eab109,I cant change my profile picture on Facebook,neutral,i cant change my profile picture on facebook
323,5aa8a5280f,I knooww & my hot water bottle iss in whangamata witho...,neutral,i knooww & my hot water bottle iss in whangamata without...


In [0]:
test.sample(25)

Unnamed: 0,textID,text,sentiment,selected_text
3428,e4c90cafae,"I`m mo nudge you again, better watch out!",negative,better watch out!
1220,aa221b6a7d,"hey, what about us followers in ATL!!!!",neutral,"hey, what about us followers in atl!!!!"
2611,e2c48f0201,"Slept in, woke up with an iced coffee, lazed about & wen...",neutral,"slept in, woke up with an iced coffee, lazed about & wen..."
3111,c780b84d97,"yea - it`s mostly b/c I couldn`t sleep, but oh well, lu...",neutral,"yea - it`s mostly b/c i couldn`t sleep, but oh well, lun..."
3292,a35314257a,Edgefest! or maybe since you`re driving...,neutral,edgefest! or maybe since you`re driving...
605,9a0fce6d7e,learning how to use twitter,neutral,learning how to use twitter
2151,94e0366b6d,http://twitpic.com/4u5h8 - leon looks supa` fly on that...,positive,leon looks supa`
2857,16fd300910,Ive been passed out drunk for the passed couple of hours...,neutral,ive been passed out drunk for the passed couple of hours...
2031,b303da170c,I am going to die tomorrow night. should be here.,negative,die
300,8f0bafc9dc,"In effect, your podcast IS the audio version of your ...",neutral,"in effect, your podcast is the audio version of your boo..."
