In [None]:
!pip install transformers

## Importing dependencies

In [None]:
from transformers import *
import numpy as np
import tensorflow as tf
import keras
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import random
import tensorflow.keras.backend as K
import sentencepiece as spm
from tensorflow.keras.optimizers import Adam

## Reading train csv file

In [None]:
# Load the training tweets and replace every missing value with a single space.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
train = train.fillna(' ')
train.head()

## Loading XLNet and ALBERT
I am using the same tokenizer for both models because both were trained with a SentencePiece tokenizer. There may be minor differences between the two, but overall I hope this will not create a big mess.

In [None]:
# XLNet: configuration, tokenizer and pretrained TF weights, all read from local files.
config = XLNetConfig.from_pretrained(
    '../input/xlnet-base-tf/xlnet-base-cased-config.json'
)
tokenizer = XLNetTokenizer.from_pretrained(
    '../input/xlnet-base-tf/xlnet-base-cased-spiece.model',
    do_lower_case=True,
)
xlnet = TFXLNetModel.from_pretrained(
    '../input/xlnet-base-tf/xlnet-base-cased-tf_model.h5',
    config=config,
)

In [None]:
# ALBERT: configuration and pretrained TF weights read from local files.
# Note that `config` is rebound here and no longer refers to the XLNet configuration.
config = AlbertConfig.from_pretrained(
    '../input/albert-base-v2-tf2/config.json'
)
albert = TFAlbertModel.from_pretrained(
    '../input/albert-base-v2-tf2/tf_model.h5',
    config=config,
)

### Print a random tokenized sentence

In [None]:
# Pick a random training tweet, show its encoding token by token, then rebuild
# the sentence from the token strings (skipping '<sep>' and '<cls>').
row = random.randrange(len(train))
example = train.loc[row, 'text']
enc = tokenizer.encode(example)
print('statement is \'{}\''.format(example))
print('encoding is {}'.format(enc))

pieces = []
for token_id in enc:
    token = tokenizer._convert_id_to_token(token_id)
    print('{} : {}'.format(token, token_id))
    if token not in ('<sep>', '<cls>'):
        pieces.append(token)

# '▁' is the marker the token strings carry in place of a space.
sentence = ''.join(pieces).replace('▁', " ").strip()
print(sentence)

In [None]:
# Display the ids the tokenizer returns for the literal string '<pad>' (shown as the cell output).
tokenizer.encode('<pad>')

## Setting parameters

In [None]:
# Fixed sequence length used for every model input.
MAX_LEN = 100

# Hard-coded ids of the special tokens used when assembling the inputs.
special_token = {'<sep>': 4, '<cls>': 3, '<pad>': 5}

# Token ids of each sentiment word. [:-2] drops the last two ids returned by
# encode() -- presumably the trailing special tokens; confirm against the tokenizer.
sent_tokens = {
    label: tokenizer.encode(label)[:-2]
    for label in ('positive', 'negative', 'neutral')
}
positive = sent_tokens['positive']
negative = sent_tokens['negative']
neutral = sent_tokens['neutral']
print(sent_tokens['positive'])

In [None]:
# Build the fixed-length model inputs and the start/end labels for every training row.
# Each array below has shape (ct, MAX_LEN): one row per tweet.
ct = train.shape[0]
token_ids = np.zeros((ct ,MAX_LEN))
attention_masks = np.zeros((ct ,MAX_LEN))
token_type_ids = np.zeros((ct ,MAX_LEN))
start_token = np.zeros((ct ,MAX_LEN))
end_token = np.zeros((ct ,MAX_LEN))
counter = 0  # NOTE(review): never read in this cell
for k in range(ct):
    try:
        # Collapse runs of whitespace; the tweet text additionally gets a leading space.
        text = train.loc[k ,'text']
        text = " " + " ".join(text.split())
        selected_text = train.loc[k ,'selected_text']
        selected_text = " ".join(selected_text.split())
        sent = train.loc[k ,'sentiment']
        # [:-2] drops the last two ids returned by encode().
        text_enc = tokenizer.encode(text)[:-2]
        selected_enc = tokenizer.encode(selected_text)[:-2]  # NOTE(review): not used below
        # Character mask over `text`: 1 on the characters of the selected span.
        # NOTE(review): find() returns -1 when the span is not a substring; that
        # case is not handled explicitly here.
        idx = text.find(selected_text)
        chars = np.zeros((len(text)))
        chars[idx:idx+len(selected_text)] = 1
        # Also mark the space directly in front of the span.
        if text[idx-1] == ' ':chars[idx-1] =1

        # Character offsets of every token, accumulated from the token string lengths.
        # assumes the token strings line up character-for-character with `text` -- TODO confirm
        offsets = []
        idx = 0
        for en in text_enc:
            token = tokenizer._convert_id_to_token(en)
            offsets.append((idx ,idx+len(token)))
            idx += len(token)
        # Indices of the tokens that overlap at least one marked character.
        toks = []
        for i ,(a ,b) in enumerate(offsets):
            if np.sum(chars[a : b]) >0 :
                toks.append(i)
        sp = special_token['<sep>']
        cl = special_token['<cls>']
        pad = special_token['<pad>']  # NOTE(review): not used; padding below uses 0
        # Sequence layout: <cls> text <sep> <sep> sentiment <sep>, zero-padded to MAX_LEN.
        enc_final = [cl] + text_enc + [sp ,sp] + sent_tokens[sent] + [sp]
        # NOTE(review): if enc_final is longer than MAX_LEN these assignments fail
        # with a shape mismatch, which the IndexError handler below would not catch -- confirm.
        token_ids[k ,:] = enc_final + (MAX_LEN - len(enc_final))*[0]
        attention_masks[k ,:] = len(enc_final)*[1] + (MAX_LEN-len(enc_final))*[0]
        # Segment id 1 on the last two positions of the sequence.
        token_type_ids[k ,len(enc_final)-2:len(enc_final)] = 1
        # +1 shifts the token indices past the leading <cls>.
        start_token[k ,toks[0] + 1] = 1
        end_token[k ,toks[-1] + 1] = 1 
    except IndexError:
        # Reached e.g. when `toks` is empty (no token overlaps the span): blank the
        # inputs already written for this row and point both labels at position 0.
        token_ids[k ,:] = 0
        attention_masks[k ,:] = 0
        token_type_ids[k ,:] = 0
        start_token[k ,0] = 1
        end_token[k ,0] = 1
    if k == 2:
        # Debug output for one row: labels, inputs, and the labelled span decoded
        # back to text (label positions are token indices + 1, hence a-1).
        print(start_token[k])
        print(end_token[k])
        print(text)
        print(token_ids[k])
        print(selected_text)
        a = np.argmax(start_token[k])
        b = np.argmax(end_token[k])
        man = tokenizer.encode(text)[:-2]
        print(tokenizer.decode(man[a-1:b]))

In [None]:
# Spot-check one row: the raw tweet followed by its padded token-id vector.
print(train.loc[314, 'text'], token_ids[314], sep='\n')

In [None]:
# Load the test tweets (missing values become a single space) and record the row count.
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
test = test.fillna(' ')
ct_t = len(test)

In [None]:
# Encode the test tweets with the same layout as the training rows:
#   <cls> text <sep> <sep> sentiment <sep>, zero-padded to MAX_LEN.
# MAX_LEN comes from the parameters cell; it is deliberately not redefined here
# so the train and test encodings cannot drift apart.
token_ids_t = np.zeros((ct_t, MAX_LEN))
attention_masks_t = np.zeros((ct_t, MAX_LEN))
token_type_ids_t = np.zeros((ct_t, MAX_LEN))

# Loop-invariant special-token ids.
sp = special_token['<sep>']
cl = special_token['<cls>']

for k in range(ct_t):
    # Same whitespace normalisation as for the training text (leading space included).
    text = " " + " ".join(test.loc[k, 'text'].split())
    sent = test.loc[k, 'sentiment']
    # [:-2] drops the last two ids returned by encode(), as in the training encoding.
    text_enc = tokenizer.encode(text)[:-2]
    enc_final = [cl] + text_enc + [sp, sp] + sent_tokens[sent] + [sp]
    n_tokens = len(enc_final)

    # Positions beyond n_tokens keep the zeros the arrays were created with.
    token_ids_t[k, :n_tokens] = enc_final
    attention_masks_t[k, :n_tokens] = 1
    # Segment id 1 on the last TWO positions, exactly as in the training encoding.
    # The models are fitted on that layout, so marking three positions here (as
    # this cell previously did) feeds them inputs they never saw during training.
    token_type_ids_t[k, n_tokens - 2:n_tokens] = 1

## Building Model

In [None]:
def build_model(mod):
    """Wrap a pretrained transformer with two per-position softmax heads.

    Parameters
    ----------
    mod : callable
        Invoked as ``mod(ids, attention_mask=att, token_type_ids=tok)``; element 0
        of its result is used as the per-token feature tensor.

    Returns
    -------
    tf.keras.Model
        Takes ``[ids, attention_mask, token_type_ids]`` (each of length MAX_LEN,
        int32) and returns two softmax distributions over the MAX_LEN positions.
        The model is compiled with Adam (learning rate 1e-6) and ``my_loss(1.5)``.
    """
    def span_head(features, drop_rate):
        # Dropout -> 1x1 convolution -> LeakyReLU -> one logit per position -> softmax.
        hidden = tf.keras.layers.Dropout(drop_rate)(features)
        hidden = tf.keras.layers.Conv1D(64, kernel_size=1)(hidden)
        hidden = tf.keras.layers.LeakyReLU()(hidden)
        logits = tf.keras.layers.Dense(1)(hidden)
        flat_logits = tf.keras.layers.Flatten()(logits)
        return tf.keras.layers.Activation('softmax')(flat_logits)

    ids, att, tok = (
        tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32) for _ in range(3)
    )
    x = mod(ids, attention_mask=att, token_type_ids=tok)

    # The two heads differ only in their dropout rate.
    output_1 = span_head(x[0], 0.1)
    output_2 = span_head(x[0], 0.2)

    model = tf.keras.Model(inputs=[ids, att, tok], outputs=[output_1, output_2])
    model.compile(
        loss=my_loss(1.5),
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
    )
    return model

## Defining loss and metrics

In [None]:
def my_loss(gamma):
    '''Build a focal-loss function with focusing parameter ``gamma``.

    The returned ``focal_loss(y_true, y_pred)`` casts both arguments to float32
    and returns, summed over the last axis, the negative of

        (1 - p)**gamma * y * log(p) + p**gamma * (1 - y) * log(1 - p)

    where ``p`` is ``y_pred`` and ``y`` is ``y_true``. No alpha weighting is applied.
    '''
    def focal_loss(y_true ,y_pred):
        y_true = tf.cast(y_true ,dtype = tf.float32)
        y_pred = tf.cast(y_pred ,dtype = tf.float32)
        # Keep probabilities strictly inside (0, 1). Without this, a saturated
        # softmax output of exactly 0 or 1 gives log(0) = -inf, and multiplying
        # that by a zero label or weight produces NaN, which poisons the loss.
        y_pred = K.clip(y_pred ,1e-7 ,1 - 1e-7)
        positive_term = ((1 - y_pred)**gamma) * y_true * K.log(y_pred)
        negative_term = (y_pred**gamma) * (1 - y_true) * K.log(1 - y_pred)
        return -K.sum(positive_term + negative_term ,axis = -1)
    return focal_loss
def category(y_true, y_pred):
    """Binary cross-entropy of ``y_true`` against ``y_pred`` clipped to [1e-7, 1 - 1e-7]."""
    lower = 1e-7
    clipped = tf.keras.backend.clip(y_pred, lower, 1 - lower)
    return tf.keras.losses.binary_crossentropy(y_true, clipped)
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings, ignoring case.

    Both strings are lower-cased and split on whitespace into sets of words.
    Returns |A & B| / |A | B| as a float, or 0.5 when both sets are empty.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = len(words1 & words2)
    return float(shared) / (len(words1) + len(words2) - shared)
def scheduler(epoch):
    """Learning rate for ``epoch``: 3e-5 at epoch 0, multiplied by 0.2 per epoch."""
    initial_lr, decay = 3e-5, 0.2
    return initial_lr * decay ** epoch

## Training xlnet

In [None]:
# Fine-tune XLNet on training rows 800 onward; rows 0-799 are kept out of the fit
# and used further down for the Jaccard evaluation.
model1 = build_model(xlnet)
# Re-compile with the fine-tuning learning rate. `learning_rate` is the supported
# keyword; the old `lr` alias is deprecated and rejected by newer Keras releases.
model1.compile(optimizer=Adam(learning_rate=3e-5), loss=my_loss(1.5))
# The scheduler sets the learning rate at the start of each epoch.
reduce_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)
hist = model1.fit(
    [token_ids[800:], attention_masks[800:], token_type_ids[800:]],
    [start_token[800:], end_token[800:]],
    epochs=3, batch_size=8, verbose=1, callbacks=[reduce_lr],
    validation_split=0.1,
)
    

In [None]:
# Weight saving is disabled: the call is wrapped in a string literal, so this cell
# only evaluates the string and writes no file.
'''model1.save_weights('xlnet.h5')'''

## Training alBERT

In [None]:
# Fine-tune ALBERT on training rows 800 onward; rows 0-799 are kept out of the fit
# and used further down for the Jaccard evaluation.
model2 = build_model(albert)
# Re-compile with the fine-tuning learning rate. `learning_rate` is the supported
# keyword; the old `lr` alias is deprecated and rejected by newer Keras releases.
model2.compile(optimizer=Adam(learning_rate=3e-5), loss=my_loss(1.5))
# The scheduler sets the learning rate at the start of each epoch.
reduce_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)
hist = model2.fit(
    [token_ids[800:], attention_masks[800:], token_type_ids[800:]],
    [start_token[800:], end_token[800:]],
    epochs=6, batch_size=8, verbose=1, callbacks=[reduce_lr],
    validation_split=0.1,
)

In [None]:
#model2.save_weights('albert.h5')

In [None]:
#model1.load_weights('xlnet.h5')
#model2.load_weights('albert.h5')

## XLNet predictions

In [None]:
# Start/end distributions from the fine-tuned XLNet for the first 800 training rows.
# predict() returns fresh arrays, so no pre-allocation is needed.
oof_start1, oof_end1 = model1.predict(
    [token_ids[:800], attention_masks[:800], token_type_ids[:800]],
    verbose=1,
)

## alBERT predictions

In [None]:
# Start/end distributions from the fine-tuned ALBERT for the first 800 training rows.
# predict() returns fresh arrays, so no pre-allocation is needed.
oof_start2, oof_end2 = model2.predict(
    [token_ids[:800], attention_masks[:800], token_type_ids[:800]],
    verbose=1,
)

## Combining predictions

In [None]:
# Ensemble by averaging the two models' distributions with equal weight.
oof_start = (oof_start1 + oof_start2) / 2
oof_end = (oof_end1 + oof_end2) / 2

## Calculating jaccard

In [None]:
# Score the ensemble on the 800 rows that were kept out of training.
scores = []  # per-row Jaccard; named `scores` so the builtin `all` is not shadowed
jac = []
for k in range(800):
    a = np.argmax(oof_start[k])
    b = np.argmax(oof_end[k])
    if a > b:
        # Inconsistent prediction (start after end): fall back to the whole tweet.
        # IMPROVE CV/LB with better choice here
        st = train.loc[k, 'text']
    else:
        text1 = " " + " ".join(train.loc[k, 'text'].split())
        enc = tokenizer.encode(text1)[:-2]
        # Predicted positions are token indices + 1, hence a - 1. The start is
        # clamped at 0: a predicted position of 0 would otherwise become slice
        # start -1, which wraps to the last token and yields an empty span.
        st = tokenizer.decode(enc[max(a - 1, 0):b])
        if k == 8:
            # Debug output for one row.
            print(oof_start[k])
            print(oof_end[k])
            print(a, b)
            print(st)
            print(text1)
            print(train.loc[k, 'selected_text'])
    scores.append(jaccard(st, train.loc[k, 'selected_text']))

jac.append(np.mean(scores))

In [None]:
# Report the mean of the collected Jaccard scores.
print(np.mean(jac)) 

## Kaggle submission

In [None]:
# Run both models on the encoded test set and average their start/end distributions.
test_inputs = [token_ids_t, attention_masks_t, token_type_ids_t]
preds1 = model1.predict(test_inputs, verbose=1)
preds2 = model2.predict(test_inputs, verbose=1)
# Element 0 of each prediction is the first head's output, element 1 the second's.
preds_start, preds_end = (
    (first + second) / 2 for first, second in zip(preds1, preds2)
)

In [None]:
# Turn the averaged start/end distributions into one predicted text per test row.
# NOTE: the list keeps the name `all` because the submission cell reads it; be
# aware that this shadows the builtin `all` for the rest of the notebook.
all = []
for k in range(token_ids_t.shape[0]):
    a = np.argmax(preds_start[k])
    b = np.argmax(preds_end[k])
    if a > b:
        # Inconsistent prediction (start after end): fall back to the whole tweet.
        st = test.loc[k, 'text']
    else:
        text1 = " " + " ".join(test.loc[k, 'text'].split())
        # Drop the last two ids returned by encode(), as everywhere else in this
        # notebook, so special tokens can never be decoded into the submitted text.
        enc = tokenizer.encode(text1)[:-2]
        # Predicted positions are token indices + 1, hence a - 1. The start is
        # clamped at 0: a predicted position of 0 would otherwise become slice
        # start -1, which wraps to the last token and yields an empty span.
        st = tokenizer.decode(enc[max(a - 1, 0):b])
    all.append(st)

In [None]:
# Attach the predictions, write the two-column submission file, then preview random rows.
test['selected_text'] = all
submission = test[['textID', 'selected_text']]
submission.to_csv('submission.csv', index=False)
pd.set_option('max_colwidth', 60)
test.sample(25)