# Importing all dependencies 

In [None]:
from transformers import *
import tokenizers
import numpy as np
import zipfile
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
import math
import pickle

## Create a distribution strategy for the TPU

In [None]:
# Create strategy from tpu
# AUTO is the tf.data autotune sentinel, intended for calls such as `.prefetch(AUTO)`.
AUTO = tf.data.experimental.AUTOTUNE
# Called without arguments; presumably the TPU address is discovered from the
# runtime environment (e.g. a Kaggle/Colab TPU session) -- TODO confirm.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# The order matters: connect to the cluster, initialise the TPU system, and
# only then build the strategy on top of the resolver.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# `strategy.scope()` is used later when building the model, and
# `strategy.num_replicas_in_sync` is used to size the global batch.
strategy = tf.distribute.experimental.TPUStrategy(tpu)

## Loading tokenizers and roberta
### The weight, vocabulary, and merges files are available from [huggingface transformers](https://huggingface.co/transformers/_modules/transformers/modeling_tf_roberta.html#TFRobertaModel)

In [None]:
# Locations of the pretrained RoBERTa assets inside the attached dataset.
roberta_vocab_path = '../input/tf-roberta/vocab-roberta-base.json'
roberta_merges_path = '../input/tf-roberta/merges-roberta-base.txt'
roberta_config_path = '../input/tf-roberta/config-roberta-base.json'
roberta_weights_path = '../input/tf-roberta/pretrained-roberta-base.h5'

# Byte-level BPE tokenizer built from the vocab/merges files above; input is
# lower-cased and a prefix space is added before encoding.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=roberta_vocab_path,
    merges_file=roberta_merges_path,
    lowercase=True,
    add_prefix_space=True,
)

# Module-level copy of the config and pretrained encoder.
# NOTE(review): build_model() loads its own config/encoder from the same
# files, so this global `bert_model` appears unused in the visible source.
config = RobertaConfig.from_pretrained(roberta_config_path)
bert_model = TFRobertaModel.from_pretrained(roberta_weights_path, config=config)

## Reading the training csv with pandas.read_csv

In [None]:
# Load the tab-separated training set straight from the zipped file and
# preview the first 50 rows (notebook cell output).
train_path = '../input/sentiment-analysis-on-movie-reviews/train.tsv.zip'
train_csv = pd.read_csv(train_path, sep='\t')
train_csv.head(50)

## These are the parameters I used for the experiments; they can be adjusted to improve performance

In [None]:
# Show the class distribution of the Sentiment column before training.
print(train_csv.Sentiment.value_counts())
# NOTE(review): PAD_ID is not referenced anywhere else in the visible source;
# the id arrays are zero-initialised directly with np.zeros.
PAD_ID = 0
# Fixed sequence length of the id / attention-mask arrays and model inputs.
MAX_LEN = 80
# Number of epochs passed to model.fit for every fold.
EPOCHS = 5
# Keras verbosity flag used for fit/predict.
DISPLAY = 1
# Global batch size: 8 samples per replica times the number of replicas.
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
# NOTE(review): LABEL_SMOOTHING is defined but not used in the visible source.
LABEL_SMOOTHING = 0.1
print(BATCH_SIZE)

In [None]:
# Notebook cell output: number of rows in the training set.
train_csv.shape[0]

## Tokenizing the dataset with roberta tokenizer

In [None]:
# Encode every training phrase into fixed-length id / attention-mask rows and
# build one-hot sentiment targets (5 classes).
ct = train_csv.shape[0]
input_ids = np.zeros((ct, MAX_LEN), dtype='int32')
attention_mask = np.zeros((ct, MAX_LEN), dtype='int32')
sentiments = np.zeros((ct, 5))
for k in range(ct):

    # Collapse runs of whitespace and prepend a single space before encoding.
    text1 = " " + " ".join(train_csv.loc[k, 'Phrase'].split())
    enc = tokenizer.encode(text1)

    # One-hot encode the integer sentiment label.
    target = train_csv.loc[k, 'Sentiment']
    sentiments[k][target] = 1

    # Wrap the tokens with the start (0) and end (2) ids. The token list is
    # truncated to MAX_LEN - 2 so the wrapped sequence always fits the row;
    # without truncation a long phrase makes the slice assignment raise a
    # broadcast ValueError.
    # NOTE(review): unused positions keep the zero fill value, which is the
    # same id as the start marker; they are only distinguished by the mask.
    tokens = [0] + enc.ids[:MAX_LEN - 2] + [2]
    input_ids[k, :len(tokens)] = tokens
    attention_mask[k, :len(tokens)] = 1

    # Print one sample as a sanity check.
    if k == 2:
        print(text1)
        print(input_ids[k])
        print(target)
        print(sentiments[k])

## Tokenizing test data

In [None]:
# Load the zipped, tab-separated test set and encode it the same way as the
# training phrases (no labels are available here).
test = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/test.tsv.zip', sep='\t')

ct = test.shape[0]
input_ids_t = np.zeros((ct, MAX_LEN), dtype='int32')
attention_mask_t = np.zeros((ct, MAX_LEN), dtype='int32')
for k in range(ct):

    # Collapse runs of whitespace and prepend a single space before encoding.
    text1 = " " + " ".join(test.loc[k, 'Phrase'].split())
    enc = tokenizer.encode(text1)

    # Wrap with the start (0) and end (2) ids, truncating the tokens to
    # MAX_LEN - 2 so an over-long phrase cannot overflow the fixed-width row
    # (which would raise a broadcast ValueError on assignment).
    tokens = [0] + enc.ids[:MAX_LEN - 2] + [2]
    attention_mask_t[k, :len(tokens)] = 1
    input_ids_t[k, :len(tokens)] = tokens

## Building a CNN head, focal loss and some helper functions

In [None]:
def save_weights(model, dst_fn):
    """Pickle the result of ``model.get_weights()`` into the file ``dst_fn``.

    Args:
        model: Object exposing ``get_weights()``.
        dst_fn: Destination path; the file is opened in binary write mode and
            overwritten if it already exists.
    """
    # Fetch the weights first so a failure here never leaves an empty file.
    weight_values = model.get_weights()
    with open(dst_fn, 'wb') as out_file:
        pickle.dump(weight_values, out_file)


def load_weights(model, weight_fn):
    """Restore weights pickled in ``weight_fn`` into ``model`` and return it.

    Args:
        model: Object exposing ``set_weights(weights)``.
        weight_fn: Path of a pickle file, e.g. one written by ``save_weights``.

    Returns:
        The same ``model`` object, after ``set_weights`` has been called.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only use this
    # on weight files produced by a trusted source.
    with open(weight_fn, 'rb') as in_file:
        restored = pickle.load(in_file)
    model.set_weights(restored)
    return model
def focal_loss(alpha, gamma):
    """Return a focal-loss function for use as a Keras ``loss``.

    The returned function computes, per sample,
    ``sum_c -alpha * (1 - p_c)**gamma * y_c * log(p_c)`` over the last axis,
    where ``p`` is ``y_pred`` clipped to ``[1e-7, 1 - 1e-7]``.

    Args:
        alpha: Multiplicative weight applied to every term.
        gamma: Exponent of the ``(1 - p)`` modulating factor.

    Returns:
        A callable ``loss_fn(y_true, y_pred)`` with the reduction over the
        last axis already applied.
    """
    def loss_fn(y_true, y_pred):
        # Clip so that log() never sees exactly 0 or 1.
        probs = K.clip(y_pred, 1e-7, 1-1e-7)
        # Down-weights terms whose predicted probability is already high.
        modulating = (1-probs)**gamma
        per_class = -alpha*modulating*y_true*K.log(probs)
        return K.sum(per_class, axis=-1)
    return loss_fn
def scheduler(epoch):
    """Return the learning rate for ``epoch``: ``3e-5 * 1.15**epoch``.

    The rate starts at 3e-5 for epoch 0 and is multiplied by 1.15 for each
    subsequent epoch, i.e. it grows over the course of training.
    """
    base_lr = 3e-5
    growth = 1.15
    return base_lr * growth**epoch

def build_model(alpha, gamma):
    """Build and compile RoBERTa with a Conv1D classification head.

    The model takes two ``(MAX_LEN,)`` int32 inputs -- token ids and the
    attention mask -- and outputs a 5-way softmax. It is compiled with
    ``focal_loss(alpha, gamma)``, Adam and the ``'acc'`` metric.

    Args:
        alpha: Weighting factor forwarded to ``focal_loss``.
        gamma: Focusing exponent forwarded to ``focal_loss``.

    Returns:
        The compiled ``tf.keras`` model.
    """
    config = RobertaConfig.from_pretrained('../input/tf-roberta/config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained('../input/tf-roberta/pretrained-roberta-base.h5', config=config)

    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    # Forward the attention mask to the encoder so padded positions are
    # ignored; previously `att` was a model input that never reached RoBERTa.
    x = bert_model(ids, attention_mask=att)

    # Conv1D stack over the first encoder output (presumably the per-token
    # hidden states). Layers without padding='same' use the Keras default and
    # shorten the sequence by one position each (kernel size 2).
    x1 = tf.keras.layers.Conv1D(2048, 2, padding='same')(x[0])
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Conv1D(1024, 2)(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Conv1D(512, 2, padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Conv1D(256, 2)(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Conv1D(128, 2, padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)

    # One scalar per remaining position, flattened and mapped to 5 classes.
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Dense(5)(x1)
    x1 = tf.keras.layers.Activation('softmax')(x1)

    model = tf.keras.models.Model(inputs=[ids, att], outputs=[x1])
    # Initial learning rate only; a LearningRateScheduler callback passed to
    # fit() replaces it at the start of each epoch.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001)
    model.compile(loss=focal_loss(alpha, gamma), optimizer=optimizer, metrics=['acc'])

    return model

## Using a K-Fold cross-validation training strategy

In [None]:
# Stratified K-fold training: for every fold, build a fresh model on the TPU,
# train it, store out-of-fold (OOF) predictions and accumulate the averaged
# test predictions.
jac = []
VER = 'v0'
DISPLAY = 1  # USE display=1 FOR INTERACTIVE
oof = np.zeros((input_ids.shape[0], 5))
preds = np.zeros((input_ids_t.shape[0], 5))

# No random_state is set, so the fold assignment differs between runs.
skf = StratifiedKFold(n_splits=6, shuffle=True)
for fold, (idxT, idxV) in enumerate(skf.split(input_ids, train_csv.Sentiment)):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)

    # Drop the previous fold's graph/state before building a new model
    # inside the distribution strategy scope.
    K.clear_session()
    with strategy.scope():
        model = build_model(2.0, 1.5)

    # NumPy arrays are fed to Keras directly: [token ids, attention mask].
    inpT = [input_ids[idxT], attention_mask[idxT]]
    targetT = [sentiments[idxT]]
    inpV = [input_ids[idxV], attention_mask[idxV]]
    targetV = [sentiments[idxV]]

    n_steps = len(idxT)//BATCH_SIZE
    # Despite the variable name, scheduler() returns 3e-5 * 1.15**epoch,
    # i.e. the learning rate increases every epoch.
    reduce_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)
    model.fit(
        inpT,
        targetT[0],
        epochs=EPOCHS,
        steps_per_epoch=n_steps,
        verbose=DISPLAY,
        callbacks=[reduce_lr],
        validation_data=(inpV, targetV[0]),
    )

    print('Loading model...')
    # Persist the final weights of this fold (pickle format, despite the
    # .h5 suffix) and read them back.
    weight_fn = '{}-roberta-{}.h5'.format(VER, fold)
    save_weights(model, weight_fn)
    load_weights(model, weight_fn)

    print('Predicting OOF...')
    # predict() takes inputs only; its second positional parameter is
    # batch_size, so the targets must not be passed here.
    oof[idxV] = model.predict(inpV, verbose=DISPLAY)

    print('Predicting Test...')
    preds_ = model.predict([input_ids_t, attention_mask_t], verbose=DISPLAY)
    # Average the test predictions over all folds.
    preds += preds_/skf.n_splits

    # Fold-level metrics on the validation rows.
    mypreds = np.argmax(oof[idxV], axis=1)
    GT = np.argmax(sentiments[idxV], axis=1)
    print('f1 score : {}'.format(f1_score(GT, mypreds, average='weighted')))
    print('acc score : {}'.format(accuracy_score(GT, mypreds)))
    print('cm : {}'.format(confusion_matrix(GT, mypreds)))