In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
import re
import pickle
import gc
import nltk
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from transformers import *
import tokenizers
import matplotlib.pyplot as plt
# Show the installed TensorFlow version in the cell output.
print('TF version',tf.__version__)

TF version 2.1.0


In [2]:
# Fixed width (number of positions) of every model input row.
MAX_LEN = 96
# Read the training set and discard every row that contains a missing value.
data = pd.read_csv("../input/tweet-sentiment-extraction/train.csv").dropna(how="any")
#Attempt reproducibility
np.random.seed(1)
tf.random.set_seed(1)

In [3]:
def slang_cleaner(word):
    """Normalise one whitespace-free token by collapsing doubled characters.

    The token is split at the first punctuation character found and each side
    is cleaned recursively, with the punctuation character kept in place.
    A punctuation-free piece is returned unchanged when WordNet has a synset
    for it; otherwise one character of a doubled pair is removed per pass
    until either WordNet knows the result or no doubled pair is left.

    Args:
        word: token to clean (no whitespace expected).

    Returns:
        The cleaned token; "" stays "" and "<3" becomes "LOVE".
    """
    if word == "":
        return word
    if word == "<3":
        return "LOVE"  #Independent emoticon disambiguation
    separators = ("!", ".", "?", ":", ",", "`", "-", "=", ";")
    for pos, char in enumerate(word):
        if char in separators:
            # Clean both sides independently and keep the separator itself.
            return slang_cleaner(word[:pos]) + char + slang_cleaner(word[pos+1:])
    if nltk.corpus.wordnet.synsets(word):
        return word
    collapsed = re.sub(r'(\w*)(\w)\2(\w*)', r'\1\2\3', word)
    if collapsed == word:
        return word
    return slang_cleaner(collapsed)
def cleaner(sent):
    """Normalise a tweet before tokenisation.

    Steps, in order:
      1. every whitespace-separated word containing "http", or both "www"
         and "com", is replaced by the token WEBSITE;
      2. runs of two or more identical punctuation characters
         (! . ? : , ` - = ;) are collapsed to a single character;
      3. backticks become apostrophes;
      4. runs of two or more asterisks become the token VULGAR;
      5. every word is passed through slang_cleaner and the result is
         lower-cased (so the inserted tokens end up lower-case too).

    The number of whitespace-separated words is preserved. Empty or
    whitespace-only input yields "" (np.vectorize, used previously, raises
    ValueError on zero-length input).

    Args:
        sent: raw tweet text.

    Returns:
        The cleaned, single-space-joined, lower-cased text.
    """
    #Two tokens (WEBSITE, VULGAR) are created and repeated punctuation is collapsed
    words = ["WEBSITE" if "http" in w or ("www" in w and "com" in w) else w
             for w in sent.split()]
    sent = " ".join(words)
    for punc in "!.?:,`-=;":
        # re.escape keeps the pattern valid for regex-special characters.
        sent = re.sub('(?:' + re.escape(punc) + '){2,}', punc, sent)
    sent = re.sub("[`]", "'", sent)
    sent = re.sub(r'(?:\*){2,}', "VULGAR", sent)
    return " ".join(slang_cleaner(w) for w in sent.split()).lower()
# Clean both text columns, then drop the rows whose cleaned text is exactly
# one character long.
for col in ('text', 'selected_text'):
    data[col] = data[col].apply(cleaner)
data = data[data['text'].apply(len) != 1]

In [4]:
#Validity Checks
def validitycheck(text, subtext):
    """Tell whether `subtext` is a usable selection inside `text`.

    A selection is rejected when it is empty, when it does not occur in
    `text`, or when its first occurrence starts or ends in the middle of a
    word. "Middle of a word" is judged with the break characters below: the
    start is fine if it is at position 0, if the selection itself begins with
    a space, or if the preceding character is a break; the end is fine if it
    is the end of `text`, if the following character is a break, or if the
    last selected character is a break.

    Args:
        text: full (cleaned) text.
        subtext: candidate selection.

    Returns:
        True when the selection is acceptable, False otherwise.
    """
    # An empty selection used to be judged by text[-1] (index wrap-around),
    # i.e. by a character unrelated to the selection. It is never a valid span.
    if not subtext:
        return False
    breaks = {"!", ".", "?", ":", ",", "`", "-", "=", ";", " "}
    startpos = text.find(subtext)
    if startpos < 0:
        return False
    endpos = startpos + len(subtext)
    starts_mid_word = (startpos != 0
                       and text[startpos] != " "
                       and text[startpos - 1] not in breaks)
    if starts_mid_word:
        return False
    ends_mid_word = (endpos != len(text)
                     and text[endpos] not in breaks
                     and text[endpos - 1] not in breaks)
    return not ends_mid_word
# Keep only the rows whose selected_text passes validitycheck against the
# row's text, and renumber the index from 0.
data = (
    data
    .loc[data.apply(lambda row: validitycheck(row.text, row.selected_text), axis=1)]
    .reset_index(drop=True)
)

In [5]:
# Directory prefix of the RoBERTa files used below.
PATH = '../input/tf-roberta/'
# Byte-level BPE tokenizer built from the vocabulary and merges files under
# PATH; it lower-cases its input and adds a prefix space.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH + 'vocab-roberta-base.json',
    merges_file=PATH + 'merges-roberta-base.txt',
    lowercase=True,
    add_prefix_space=True,
)
# Id placed in the model input for each sentiment label
# (presumably the vocabulary ids of these words - TODO confirm).
sentiment_id = dict(positive=1313, negative=2430, neutral=7974)

<h2> Modelling </h2>

In [6]:
def generate_labelling(info, training):
    """Build the model input arrays (and, when training, span labels).

    Each row is laid out as ``[0] + text ids + [2, 2] + [sentiment id] + [2]``
    and padded with 1 up to MAX_LEN (0/2/1 are presumably RoBERTa's
    <s>/</s>/<pad> ids - TODO confirm). The attention mask is 1 over the
    filled positions. token_type_ids is all zeros.

    When `training` is true, the first occurrence of the row's selected_text
    inside its text is located, every token overlapping it (including the
    space just before it) is collected, and
      * start_tokens[row, first_token + 1] = 1
      * end_tokens[row, last_token + 2] = 1
    The +1 accounts for the leading 0 id; the end label therefore sits one
    position past the last selected token. Rows whose selection is empty or
    cannot be found get no labels (all-zero label rows).

    Rows are read by position, so `info` does not need a 0..n-1 index.

    Args:
        info: DataFrame with 'text' and 'sentiment' columns, plus
            'selected_text' when `training` is true.
        training: whether to fill start_tokens / end_tokens.

    Returns:
        (input_ids, attention_mask, token_type_ids, start_tokens, end_tokens),
        each an int32 array of shape (len(info), MAX_LEN).

    Raises:
        ValueError: if a row needs more than MAX_LEN positions.
    """
    n_rows = info.shape[0]
    input_ids = np.ones((n_rows, MAX_LEN), dtype='int32')
    attention_mask = np.zeros((n_rows, MAX_LEN), dtype='int32')
    token_type_ids = np.zeros((n_rows, MAX_LEN), dtype='int32')
    start_tokens = np.zeros((n_rows, MAX_LEN), dtype='int32')
    end_tokens = np.zeros((n_rows, MAX_LEN), dtype='int32')
    # Positional access: independent of the frame's index labels.
    texts = info['text'].values
    sentiments = info['sentiment'].values
    selections = info['selected_text'].values if training else None
    for k in range(n_rows):
        text1 = " " + " ".join(texts[k].split())
        enc = tokenizer.encode(text1)
        # Character span of every token, rebuilt from the decoded token lengths.
        offsets = []; idx = 0
        for t in enc.ids:
            w = tokenizer.decode([t])
            offsets.append((idx, idx + len(w)))
            idx += len(w)
        s_tok = sentiment_id[sentiments[k]]
        used = len(enc.ids) + 5
        if used > MAX_LEN:
            raise ValueError("row %d needs %d positions but MAX_LEN is %d"
                             % (k, used, MAX_LEN))
        input_ids[k, :used] = [0] + enc.ids + [2, 2] + [s_tok] + [2]
        attention_mask[k, :used] = 1
        if not training:
            continue
        text2 = " ".join(selections[k].split())
        idx = text1.find(text2)
        # Without this guard idx == -1 would be used as a negative index and
        # could label an unrelated token.
        if not text2 or idx < 0:
            continue
        chars = np.zeros(len(text1))
        chars[idx:idx + len(text2)] = 1
        # Include the space in front of the selection; idx > 0 prevents
        # text1[-1] from being inspected by wrap-around.
        if idx > 0 and text1[idx - 1] == ' ':
            chars[idx - 1] = 1
        toks = [i for i, (a, b) in enumerate(offsets) if np.sum(chars[a:b]) > 0]
        if toks:
            start_tokens[k, toks[0] + 1] = 1
            end_tokens[k, toks[-1] + 2] = 1
    return input_ids, attention_mask, token_type_ids, start_tokens, end_tokens

In [7]:
def label_smoothener(tokenspre):
    """Replace each row's hard label by a normalised Gaussian bump.

    For every row, the position of the maximum is taken as the centre and the
    row is replaced by the normal density (standard deviation 0.5) evaluated
    at positions 0..n-1, rescaled so that the row sums to 1.

    The whole computation is a single broadcast call instead of a row-wise
    DataFrame.apply with one scalar pdf call per element.

    Args:
        tokenspre: 2-D array-like of shape (rows, positions), e.g. one-hot
            label rows.

    Returns:
        float ndarray of the same shape whose rows each sum to 1.
    """
    import scipy.stats as stats
    tokenspre = np.asarray(tokenspre)
    positions = np.arange(tokenspre.shape[1])
    centres = np.argmax(tokenspre, axis=1)
    weights = stats.norm.pdf(positions[np.newaxis, :], loc=centres[:, np.newaxis], scale=0.5)
    return weights / weights.sum(axis=1, keepdims=True)

In [8]:
#Distance-Based Entropy
def distance_weighted_categorical_crossentropy(y_true, y_pred):
    """Categorical cross-entropy scaled by how far the predicted peak is off.

    Per sample the cross-entropy sum(y_true * log(y_pred)) over the last axis
    is multiplied by ``|argmax(y_true) - argmax(y_pred)| + 1``; the negated
    sum over all samples is returned. The argmax-based factor acts as a
    per-sample weight only.

    y_pred is clipped to [epsilon, 1] before the logarithm: an exact 0 would
    give log(0) = -inf and 0 * -inf = NaN for the non-target positions, which
    would turn the whole loss into NaN.

    Args:
        y_true: label tensor, one-hot along the last axis (any numeric dtype).
        y_pred: predicted probabilities along the last axis
            (assumed float32, e.g. a softmax output - TODO confirm).

    Returns:
        Scalar float tensor.
    """
    y_pred_safe = tf.clip_by_value(y_pred, tf.keras.backend.epsilon(), 1.0)
    crossentropy = tf.reduce_sum(
        tf.math.multiply(tf.math.log(y_pred_safe), tf.cast(y_true, dtype=tf.float32)), axis=-1)
    distance = tf.abs(tf.math.argmax(y_true, axis=-1) - tf.argmax(y_pred, axis=-1)) + 1
    return -tf.reduce_sum(tf.math.multiply(tf.cast(distance, dtype=tf.float32), crossentropy))
#CDF-Loss
def CDF_loss(y_true, y_pred):
    """Squared distance between the cumulative sums of labels and predictions.

    Both inputs are accumulated along axis 1 (labels are first cast to the
    dtype of y_pred); the squared differences are summed over every element.

    Args:
        y_true: label tensor.
        y_pred: prediction tensor of the same shape.

    Returns:
        Scalar tensor.
    """
    cumulative_pred = tf.math.cumsum(y_pred, axis=1)
    cumulative_true = tf.math.cumsum(tf.cast(y_true, dtype=y_pred.dtype), axis=1)
    gap = cumulative_true - cumulative_pred
    return tf.reduce_sum(gap**2)

In [9]:
def build_model(drp=0.1, l2reg=0.00, activation=None, kinit="glorot_uniform"):
    """Build and compile the span-extraction model.

    Three int32 inputs of width MAX_LEN (ids, attention mask, token type ids)
    are fed to a TFRobertaModel loaded from PATH. The first element of its
    output goes through two identically structured, separately weighted heads
    (GaussianDropout -> Conv1D(128) -> LeakyReLU -> GaussianDropout ->
    Conv1D(64) -> Dense(1) -> Flatten -> softmax), giving one distribution
    over positions per head. The model is compiled with categorical
    cross-entropy and Adam at learning rate 3e-5.

    Args:
        drp: rate passed to every GaussianDropout layer.
        l2reg, activation, kinit: accepted but not used by this function.

    Returns:
        The compiled Keras model with outputs [first head, second head].
    """
    def span_head(sequence_output):
        # One head: layers are created here, so each call gets its own weights.
        h = tf.keras.layers.GaussianDropout(drp)(sequence_output)
        h = tf.keras.layers.Conv1D(128, 2, padding='same')(h)
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.GaussianDropout(drp)(h)
        h = tf.keras.layers.Conv1D(64, 2, padding='same')(h)
        h = tf.keras.layers.Dense(1)(h)
        h = tf.keras.layers.Flatten()(h)
        return tf.keras.layers.Activation('softmax')(h)

    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    config = RobertaConfig.from_pretrained(PATH+"config-roberta-base.json")
    bert_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    x = bert_model(ids,attention_mask=att,token_type_ids=tok)

    # Heads are built in the same order as before (first output, then second).
    x1 = span_head(x[0])
    x2 = span_head(x[0])

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1,x2])
    model.compile(loss="categorical_crossentropy",
                  optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5))
    return model

In [10]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings, case-insensitive.

    Both strings are lower-cased and split on whitespace into word sets.

    Returns:
        |intersection| / |union| as a float, or 0.5 when both strings
        contain no words.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = len(words1 & words2)
    return float(shared) / (len(words1) + len(words2) - shared)

In [None]:
# Encode the whole training frame once, then train one model per stratified
# fold (stratified on sentiment), keeping the best weights by val_loss.
train = data.copy().reset_index(drop=True)
input_ids, attention_mask, token_type_ids, start_tokens, end_tokens = generate_labelling(train, True)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for i, (train_index, test_index) in enumerate(cv.split(input_ids, train.sentiment.values)):
    # Start every fold from a fresh graph and freshly loaded weights.
    tf.keras.backend.clear_session()
    model = build_model()
    save = tf.keras.callbacks.ModelCheckpoint(
        'roberta-fold%i.h5'%(i),
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode='auto',
        save_freq='epoch',
    )
    model.fit(
        x=[arr[train_index] for arr in (input_ids, attention_mask, token_type_ids)],
        y=[arr[train_index] for arr in (start_tokens, end_tokens)],
        epochs=5,
        batch_size=64,
        verbose=True,
        validation_data=(
            [arr[test_index] for arr in (input_ids, attention_mask, token_type_ids)],
            [arr[test_index] for arr in (start_tokens, end_tokens)],
        ),
        callbacks=[save],
    )

<h2> Predictions on Test Data </h2>

In [12]:
#Neutral Labels
testdata = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
# Keep the untouched tweet next to its cleaned version.
testdata['original_text'] = testdata['text'].copy()
testdata['text'] = testdata['text'].apply(cleaner)
# Neutral tweets are answered with their whole original text; every other row
# stays missing (NaN) until a prediction is filled in. Series.where creates
# the column in one step instead of writing strings into a float NaN column,
# which recent pandas versions deprecate.
testdata['selected_text'] = testdata['original_text'].where(testdata.sentiment == "neutral")

In [13]:
#SubText Generation Function
def subtext(info, input_ids, attention_mask, token_type_ids, tag):
    """Predict the selected text of every row and store it in `info`.

    The start and end outputs of the five fold models (weights read from
    ``tag + "roberta-fold<k>.h5"``) are summed. For each row the summed
    outputs are turned into a token slice of the row's encoded text:

      * start position a = argmax of the start scores, end position
        b = argmax of the end scores; text token j sits at model position
        j + 1, and the end position is exclusive, so the normal case
        (a < b) decodes tokens [a-1, b-1);
      * if a >= b, the more confident side wins: when the end peak is higher
        the text from its beginning up to b-1 is used, otherwise the text
        from a-1 to its end.

    Slice bounds are clamped at 0. Previously an argmax of 0 produced the
    bound -1, which Python interprets as "last token", so e.g. a == 0
    selected only the final token. With the clamp a == 0 starts at the first
    token and an end at position 0 or 1 yields an empty selection.

    Args:
        info: DataFrame with a 'text' column; its 'selected_text' column is
            written in place.
        input_ids, attention_mask, token_type_ids: model inputs for the rows
            of `info`, in the same order.
        tag: prefix (e.g. a directory path) of the saved weight files.

    Returns:
        None.
    """
    begin, end = np.zeros((len(info), MAX_LEN)), np.zeros((len(info), MAX_LEN))
    for fold in range(5):
        tf.keras.backend.clear_session()
        model = build_model()
        model.load_weights(tag+"roberta-fold"+str(fold)+".h5")
        tempbegin, tempend = model.predict([input_ids, attention_mask, token_type_ids])
        begin += tempbegin
        end += tempend
    for k in range(len(info)):
        text1 = " "+" ".join(info.loc[info.index[k],'text'].split())
        enc = tokenizer.encode(text1)
        a = int(np.argmax(begin[k,]))
        b = int(np.argmax(end[k,]))
        # Convert model positions to token indices; never let them go negative.
        lo = max(a-1, 0)
        hi = max(b-1, 0)
        if a>=b:
            if np.max(end[k,])>np.max(begin[k,]):
                st = tokenizer.decode(enc.ids[:hi])
            else:
                st = tokenizer.decode(enc.ids[lo:])
        else:
            st = tokenizer.decode(enc.ids[lo:hi])
        info.loc[info.index[k], "selected_text"] = st

In [14]:
#Positive Labels
# Predict selections for the positive tweets only (filled into postestdata).
postestdata = testdata[testdata["sentiment"].eq("positive")].reset_index(drop=True)
input_ids, attention_mask, token_type_ids = generate_labelling(postestdata, False)[:3]
subtext(postestdata, input_ids, attention_mask, token_type_ids, tag="")

In [15]:
#Negative Labels
# Predict selections for the negative tweets only (filled into negtestdata).
negtestdata = testdata[testdata["sentiment"].eq("negative")].reset_index(drop=True)
input_ids, attention_mask, token_type_ids = generate_labelling(negtestdata, False)[:3]
subtext(negtestdata, input_ids, attention_mask, token_type_ids, tag="")

In [16]:
#Cohesion (Start and End Checks)
def startcorrect(fulltext, subtext):
    """Extend a selection leftwards so that it starts on a word boundary.

    Both arguments are whitespace-normalised and the text is padded with one
    space on each side. If the first occurrence of the selection starts on a
    break character it is returned as is; otherwise the characters in front
    of it are prepended back to (but excluding) the nearest break character.

    Args:
        fulltext: text that should contain the selection.
        subtext: selection to adjust.

    Returns:
        The adjusted selection, or np.nan when the selection does not occur
        in the text.
    """
    needle = " ".join(subtext.split())
    haystack = " " + " ".join(fulltext.split()) + " "
    begin = haystack.find(needle)
    if begin < 0:
        return np.nan
    breaks = {"!", ".", "?", ":", ",", "`", "-", "=", ";", " "}
    if haystack[begin] in breaks:
        return needle
    # Walk left from the selection until a break character (or index 0).
    word_start = begin
    for char in reversed(haystack[:begin]):
        if char in breaks:
            break
        word_start -= 1
    return haystack[word_start:begin] + needle
def endcorrect(fulltext, subtext):
    """Extend a selection rightwards so that it ends on a word boundary.

    Both arguments are whitespace-normalised and the text is padded with one
    space on each side. If the last character of the first occurrence is a
    break character the selection is returned as is; otherwise the following
    characters are appended up to (but excluding) the next break character.

    A non-string selection (such as the np.nan that startcorrect returns for
    a selection it cannot find) is passed through as np.nan instead of
    raising AttributeError on `.split()`.

    Args:
        fulltext: text that should contain the selection.
        subtext: selection to adjust, or a missing value.

    Returns:
        The adjusted selection, or np.nan when the selection is missing or
        does not occur in the text.
    """
    if not isinstance(subtext, str):
        return np.nan
    subtext = " ".join(subtext.split())
    fulltext = " " + " ".join(fulltext.split()) + " "
    i = fulltext.find(subtext)
    if i < 0:
        return np.nan
    k = i + len(subtext)
    breaks = {"!", ".", "?", ":", ",", "`", "-", "=", ";", " "}
    if fulltext[k-1] in breaks:
        return subtext
    # Walk right from the end of the selection until a break character.
    while k != len(fulltext) and fulltext[k] not in breaks:
        k += 1
    return subtext + fulltext[i+len(subtext):k]

In [17]:
#Cohesion (Clean to Preprocessed Conversion)
def reverseprocessing(maintext, pretext, subtext):
    """Map a selection made in the cleaned text back onto the original text.

    The cleaned text (`maintext`) and the original text (`pretext`) are
    assumed to have the same number of whitespace-separated words, so the
    word range covered by the selection in `maintext` is taken from
    `pretext`. When the selection starts or ends inside a word, the first or
    last original word is trimmed at its punctuation:

      * start inside a word, on a punctuation character: the first word is
        cut so that it begins at its first punctuation character;
      * start inside a word, on another character: the first word is cut so
        that it begins at the first non-punctuation character that follows
        punctuation;
      * end inside a word, last selected character is punctuation: the last
        word is cut just before the first non-punctuation character that
        follows punctuation;
      * end inside a word, on another character: the last word is cut just
        before its first punctuation character.

    Fallbacks (these situations used to raise or slice with a -1 index):
      * a non-string selection (e.g. NaN) or a selection that does not occur
        in `maintext` returns the whole whitespace-normalised original text;
      * an empty word range returns "".

    Args:
        maintext: cleaned text the selection was made in.
        pretext: original text.
        subtext: selection inside `maintext`.

    Returns:
        The corresponding piece of the original text as a string.
    """
    maintext = " ".join(maintext.split())
    pretext = " ".join(pretext.split())
    if not isinstance(subtext, str): return pretext
    subtext = " ".join(subtext.split())
    if subtext=="": return subtext
    startpos = maintext.find(subtext)
    if startpos<0: return pretext
    endpos = startpos+len(subtext)
    breakstart_inclpunc, breakstart_nopunc = False, False
    breakend_inclpunc, breakend_nopunc = False, False
    puncs = ["!",".","?",":",",","`","-","=",";"]
    # Turn the character start into a word index, noting mid-word starts.
    if (startpos!=0):
        if (maintext[startpos-1]!=" "):
            if maintext[startpos] in puncs:
                breakstart_inclpunc = True
            else:
                breakstart_nopunc = True
            startpos = len(maintext[:startpos].split())-1
        else:
            startpos = len(maintext[:startpos].split())
    # Same for the end (exclusive word index).
    if (endpos!=len(maintext)) and (maintext[endpos]!=" "):
        if maintext[endpos-1] in puncs:
            breakend_inclpunc = True
        else:
            breakend_nopunc = True
    endpos  = len(maintext[:endpos].split())
    returntext = pretext.split()[startpos:endpos]
    # Nothing to trim or join; avoids IndexError on returntext[0] below.
    if not returntext: return ""
    if breakstart_inclpunc:
        for i in range(len(returntext[0])):
            if returntext[0][i] in puncs:
                returntext[0] = returntext[0][i:]
                break
    elif breakstart_nopunc:
        sawpunc = False
        for i in range(len(returntext[0])):
            if returntext[0][i] in puncs:
                sawpunc = True
            if sawpunc and (returntext[0][i] not in puncs):
                returntext[0] = returntext[0][i:]
                break
    if breakend_inclpunc:
        sawpunc = False
        for i in range(len(returntext[-1])):
            if returntext[-1][i] in puncs:
                sawpunc = True
            if sawpunc and (returntext[-1][i] not in puncs):
                returntext[-1] = returntext[-1][:i]
                break
    elif breakend_nopunc:
        for i in range(len(returntext[-1])):
            if returntext[-1][i] in puncs:
                returntext[-1] = returntext[-1][:i]
                break
    return " ".join(returntext)

In [18]:
#Consolidation
# 1) Snap every predicted span to word boundaries inside the cleaned text.
for frame in (postestdata, negtestdata):
    frame['selected_text'] = frame.apply(
        lambda row: endcorrect(row.text, startcorrect(row.text, row.selected_text)), axis=1)
# 2) Translate the spans from the cleaned text back to the original text.
for frame in (postestdata, negtestdata):
    frame['selected_text'] = frame.apply(
        lambda row: reverseprocessing(row.text, row.original_text, row.selected_text), axis=1)
# 3) Attach both prediction sets to the full test frame. After the two merges
#    selected_text_x is testdata's own column, selected_text_y comes from
#    postestdata and selected_text from negtestdata.
testdata = (
    testdata
    .merge(postestdata[['textID','selected_text']], on="textID", how="left")
    .merge(negtestdata[['textID', 'selected_text']], on="textID", how="left")
)
# 4) Per row, keep the first of the three candidates that is not missing.
testdata['selected_text'] = testdata.apply(
    lambda row: pd.Series(
        [row.selected_text_x, row.selected_text_y, row.selected_text]
    ).dropna().values[0],
    axis=1)
testdata = testdata.drop(columns=["selected_text_x", "selected_text_y"])

In [19]:
# Write the id and the selected text of every test row, without the index.
testdata.to_csv("submission.csv", columns=["textID","selected_text"], index=False)