In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import torch
import torch.nn as nn
import transformers
import tokenizers
from transformers import (BertModel, 
                          BertConfig, 
                          RobertaConfig, 
                          RobertaModel)
from tokenizers import ByteLevelBPETokenizer
import os

%matplotlib inline

In [2]:
bert_config = BertConfig.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")

roberta_config = RobertaConfig.from_pretrained("roberta-base")
roberta_config.output_attentions = False
roberta_config.output_hidden_states = True
roberta_config.output_past = True

roberta = RobertaModel.from_pretrained("roberta-base", config=roberta_config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [5]:
def read_data(datadir):
    train = pd.read_csv(os.path.join(datadir, "train.csv"))
    test = pd.read_csv(os.path.join(datadir, "test.csv"))
    sample_submission = pd.read_csv(os.path.join(datadir, "sample_submission.csv"))

    return (train, test, sample_submission)

In [6]:
train, test, _ = read_data("../data/")

In [7]:
train

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [5]:
pos = train.iloc[27478]
text, st = pos.text, pos.selected_text

print(text)
print(st)

 Yay good for both of you. Enjoy the break - you probably need it after such hectic weekend  Take care hun xxxx
Yay good for both of you.


In [6]:
bpe_tokenizer = ByteLevelBPETokenizer(
            vocab_file="../pretrained/roberta-base-vocab.json",
            merges_file="../pretrained/roberta-base-merges.txt",
            lowercase=True,
            add_prefix_space=True,
        )

roberta_tokenizer = transformers.RobertaTokenizer.from_pretrained("roberta-base")

bert_tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [7]:
MAX_LEN = 96

ids = np.ones((1, MAX_LEN), dtype=np.int32)
attention_mask = np.zeros((1, MAX_LEN), dtype=np.int32)
token_type_ids = np.zeros((1, MAX_LEN), dtype=np.int32)

In [8]:
encoded = bpe_tokenizer.encode(text)

# encode_length = 1 + len(encoded.ids) + 2 + 1 + 1

# ids[0,:encode_length] = [0] + encoded.ids + [2, 2] + [1331] + [1]
# attention_mask[0,:encode_length] = [1] * encode_length

In [9]:
e = roberta_tokenizer.build_inputs_with_special_tokens(bpe_tokenizer.encode(text).ids, 
                                                      bpe_tokenizer.encode('positive').ids)

In [10]:
e = roberta_tokenizer.create_token_type_ids_from_sequences(bpe_tokenizer.encode(text).ids, 
                                                      bpe_tokenizer.encode('positive').ids)

In [58]:
MAX_LEN = 96

encoded = bpe_tokenizer.encode(text)

ids = torch.tensor([encoded.ids])
mask = torch.tensor([encoded.attention_mask])
token_type_ids = torch.tensor([encoded.type_ids])

In [59]:
outputs = roberta(ids, attention_mask=mask, token_type_ids=token_type_ids)

In [60]:
lengths = []
for _, row in train.iterrows():
    if isinstance(row.text, str):
        encoded = roberta_tokenizer.encode(row.text)
        lengths += [len(encoded.ids)]

AttributeError: 'list' object has no attribute 'ids'

In [None]:
f, ax = plt.subplots(1, 1, figsize=(12,4))
sns.countplot(lengths, ax=ax)

print(f"Maximum token size: {max(lengths)}")

In [72]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None

    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break

    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
    
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }
    
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }


In [14]:

class Config:
    datadir = "../data"
    modelsdir = "../models"
    roberta_vocab = "../pretrained/roberta-base-vocab.json"
    roberta_merges = "../pretrained/roberta-base-merges.txt"


def initialize_tokenizer():
    print("Tokenizer getting loaded...")

    tokenizer = ByteLevelBPETokenizer(
        vocab_file=Config.roberta_vocab,
        merges_file=Config.roberta_merges,
        lowercase=True,
        add_prefix_space=True)

    vocab_size = tokenizer.get_vocab_size()
    print(f"Vocab size = {vocab_size:,}")

    return tokenizer

In [15]:
tokenizer = initialize_tokenizer()

Tokenizer getting loaded...
Vocab size = 50,265


In [9]:
class TweetModel(transformers.BertModel):
    def __init__(self, config):
        super(TweetModel, self).__init__(config)
        print("Importing model...")
        self.roberta = transformers.RobertaModel.from_pretrained("roberta-base", config=config)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(config.hidden_size*2, 2)

    def forward(self, ids, attention_mask, token_type_ids):
        _, _, out = self.roberta(
                        ids, 
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
        
        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.dropout(out)
        logits = self.linear(out)

        start, end = logits.split(1, dim=-1)

        return start.squeeze(-1), end.squeeze(-1)

In [10]:
model = TweetModel(roberta_config)

Importing model...


In [12]:
list(model.linear.parameters())

[Parameter containing:
 tensor([[-0.0020, -0.0073,  0.0193,  ...,  0.0178,  0.0176, -0.0058],
         [ 0.0056,  0.0164,  0.0112,  ...,  0.0035, -0.0208,  0.0130]],
        requires_grad=True),
 Parameter containing:
 tensor([ 0.0026, -0.0124], requires_grad=True)]

In [17]:
roberta_config.output_hidden_states = True
tweetmodel = TweetModel(roberta_config)

In [18]:
MAX_LEN = 96

encoded = bpe_tokenizer.encode(text)

ids = torch.tensor([encoded.ids])
attention_mask = torch.tensor([encoded.attention_mask])
token_type_ids = torch.tensor([encoded.type_ids])

logits = tweetmodel(ids, attention_mask, token_type_ids)

In [19]:
start, end = logits[:,:,0], logits[:,:,1]
ground = torch.zeros_like(start)
ground[:,5] = 1

In [20]:
loss = nn.CrossEntropyLoss()

# Process tweet

In [461]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None

    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break

    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
    
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }
    
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }


In [455]:
def process_tweet(tweet, selected_text, sentiment, tokenizer, max_len):
    sentiment_ids = {
            'positive': 1313, 
            'negative': 2430, 
            'neutral': 7974
    }
    
    # initializing ids, attention_mask, token_typel_ids
    ids = np.zeros((max_len), dtype=np.int32)
    attention_mask = np.zeros((max_len), dtype=np.int32)
    token_type_ids = np.zeros((max_len), dtype=np.int32)
    
    # removing extra spaces and encoding tweet
    tweet = " " + " ".join(tweet.split())
    selected_text = " ".join(selected_text.split())
    encoded_tweet = tokenizer.encode(tweet)
        
    # filling the ids and attention_mask
    ids_valid = [0] + [sentiment_ids[sentiment]] + [2, 2] + encoded_tweet.ids + [2]
    len_valid = len(ids_valid)
    attention_mask_valid = [1] * len_valid

    ids[:len_valid] = ids_valid
    attention_mask[:len_valid] = attention_mask_valid
    
    
    selected_text_len = len(selected_text)

    for idx, char in enumerate(tweet):
        if char == selected_text[0]:
            if tweet[idx:selected_text_len+idx] == selected_text:
                char_start = idx
                char_end = char_start + selected_text_len

    assert char_start is not None
    assert char_end is not None
    assert tweet[char_start:char_end] == selected_text
        
    for token_index, (offset_start, offset_end) in enumerate(encoded_tweet.offsets):
        if (char_start >= offset_start) and (char_start <= offset_end):
#             print(tweet[offset_start:offset_end], char_start, offset_start, offset_end, token_index)
            token_start = token_index
        if (char_end-1 >= offset_start) and (char_end <= offset_end):
#             print(tweet[offset_start:offset_end], char_end, offset_start, offset_end, token_index)
            token_end = token_index
        
    assert token_start is not None
    assert token_end is not None
#     print(bpe_tokenizer.decode(encoded_tweet.ids[token_start:(token_end+1)]))
#     assert selected_text.lower() in bpe_tokenizer.decode(encoded_tweet.ids[token_start:(token_end+1)])
    
    token_start += 4
    token_end += 4
    
    return {
        "ids": ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "token_start": token_start,
        "token_end": token_end,
        "tweet": tweet,
        "selected_text": selected_text
    }
    

    

In [468]:
idx = 1495
record = train.iloc[idx]
tweet, selected_text, sentiment = record.text, record.selected_text, record.sentiment

In [469]:
print(tweet)
print(selected_text)
print(len(tweet))

 vocï¿½ que sumiu forever do msn.
vocï¿½ que sumiu forever do msn.
33


In [476]:
out1 = process_tweet(tweet, selected_text, sentiment, tokenizer, max_len)
out2 = process_data(tweet, selected_text, sentiment, tokenizer, max_len)

 vocï¿½ que sumiu forever do ms


In [477]:
out1["token_start"], out1["token_end"]

(4, 14)

In [478]:
out2["targets_start"], out2["targets_end"]

(4, 14)

In [486]:
sum(out2["mask"]), sum(out1["attention_mask"])

(18, 18)

In [473]:
e = bpe_tokenizer.encode(tweet)

In [475]:
for i, (o, token) in enumerate(zip(e.offsets, e.tokens)):
    print(tweet[o[0]: o[1]], "--", token)

 voc -- Ġvoc
ï¿ -- Ã¯
½ -- Â
  -- ¿
qu -- Â½
e su -- Ġque
miu  -- Ġsum
fo -- iu
rever do -- Ġforever
 ms -- Ġdo
n. -- Ġms
 -- n
 -- .


In [488]:
tweet.isascii()

False

In [495]:
(train
 .dropna() 
 .reset_index(drop=True)
 .assign(allascii=lambda x: x.text.apply(lambda x: x.isascii()))
 .allascii
 .sum()
)

27324

In [496]:
(test
 .dropna() 
 .reset_index(drop=True)
 .assign(allascii=lambda x: x.text.apply(lambda x: x.isascii()))
 .allascii
 .sum()
)

3520

In [497]:
train.index.size

27480

In [498]:
test.index.size

3534

In [500]:
(27480 - 27324) / 27480

0.005676855895196507

In [None]:
156

In [471]:
out1

{'ids': array([    0,  7974,     2,     2, 28312, 29667,  4056,  9470, 14989,
         1192,  6797,  9060,  6000,   109, 43601,   282,     4,     2,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0], dtype=int32),
 'attention_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [60]:
kfold_indices(train)

Unnamed: 0,textID,text,selected_text,sentiment,fold
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,1
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,5
2,088c60f138,my boss is bullying me...,bullying me,negative,1
3,9642c003ef,what interview! leave me alone,leave me alone,negative,5
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,1
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,1
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,3
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,1
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,1


In [47]:
import pandas as pd

In [56]:
merged = 

In [58]:
merged.groupby(["fold", "sentiment"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,textID,text,selected_text
fold,sentiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,negative,1556,1556,1556
1,neutral,2224,2224,2224
1,positive,1717,1717,1717
2,negative,1556,1556,1556
2,neutral,2224,2224,2224
2,positive,1716,1716,1716
3,negative,1556,1556,1556
3,neutral,2224,2224,2224
3,positive,1716,1716,1716
4,negative,1557,1557,1557


In [24]:
right

array([    1,     3,     7, ..., 27458, 27463, 27465])

In [29]:
train.iloc[right].sort_values(by='textID')

Unnamed: 0,textID,text,selected_text,sentiment
14660,000a596b74,wah....American Chocolate Cheese Cake for my c...,Cheese Cake for my chweet mummy ! muaxxx,positive
2953,000efa962a,I`m going to watch it now,I`m going to watch it now,neutral
2300,002210304b,I`d Love to work with you! http://myspace.com...,I`d Love,positive
14546,0025cd3554,Might be in the middle of a perfect weekend...,perfect,positive
15686,002f1aaedf,I`m ready homie..actually ima be working,working,positive
...,...,...,...,...
13016,ff68d28c4d,That would have been fun BUT we were soooo TI...,soooo TIRED yesterday. Long day at the office...,negative
24890,ff6d7db82a,"Back from SOAP, soooo fun.",soooo fun.,positive
18821,ff7edd8115,Well now I`m gonna be on a mission to find th...,Well now I`m gonna be on a mission to find the...,neutral
24227,ff84fbc83d,"Maybe you have, my memory sucks.",my memory sucks.,negative


In [32]:
kfold.get_n_splits(X=np.arange(train.index.size), y=train.sentiment)

5