<a href="https://colab.research.google.com/github/malazbw/Tweet-Sentiment-Extraction-using-Bert/blob/main/Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
! pip install transformers



In [25]:
import os
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm
from torch.utils.data import DataLoader

In [26]:
#config
epochs = 5
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_size = 16

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
input_path = "/content/drive/MyDrive/twitter/"

In [29]:
# Reading dataset
df = pd.read_csv(input_path + "train.csv").dropna().reset_index(drop=True)
test_df = pd.read_csv(input_path + 'test.csv')

In [30]:
# Returns the index of first occurrence of the substring
def find(df):
    return df['text'].find(df['selected_text'])

In [31]:
df['answer_start'] =df.apply(find, axis =1)
df['answer_end']  = df['answer_start'] + df['selected_text'].str.len()

In [32]:
df.head(5)

Unnamed: 0,textID,text,selected_text,sentiment,answer_start,answer_end
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,1,36
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,1,9
2,088c60f138,my boss is bullying me...,bullying me,negative,11,22
3,9642c003ef,what interview! leave me alone,leave me alone,negative,17,31
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,1,14


In [33]:
# Split the data
train_df, valid_df = model_selection.train_test_split( df, test_size=0.1, random_state=42, stratify=df.sentiment.values )
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

In [34]:
# Split the features into 3 lists (questions, answers, context)
train_answers = train_df[['answer_start', 'answer_end', 'selected_text']].to_dict('records')
train_contexts = train_df['text'].values.tolist()
train_questions = train_df['sentiment'].values.tolist()

valid_answers = valid_df[['answer_start', 'answer_end', 'selected_text']].to_dict('records')
valid_contexts = valid_df['text'].values.tolist()
valid_questions = valid_df['sentiment'].values.tolist()

test_contexts = test_df['text'].values.tolist()
test_questions = test_df['sentiment'].values.tolist()

In [35]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [36]:
# Encoding

In [37]:
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

In [38]:
def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        # end position cannot be found, char_to_token found space, so shift one token forward
        go_back = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end']-go_back)
            go_back +=1
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# apply function to our data
add_token_positions(train_encodings, train_answers)
add_token_positions(valid_encodings, valid_answers)

In [39]:

class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, df):
        self.encodings = encodings
        self.sentiment = df['sentiment'].values
        self.tweet = df['text'].values
        self.selected_text = df['selected_text'].values

    def __getitem__(self, idx):

        return { 'input_ids': torch.tensor(self.encodings["input_ids"][idx], dtype=torch.long),
                 'attention_mask': torch.tensor(self.encodings["attention_mask"][idx], dtype=torch.long),
                 'start_positions': torch.tensor(self.encodings["start_positions"][idx], dtype=torch.long),
                 'end_positions': torch.tensor(self.encodings["end_positions"][idx], dtype=torch.long),
                 'sentiment': self.sentiment[idx],
                 'tweet': self.tweet[idx],
                 'selected_text': self.selected_text[idx],
                }

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = TweetDataset(train_encodings, train_df)
val_dataset = TweetDataset(valid_encodings, valid_df)

In [40]:
class TweetTestSet(torch.utils.data.Dataset):
    def __init__(self, encodings, df):
        self.encodings = encodings
        self.sentiment = df['sentiment'].values
        self.tweet = df['text'].values

    def __getitem__(self, idx):
        return { 'input_ids': torch.tensor(self.encodings["input_ids"][idx], dtype=torch.long),
          'attention_mask': torch.tensor(self.encodings["attention_mask"][idx], dtype=torch.long),
          'sentiment': self.sentiment[idx],
          'tweet': self.tweet[idx],
        }

    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = TweetTestSet(test_encodings, test_df)

In [41]:
# utils
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
    
class AverageMeter():
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [42]:
# Evaluation Functions

In [43]:
def eval_fn(model, device):
  
    # switch model out of training mode
    model.eval()
    losses = AverageMeter()
    jaccards = AverageMeter()

    val_loader = DataLoader(val_dataset, batch_size=16)
    # initialize loop for progress bar
    tk0 = tqdm(val_loader, total=len(val_loader))
    with torch.no_grad():
    # loop through batches
      for batch in tk0:
          # pull batched items from loader
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          start_true = batch['start_positions'].to(device)
          end_true = batch['end_positions'].to(device)
          # make predictions
          outputs = model(input_ids, attention_mask=attention_mask)

          for sample_id, tweet in enumerate(batch["tweet"]):
            answer_target = batch["selected_text"][sample_id]
            if batch["sentiment"][sample_id] == "neutral":
              answer_pred = tweet
            else:
              start_true = batch["start_positions"][sample_id]
              end_true = batch["end_positions"][sample_id]
              start_pred = torch.argmax(outputs['start_logits'][sample_id])
              end_pred = torch.argmax(outputs['end_logits'][sample_id])
              answer_pred = tokenizer.decode(batch['input_ids'][sample_id][start_pred:end_pred+1])
            jaccard_score = jaccard(answer_pred.strip(), answer_target.strip())
            jaccard_scores.append(jaccard_score)


      jaccards.update(np.mean(jaccard_scores), input_ids.size(0))
      losses.update(loss.item(), input_ids.size(0))
      tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)
    
    print(f"Jaccard = {jaccards.avg}")
    return jaccards.avg

In [44]:
# Training and evaluation on valid data

In [45]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [46]:
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(epochs):

    # setup loop (we use tqdm for the progress bar)
    losses = AverageMeter()
    jaccards = AverageMeter()

    #loop = tqdm(train_loader, leave=True)
    tk0 = tqdm(train_loader, total=len(train_loader))
    print("training")
    for batch in tk0:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # extract loss
        loss = outputs[0]
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        jaccard_scores = []

        for sample_id, tweet in enumerate(batch["tweet"]):
          answer_target = batch["selected_text"][sample_id]
          if batch["sentiment"][sample_id] == "neutral":
              answer_pred = tweet
          else:
            start_true = batch["start_positions"][sample_id]
            end_true = batch["end_positions"][sample_id]
            start_pred = torch.argmax(outputs['start_logits'][sample_id])
            end_pred = torch.argmax(outputs['end_logits'][sample_id])
            answer_pred = tokenizer.decode(batch['input_ids'][sample_id][start_pred:end_pred+1])
       

          jaccard_score = jaccard(answer_pred.strip(), answer_target.strip())
          jaccard_scores.append(jaccard_score)
        

        jaccards.update(np.mean(jaccard_scores), input_ids.size(0))
        losses.update(loss.item(), input_ids.size(0))
        tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)
      

    print("evaluation")
    eval_fn(model, device)

HBox(children=(FloatProgress(value=0.0, max=1546.0), HTML(value='')))

training

evaluation


HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard = 0.6661628894547011


HBox(children=(FloatProgress(value=0.0, max=1546.0), HTML(value='')))

training

evaluation


HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard = 0.6660226977220286


HBox(children=(FloatProgress(value=0.0, max=1546.0), HTML(value='')))

training

evaluation


HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard = 0.6587219402059026


HBox(children=(FloatProgress(value=0.0, max=1546.0), HTML(value='')))

training

evaluation


HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard = 0.6545240050226


HBox(children=(FloatProgress(value=0.0, max=1546.0), HTML(value='')))

training

evaluation


HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard = 0.6588995403411595


In [47]:
# Evaluation on test data

In [48]:

model.eval()
losses = AverageMeter()
jaccards = AverageMeter()
final_output = []
#val_sampler = SequentialSampler(val_dataset)

test_loader = DataLoader(test_dataset, batch_size=16)
tk0 = tqdm(test_loader, total=len(test_loader))
with torch.no_grad():
# loop through batches
  for batch in tk0:
      # pull batched items from loader
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      # make predictions
      outputs = model(input_ids, attention_mask=attention_mask)
      for sample_id in range(len(batch["input_ids"])):

        if batch["sentiment"][sample_id] == "neutral":
            answer_pred = batch["tweet"][sample_id]
        else:
          start_pred = torch.argmax(outputs['start_logits'][sample_id])
          end_pred = torch.argmax(outputs['end_logits'][sample_id])
          answer_pred = tokenizer.decode(batch['input_ids'][sample_id][start_pred:end_pred+1])
        final_output.append(answer_pred)


HBox(children=(FloatProgress(value=0.0, max=221.0), HTML(value='')))




In [49]:
sample=pd.read_csv(input_path + 'sample_submission.csv')
sample['selected_text'] = final_output
sample.to_csv("submission.csv", index = False)
display(sample.head(10))

Unnamed: 0,textID,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,exciting ( precisely - - skyscrapers galore )....
2,eee518ae67,shame!
3,01082688c6,happy bday!
4,33987a8ee5,i like it!!
5,726e501993,that ` s great!!
6,261932614e,i think everyone hates me on here lol
7,afa11da83f,completely blocked
8,e64208b4ef,and within a short time of the last clue all ...
9,37bcad24ca,What did you get? My day is alright.. haven`...
