# Sentiment Analysis with Deep Learning using BERT

## Prerequisites:


## Introduction

### What is BERT?
BERT is a widely used machine learning model in the NLP space. It is a large-scale, transformer-based language model that can be fine-tuned for a variety of tasks.

Transformers are an advancement over RNNs. With transformers you can parallelize processing of the input during training and inference.
They take a fixed input size.


For more information, the original paper can be found here: 

[hugging face link]

    
## Data
Smile twitter dataset:


In [None]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Read the annotation file. Column names are supplied explicitly, so the first
# line of the file is treated as data; the 'id' column then becomes the index.
df = pd.read_csv(
    '../data/smile_twitter_data/smile-annotations-final.csv',
    names=['id', 'text', 'category'],
).set_index('id')

In [None]:
# Preview the first rows to confirm the three columns were parsed as expected.
df.head()

## Basic pre-processing

In [None]:
# Number of rows per distinct value of the 'category' column, before any filtering.
df.category.value_counts()

In [None]:
# Drop rows whose category contains a literal '|' (presumably multi-label
# annotations - confirm against the value counts).
# regex=False matches the character directly; the previous pattern '\|' was an
# invalid escape sequence in a non-raw string.
df = df[~df.category.str.contains('|', regex=False)]

In [None]:
# Remove rows whose category is 'nocode'.
# .copy() makes df an independent frame, so the column assignments made later
# in the notebook do not trigger pandas' SettingWithCopyWarning.
df = df[df.category != 'nocode'].copy()

In [None]:
# Per-category row counts again, now that the filters above have been applied.
df.category.value_counts()

In [None]:
# Distinct category values, in order of first appearance in the frame.
possible_labels = df['category'].unique()

In [None]:
# Map each category name to a consecutive integer id (0, 1, 2, ...) following
# the order of possible_labels.
label_dict = {name: idx for idx, name in enumerate(possible_labels)}

In [None]:
# Display the category -> integer id mapping.
label_dict

In [None]:
# Attach the integer class id for every row.
# label_dict was built from df.category.unique(), so every category has an
# entry and map() leaves no missing values. map() is used rather than replace()
# because replacing strings with ints relies on implicit downcasting that
# recent pandas releases deprecate.
df['label'] = df.category.map(label_dict)
df.head()

## Train Test split

In [None]:
# Split the row ids (the frame's index) 85/15, stratified on the label so the
# class proportions are preserved in both parts; random_state fixes the shuffle.
# NOTE(review): `y_test` holds the labels that pair with `X_val`; the name is
# left unchanged here in case other cells reference it.
X_train, X_val, y_train, y_test = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=17,
    stratify=df.label.values,
)

In [None]:
# Add a 'data_type' column with the placeholder 'not_set' on every row
# (a scalar is broadcast to the full column).
df['data_type'] = 'not_set'

In [None]:
# Inspect the frame now that the 'data_type' column exists.
df.head()

In [None]:
# Tag each row with the split it belongs to, selecting rows by their index id.
for split_name, split_ids in (('train', X_train), ('val', X_val)):
    df.loc[split_ids, 'data_type'] = split_name

In [None]:
# Count rows per (category, label, data_type) combination to check how each
# class is distributed across the train and val splits.
df.groupby(['category', 'label', 'data_type']).count()

## Loading Tokenizer and Encoding our Data

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [None]:
# Load the tokenizer published with the 'bert-base-uncased' checkpoint and
# lowercase all input text before tokenizing.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Tokenize every training text into fixed-length id sequences (PyTorch tensors).
# `pad_to_max_length` is deprecated in transformers; `padding='max_length'` is
# its replacement. `truncation=True` is set explicitly so any text longer than
# max_length is cut to 256 tokens instead of producing rows of unequal length.
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

In [None]:
# Tokenize every validation text with the same settings as the training split.
# `pad_to_max_length` is deprecated in transformers; `padding='max_length'` is
# its replacement. `truncation=True` is set explicitly so any text longer than
# max_length is cut to 256 tokens instead of producing rows of unequal length.
encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

In [None]:
# List the fields returned by the tokenizer for the training split.
encoded_data_train.keys()

In [None]:
# The model is fed token ids, the attention mask and the integer labels.
# Labels are taken with the same data_type filter used for encoding, so they
# stay aligned row-for-row with the encoded texts.
input_ids_train = encoded_data_train['input_ids']
attention_mask_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df.loc[df.data_type == 'train', 'label'].values)

In [None]:
# Same three tensors for the validation split; labels use the same data_type
# filter as the encoding step, keeping them aligned with the encoded texts.
input_ids_val = encoded_data_val['input_ids']
attention_mask_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df.loc[df.data_type == 'val', 'label'].values)

In [None]:
# Inspect the token-id tensor produced for the training split.
encoded_data_train['input_ids']

In [None]:
# Inspect the token_type_ids field; it is only displayed here and is not
# included in the TensorDataset built below.
encoded_data_train['token_type_ids']

In [None]:
# Inspect the attention-mask tensor produced for the training split.
encoded_data_train['attention_mask']

In [None]:
# Bundle the aligned tensors of each split; every dataset item is the tuple
# (input_ids, attention_mask, label) for one row.
dataset_train = TensorDataset(
    input_ids_train,
    attention_mask_train,
    labels_train,
)
dataset_val = TensorDataset(
    input_ids_val,
    attention_mask_val,
    labels_val,
)

In [None]:
# Number of examples in the training dataset.
len(dataset_train)

In [None]:
# Number of examples in the validation dataset.
len(dataset_val)

## Setting up BERT Pretrained Model

In [None]:
from transformers import BertForSequenceClassification

In [None]:
# Fine-tuning setup: start from the published 'bert-base-uncased' weights and
# configure the sequence-classification model with one output per entry in
# label_dict. Returning attention maps and hidden states is switched off.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

## Creating Data Loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
# Mini-batch size used for training.
batch_size = 4

# Training batches are drawn in a fresh random order on every pass.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation is read in dataset order: shuffling brings no benefit at
# evaluation time, and a fixed order keeps predictions aligned with the
# dataset rows from run to run.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=32
)

## Setting up Optimizer and Scheduler

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# in favour of torch.optim.AdamW.
# weight_decay=0.0 is passed explicitly because torch defaults to 0.01, while
# the transformers class this replaces defaulted to no weight decay.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8,
    weight_decay=0.0,
)

In [None]:
# Number of full passes over the training data.
epochs = 10

# Linear learning-rate schedule with no warm-up steps; the total step count is
# the number of training batches per epoch multiplied by the number of epochs.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(dataloader_train),
)

## Defining our Performance Metrics

In [None]:
import numpy as np
from sklearn.metrics import f1_score


In [None]:
def f1_score_func(preds,labels):
    preds_flat = np.argmax(preds, axis=1).flatten()