<a href="https://colab.research.google.com/github/nklsKrmnn/LSC_Sentiment_Analysis/blob/main/Projekt_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#%pip install -q transformers

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import transformers
from transformers import BertTokenizer, BertModel, BertConfig

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Show full cell contents instead of truncating long strings in the rendered frame.
pd.set_option('display.max_colwidth', None)

# The training file is tab-separated.
train_set = pd.read_csv("train.tsv", sep='\t')
train_set.head(5)

In [6]:
# Print column names, non-null counts and dtypes of the raw training frame.
train_set.info()

In [7]:
# Summary statistics of the numeric columns of the raw training frame.
train_set.describe()

In [8]:
sns.countplot(x='Sentiment', data=train_set)
# --> The classes are not equally represented. We need to augment or bootstrap.

In [9]:
# Collapse to one row per SentenceId, taking the first non-null value of every
# other column within each group.
train_set_full_sentences = (
    train_set
    .groupby('SentenceId')
    .first()
    .reset_index()
)
train_set_full_sentences.head()

In [10]:
sns.countplot(x='Sentiment', data=train_set_full_sentences)
# --> better (class balance compared with the plot of the full phrase table)

In [11]:
# Replace the 'Sentiment' column by one float indicator column per sentiment value;
# the new columns are labelled with the sentiment values themselves.
data_preprocessed = (
    train_set_full_sentences
    .join(pd.get_dummies(train_set_full_sentences['Sentiment'], dtype=float))
    .drop(columns='Sentiment')
)
data_preprocessed.head()

In [12]:
from torch import cuda

# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [13]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200  # max_len handed to the dataset objects (token length of every encoded text)
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the test DataLoader
EPOCHS = 1  # number of times train() is called by the training loop
LEARNING_RATE = 1e-05  # learning rate passed to the Adam optimizer

In [14]:
# Load the tokenizer published under the 'bert-base-uncased' checkpoint name
# (the same checkpoint name the model class loads its encoder from).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [15]:
class data(Dataset):
    """Torch ``Dataset`` that tokenizes one text per item and pairs it with its target vector.

    Args:
        input: Indexable collection of texts (pandas Series, list, ...).
        targets: Per-sample targets, one entry/row per text (pandas DataFrame or
            Series, NumPy array, or list of lists).
        tokenizer: Object exposing ``encode_plus`` and returning ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, input, targets, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.input = input
        self.targets = targets
        self.max_len = max_len

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.input)

    @staticmethod
    def _row(container, index):
        """Return the ``index``-th sample of ``container`` by position."""
        # Plain ``[]`` on a DataFrame selects a *column* by label and on a Series
        # looks up an index *label*; ``.iloc`` always addresses the sample position.
        if hasattr(container, 'iloc'):
            return container.iloc[index]
        return container[index]

    def __getitem__(self, index):
        """Encode sample ``index``.

        Returns:
            dict with long tensors ``ids``, ``mask`` and ``token_type_ids`` and a
            float tensor ``targets`` holding this sample's target values.
        """
        text = str(self._row(self.input, index))
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # ``pad_to_max_length`` is deprecated; request fixed-length padding and
            # truncation explicitly so all items have the same length.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        # Go through NumPy so pandas rows, arrays, lists and scalars all convert alike.
        target_values = np.asarray(self._row(self.targets, index), dtype=np.float32)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(target_values, dtype=torch.float)
        }

In [16]:
# Preview the columns from label 0 onward, i.e. the indicator columns appended by
# get_dummies (presumably one per sentiment value 0-4 -- confirm against the data).
data_preprocessed.loc[:, 0:].head()

In [17]:
# Creating the train/test split and the datasets for the neural network

train_size = 0.8
# Fixed random_state keeps the split reproducible; the test part is everything not sampled.
train_dataset = data_preprocessed.sample(frac=train_size, random_state=200)
test_dataset = data_preprocessed.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print(f"FULL Dataset: {train_set_full_sentences.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

# Hand the indicator columns over as a 2-D float array: indexing a DataFrame with a
# plain integer selects a column by label, whereas the dataset needs one row per sample.
train_targets = train_dataset.loc[:, 0:].to_numpy(dtype=float)
test_targets = test_dataset.loc[:, 0:].to_numpy(dtype=float)

training_set = data(train_dataset['Phrase'], train_targets, tokenizer, MAX_LEN)
testing_set = data(test_dataset['Phrase'], test_targets, tokenizer, MAX_LEN)

In [18]:
# Keyword arguments for the two loaders: both shuffle their samples and use
# num_workers=0; only the batch size differs.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [19]:
# Creating the customized model by adding a dropout and a dense layer on top of BERT to get the final output for the model.

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear layer that emits 5 logits.

    Layers:
        l1: pretrained ``bert-base-uncased`` encoder.
        l2: dropout with probability 0.3.
        l3: linear projection from 768 features to 5 outputs.
    """

    def __init__(self):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 5)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (un-activated) outputs of the linear head.

        Args:
            ids: Token id tensor passed to the encoder.
            mask: Attention mask passed to the encoder as ``attention_mask``.
            token_type_ids: Segment ids passed to the encoder.
        """
        encoder_outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Take element 1 (the pooled output) by index rather than tuple-unpacking:
        # newer transformers versions return a dict-like ModelOutput, and unpacking
        # that yields its string keys instead of tensors. Indexing works for both.
        pooled = encoder_outputs[1]
        dropped = self.l2(pooled)
        return self.l3(dropped)

# Build the classifier and move it to the selected device; the bare name on the
# last line displays the module structure as the cell output.
model = BERTClass().to(device)
model


In [20]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, averaged over all elements.

    Args:
        outputs: Logits produced by the model.
        targets: Float tensor of the same shape as ``outputs``.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [21]:
# Adam over every parameter of the model (encoder and head alike).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [22]:
def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` with ``optimizer``.

    Uses the notebook-level ``model``, ``training_loader``, ``optimizer``,
    ``loss_fn`` and ``device``.

    Args:
        epoch: Epoch number; only used in the progress message.
    """
    model.train()
    for step, batch in enumerate(training_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        # Report the loss of the first batch and of every 5000th batch after it.
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear gradients left over from the previous step before back-propagating.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [23]:
# Run train() once per epoch; the epoch number is passed on for the progress message.
for epoch in range(EPOCHS):
    train(epoch)