# Sentiment fine-grained classifications-8 project

In [1]:
# Libraries we need to install - If it is already installed you can skip this cell
!pip install sentencepiece
!pip install transformers



In [1]:
# Libraries 
#pretrained model
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import EarlyStoppingCallback

# base model
from torchtext.legacy.data import Field,LabelField,BucketIterator,TabularDataset
from torchtext import vocab
from tqdm import tqdm
import torch.nn.functional as F

#preprocessing and evaluation
import pandas as pd
import numpy as np
import os
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

2021-10-20 22:30:08.459256: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-20 22:30:08.459289: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
import sqlite3

In [3]:
def read_train_dataset():
  conn = sqlite3.connect('../Freelancer/Input/reviews_train.db')
  cursor = conn.cursor ()  
  table_list = [a for a in cursor.execute("SELECT * FROM 'reviews' ")]
  names = list(map(lambda x: x[0], cursor.description))
  #print(names)
  cursor. execute("SELECT name FROM sqlite_master WHERE type='table';")
  reviews  = [a[0] for a in cursor.execute("SELECT review_title FROM 'reviews'")]
  ratings = [a[0] for a in cursor.execute("SELECT rating_diversity FROM 'reviews'")]

  nan_value = "None"
  df = pd.DataFrame ({'reviews':reviews,
                      'ratings': ratings})

  df = df.dropna()

  return df["reviews"], df["ratings"]

In [22]:
train_reviews, train_ratings = read_train_dataset()

In [23]:
train_reviews = train_reviews.values
train_ratings = train_ratings.values
#train_reviews = [" ".join(a) for a in train_reviews]
train_ratings = [int(a)-1 for a in train_ratings]
train_reviews = train_reviews[0:1000] 
train_ratings = train_ratings[0:1000]

In [2]:
def split_dataset_pretrained(dataframe):
    """ Split dataset into train, val and test
    Input:
        dataframe - dataframe dataset
    Returns:
        X_train list train sentences
        y_train list label of train dataset
        X_val list val sentences
        y_val list label of val dataset
        X_test list test sentences
        y_test list label of test dataset
    """
    X_train, temp_text, y_train, temp_labels = train_test_split(list(dataframe["sentences"].values), list(dataframe["label_encoded"].values), 
                                                                    random_state=seed, 
                                                                    test_size=0.2, 
                                                                    stratify=list(dataframe["label_encoded"].values))


    X_val, X_test, y_val, y_test = train_test_split(temp_text, temp_labels, 
                                                                random_state=seed, 
                                                                test_size=0.4, 
                                                                stratify=temp_labels)
    
    return X_train, y_train, X_val, y_val, X_test, y_test



In [6]:
def compute_metrics(p):
    """Compute metrics for evaluation
    p Lists prediction and gold labels for evaluation
    Reurns:
        eval_scores dictionary evaluation scores
    """
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='micro')

    eval_scores = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
    
    return eval_scores

In [7]:
# call model and tokenizer based on your pretraine model
def bert_model(output_label):
    """ Define bert pretrained tokenizer and model
    Input:
        output_label - int the number of classes in the dataset
    Returns:
        tokenizer
        model
    """
    model_name = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)
    
    return tokenizer, model

def distilbert_model(output_label):
    """ Define distilbert pretrained tokenizer and model
    Input:
        output_label - int the number of classes in the dataset
    Returns:
        tokenizer
        model
    """
    model_name = "distilbert-base-uncased"
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=output_label)
    
    return tokenizer, model


def alberta_model(output_label):
    """ Define alberta pretrained tokenizer and model
    Input:
        output_label - int the number of classes in the dataset
    Returns:
        tokenizer
        model
    """
    model_name = "albert-base-v2"
    tokenizer = AlbertTokenizer.from_pretrained(model_name)
    model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=output_label)

    return tokenizer, model

def gpt2_model(output_label):
    """ Define GPT2 pretrained tokenizer and model
    Input:
        output_label - int the number of classes in the dataset
    Returns:
        tokenizer
        model
    """
    model_name = "gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=output_label)
    
    return tokenizer, model


In [12]:
# Load saved model based on your pretrained model
def test_bert_model(model_path, dataset):
    """ Test with bert pretrained model
    Input:
        model_path - path of saved pretrained model
    Returns:
        raw_pred list predictions of test dataset
    """
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=5) 
    test_trainer = Trainer(model)
    raw_pred, _, _ = test_trainer.predict(dataset) 
    
    return raw_pred

def test_bert_model_one_sentence(model_path,sentence,classes):
    model_name = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=8) 
    test_trainer = Trainer(model)
    
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    pred = np.argmax(probs.detach().numpy(), axis=1)
    
    return classes[pred]
    

def test_distilbert_model(model_path, dataset):
    """ Test with distilbert pretrained model
    Input:
        model_path - path of saved pretrained model
    Returns:
        raw_pred list predictions of test dataset
    """
    model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=8) 
    test_trainer = Trainer(model)
    raw_pred, _, _ = test_trainer.predict(dataset) 
    
    return raw_pred

def test_distilbert_model_one_Sentence(model_path,sentence,classes):
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=8) 
    test_trainer = Trainer(model)
    
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    pred = np.argmax(probs.detach().numpy(), axis=1)
    
    return classes[pred]


def test_alberta_model(model, dataset):
    """ Test with alberta pretrained model
    Input:
        model_path - path of saved pretrained model
    Returns:
        raw_pred list predictions of test dataset
    """
    model = AlbertForSequenceClassification.from_pretrained(model_path, num_labels=8) 
    test_trainer = Trainer(model)
    raw_pred, _, _ = test_trainer.predict(dataset) 
    
    return raw_pred

def test_alberta_model_one_Sentence(model_path,sentence,classes):
    tokenizer = AlbertTokenizer.from_pretrained(model_name)
    model = AlbertForSequenceClassification.from_pretrained(model_path, num_labels=8) 
    test_trainer = Trainer(model)
    
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    pred = np.argmax(probs.detach().numpy(), axis=1)
    
    return classes[pred]


def test_gpt2_model(model_path, dataset):
    """ Test with alberta pretrained model
    Input:
        model_path - path of saved pretrained model
    Returns:
        raw_pred list predictions of test dataset
    """
    model = GPT2ForSequenceClassification.from_pretrained(model_path, num_labels=8) 
    test_trainer = Trainer(model)
    raw_pred, _, _ = test_trainer.predict(dataset) 
    
    return raw_pred

def test_gpt2_model_one_Sentence(model_path,sentence,classes):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2ForSequenceClassification.from_pretrained(model_path, num_labels=8) 
    test_trainer = Trainer(model)
    
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    pred = np.argmax(probs.detach().numpy(), axis=1)
    
    return classes[pred]

In [13]:
#Function split dataset into train test and val and convert to torch.data.Dataset
def prepare_dataset_pretrained(tokenizer, dataset):
    """ Prepare dataset
    Input:
        tokenizer - pretrained tokenizer
        dataset - dataframe 
    Returns:
        train_dataset - torch.utils.data.Dataset train Dataset
        val_dataset - torch.utils.data.Dataset val Dataset
        test_dataset - torch.utils.data.Dataset test Dataset
        y_test - list gold labels for the test data
    """
    X_train, y_train, X_val, y_val, X_test, y_test = split_dataset_pretrained(dataset)
    
    X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
    X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
    X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)      
    
    train_dataset = Dataset(X_train_tokenized, y_train)
    val_dataset = Dataset(X_val_tokenized, y_val)
    test_dataset = Dataset(X_test_tokenized, y_test)
    
    return train_dataset, val_dataset, test_dataset, y_test

In [8]:
# Dataset for pretrained model
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [24]:
# Parameters

seed = 1234
np.random.seed(seed) 
torch.manual_seed(seed)
#torch.cuda.manual_seed(seed)
#torch.backends.cudnn.deterministic = True  # cuda algorithms
#os.environ['PYTHONHASHSEED'] = str(seed)
    
batch_size = 64
lr = 1e-4
num_epochs = 500
model_name = 'gpt2' # model name (bert, alberta, distilbert or gpt2  for pretrained) (lstm, rnn, bilestm for base model)
output_path = "output-bert" #create a folder to save pretrained model
model_path = "bert"
embedding_path = "embeddings/glove.6B.50d.txt"
max_length = 512
dataset_base = True # boolean value to split dataset into 
dataset_path = "data/" # path where to save splitted data (it is necessary is dataset_base is True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # use 'cuda' if available else 'cpu'

In [98]:
# Read dataset
def read_dataset(file_path):
    """ Read dataset
    Input:
        file_path - string the path of the dataset
    Returns:
        train dataframe 
    """
    train_data = pd.read_excel(file_path, 'Sheet1')
    
    ''' Should/Must statement
        Should/must statement
        should/must statement labels are 
        converted to Should/Must statement
        
        personalizing is converted to Personalizing''' 
    
    
    train_data.loc[(train_data['label'] == 'should/must statement') | (train_data['label'] == 'Should/must statement')] = 'Should/Must statement' 
    train_data.loc[train_data['label'] == 'personalizing'] = 'Personalizing' 
    
    #Label encoding 
    
    le = LabelEncoder()
    train_data["label_encoded"] = le.fit_transform(train_data["label"]) 
    np.save('classes.npy', le.classes_)
    return train_data

In [11]:
# prepare dataset for pretrained models
#split into train val and test
dataset = read_dataset('L2400.xlsx')

num_output = len(set(dataset["label_encoded"])) # number of classes in the dataset

In [25]:
seed = 1234
#le = LabelEncoder()
#train_ratings_encoded = le.fit_transform(train_ratings) 
#np.save('classes.npy', le.classes_)

X_train, X_val, y_train, y_val = train_test_split(train_reviews, train_ratings, 
                                                                random_state=seed, 
                                                                test_size=0.2, 
                                                                stratify=train_ratings)

In [None]:
num_output = len(set(train_ratings)) 
print(num_output)
# number of classes in the dataset
# select model for pretrained models
if model_name == 'bert':
    tokenizer, model = bert_model(num_output)
elif model_name == 'alberta':
    tokenizer, model = alberta_model(num_output)
elif model_name == 'distilbert':
    tokenizer, model = distilbert_model(num_output)
elif model_name == 'gpt2':
    tokenizer, model = gpt2_model(num_output)
else:
    print('model is not defined')
    
#train_dataset, val_dataset, test_dataset, y_test = prepare_dataset_pretrained(tokenizer, dataset)

5


In [13]:
X_train, X_val, y_train, y_val = train_test_split(train_reviews, train_ratings, 
                                                                random_state=seed, 
                                                                test_size=0.2, 
                                                                stratify=train_ratings)




In [14]:
len(set(y_val))

5

In [21]:
X_train_tokenized = tokenizer(list(X_train), padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(list(X_val), padding=True, truncation=True, max_length=512) 

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

Using pad_token, but it is not set yet.


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [16]:
# Train pretrained model
args = TrainingArguments(
output_dir = output_path,
evaluation_strategy = 'steps',
eval_steps = 500,
per_device_train_batch_size = batch_size,
per_device_eval_batch_size = batch_size,
num_train_epochs = num_epochs,
seed = seed,
load_best_model_at_end = True,)   

trainer = Trainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=val_dataset,
compute_metrics=compute_metrics,
callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],)

trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 500
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 12500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.5 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-10-20 22:31:48.702174: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-20 22:31:48.702193: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.5196,2.907294,0.395,0.395,0.395,0.395
1000,0.2157,3.370244,0.395,0.395,0.395,0.395
1500,0.2035,3.781756,0.38,0.38,0.38,0.38
2000,0.2019,3.776557,0.405,0.405,0.405,0.405


***** Running Evaluation *****
  Num examples = 200
  Batch size = 32
Saving model checkpoint to output-bert/checkpoint-500
Configuration saved in output-bert/checkpoint-500/config.json
Model weights saved in output-bert/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 200
  Batch size = 32
Saving model checkpoint to output-bert/checkpoint-1000
Configuration saved in output-bert/checkpoint-1000/config.json
Model weights saved in output-bert/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 200
  Batch size = 32
Saving model checkpoint to output-bert/checkpoint-1500
Configuration saved in output-bert/checkpoint-1500/config.json
Model weights saved in output-bert/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 200
  Batch size = 32
Saving model checkpoint to output-bert/checkpoint-2000
Configuration saved in output-bert/checkpoint-2000/config.json
Model weights saved in output-bert/checkpoin

TrainOutput(global_step=2000, training_loss=0.28516904449462893, metrics={'train_runtime': 6013.0716, 'train_samples_per_second': 66.522, 'train_steps_per_second': 2.079, 'total_flos': 690685123968000.0, 'train_loss': 0.28516904449462893, 'epoch': 80.0})

In [17]:
# Test model pretrained models
raw_pred, _, _ = trainer.predict(val_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)
y = (raw_pred, y_val)
compute_metrics(y) 

***** Running Prediction *****
  Num examples = 200
  Batch size = 32


{'accuracy': 0.395, 'precision': 0.395, 'recall': 0.395, 'f1': 0.395}

In [None]:
# Only test your data on trained model without training phase - pretrained models
model_path = "path where you stored your model"
test_data_file = "path of the file"
test_dataset = read_dataset(test_data_file)
    
if model_name == 'bert':
    predictions = test_bert_model(model_path, test_dataset)
elif model_name == 'alberta':
    predictions = test_alberta_model(model_path, test_dataset)
elif model_name == 'distilbert':
    predictions = test_distilbert_model(model_path, test_dataset)
elif model_name == 'gpt2':
    predictions = test_gpt2_model(model_path, test_dataset)
else:
    print('model is not defined')
    
y_true = list(dataset["label_encoded"].values)
y = (predictions, y_true)
compute_metrics(y)