In [28]:
!pip install transformers
!pip install pandas
!pip install datasets
!pip install scikit-learn
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlo

In [1]:
import transformers # from HuggingFace
from transformers import AutoTokenizer # a librabry that has tokenizers for all of the different models
from transformers import pipeline
from transformers import AutoModelForSequenceClassification # the bert model I'll used
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from torch.utils.data import DataLoader

In [2]:
# This uses Bert's Tokenizer to tokenize the reviews 
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Loading IMDb Dataset To Fine-Tune BERT

path = "clean_IMDBdataset.csv"
df = pd.read_csv(path, sep=",")

In [3]:
# Read a comma-separated values (csv) file into DataFrame
path = "clean_IMDBdataset.csv"
df = pd.read_csv(path, sep=",")

In [4]:
# This changes all of the review labels to be 0s or 1s instead of "positive or negative" 
df['Sentiment'] = df['Sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

Splitting (into Train, Validation, and Test) and Tokenizing the Data

In [5]:
# Splitting the dataset into 80% train, 10% test, 10% validation pandas dataframes
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train, Y_test = train_test_split(df, df, test_size=0.2, random_state=30)
X_test,X_valid,Y_test, Y_valid = train_test_split(X_test, Y_test, test_size=0.5, random_state=30)

# Switching each of the pandas dataframe into a Hugging Face Dataset
X_tr_dataset = Dataset.from_pandas(X_train)
X_test_dataset = Dataset.from_pandas(X_test)
X_v_dataset = Dataset.from_pandas(X_valid)

# The function that tokenizes strings, and more importantly, pads them so they can be stacked into batches
def tokenize_function(examples):
    return tokenizer(examples["Phrase"], padding="max_length", truncation=True)


# Tokenizes the data
tokenizedX_train = X_tr_dataset.map(tokenize_function, batched=True)
tokenizedX_test = X_test_dataset.map(tokenize_function, batched=True)
tokenizedX_valid = X_v_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Getting the DataSet into the format BERT will recognize

In [20]:
# Remove the column with the actual reviews (as opposed to the review ids) because the model does not accept raw text as an input!
tokenizedX_trainE = tokenizedX_train.remove_columns(["Phrase"])
tokenizedX_testE = tokenizedX_test.remove_columns(["Phrase"])
tokenizedX_validE = tokenizedX_valid.remove_columns(["Phrase"])

# Removes another unnecesary column
tokenizedX_trainE = tokenizedX_trainE.remove_columns(["__index_level_0__"])
tokenizedX_testE = tokenizedX_testE.remove_columns(["__index_level_0__"])
tokenizedX_validE = tokenizedX_validE.remove_columns(["__index_level_0__"])


# Makes sure the columns are titled in the way that the model expects
tokenizedX_trainf = tokenizedX_trainE.rename_column("Sentiment", "labels")
tokenizedX_testf = tokenizedX_testE.rename_column("Sentiment", "labels")
tokenizedX_validf = tokenizedX_validE.rename_column("Sentiment", "labels")


# Sets the format of the dataset to return PyTorch tensors instead of lists
tokenizedX_trainf.set_format("torch")
tokenizedX_testf.set_format("torch")
tokenizedX_validf.set_format("torch")


In [21]:
# Creates 2 DataLoaders, one for the training dataset and one for the testing dataset
# Batch_size = 8
train_dataloader = DataLoader(tokenizedX_trainf, shuffle=True, batch_size=8)
valid_dataloader = DataLoader(tokenizedX_validf, batch_size=8)


In [22]:
# Loading the model with the expected number of labels, for the IMDb dataset, there are two different labels, 0 or 1!
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [23]:
# Creating the optimizer
from torch.optim import AdamW
 
optim = AdamW(model.parameters(), lr = 5e-5)

In [24]:
# This is a learning rate scheduler, which adjusts the learning rate during training by 
# reducing the learning rate according to a pre-defined schedule
from transformers import get_scheduler

num_epochs = 3
num_train_steps = num_epochs * len(train_dataloader)

# Thiis is the default learning rate scheduler from Hugging Face's Trainer class 
# num_warmup_steps (which is optional) allows the learning rate (lr) to start low over the number of warmup steps
# The learning rate will decrease in a linear fashion as it goes through the number of training steps
lr_scheduler = get_scheduler(
    name = "linear", optimizer = optim, num_warmup_steps = 0, num_training_steps = num_train_steps
)


In [25]:
# Allows for a connection to a GPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [26]:
# This creates a progress bar
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_train_steps))

# Training loop!
model.train()
for ep in range(num_epochs):
  for batch in train_dataloader:
    # The batch is a dictionary, which is important to note
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward

    optim.step()
    lr_scheduler.step()
    optim.zero_grad()
    progress_bar.update(1)

  0%|          | 0/15000 [00:00<?, ?it/s]

In [29]:
# This is how I evaluate the model
import evaluate

# Works similarly to the training loop when loading the data from the dataloader
metric = evaluate.load("accuracy")
model.eval()
for batch in valid_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.4992}

In [89]:
# Loadinging the Rotten Tomatoes Dataset
rt_path = "RottenTomatoes/DataSet/train.tsv"
rt_df = pd.read_csv(rt_path, sep="\t")

In [90]:
# Removing punctuation
rt_df['Phrase'] = rt_df['Phrase'].str.replace(r'[^\w\s]+', '')
# Removing numbers
rt_df['Phrase'] = rt_df['Phrase'].str.replace(r'\d+', '')
# Making it all lower case
rt_df['Phrase'] = rt_df['Phrase'].str.lower()
# Remove non-asci characters
rt_df.Phrase.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
# Converting the star ratings to integers
rt_df['Sentiment']=rt_df['Sentiment'].astype(int)
# Getting rid of the 2-star ratings, because there were too many neutral reviews
rt_df= rt_df[rt_df['Sentiment']!=2]

In [91]:
# This changes all of the review labels to be 0s or 1s instead, so we can feed it to the Bert that was trained on polarity 
rt_df['Sentiment'] = rt_df['Sentiment'].apply(lambda x: 0 if x == 1 else x)
rt_df['Sentiment'] = rt_df['Sentiment'].apply(lambda x: 1 if x == 3 else x)
rt_df['Sentiment'] = rt_df['Sentiment'].apply(lambda x: 1 if x == 4 else x)

In [93]:
from sklearn.model_selection import train_test_split
rt_X_train,rt_X_test,rt_Y_train, rt_Y_test = train_test_split(rt_df, rt_df, test_size=0.2, random_state=30)
rt_X_test,rt_X_valid,rt_Y_test, rt_Y_valid = train_test_split(rt_X_test, rt_Y_test, test_size=0.5, random_state=30)

In [94]:

rt_X_test = pd.DataFrame(rt_X_test)

# Switching each of the pandas dataframe into a Hugging Face Dataset
rt_X_test_dataset = Dataset.from_pandas(rt_X_test)

# The function that tokenizes strings, and more importantly, pads them so they can be stacked into batches
def tokenize_function(examples):
    return tokenizer(examples["Phrase"], padding="max_length", truncation=True)

# Tokenizes the data
tokenized_rt_X_test = rt_X_test_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/7648 [00:00<?, ? examples/s]

In [95]:
# Remove the column with the actual reviews (as opposed to the review ids) because the model does not accept raw text as an input!
tokenized_rt_X_testa = tokenized_rt_X_test.remove_columns(["Phrase"])

# Removes another unnecesary column
tokenized_rt_X_testb = tokenized_rt_X_testa.remove_columns(["PhraseId"])

# Removes another unnecesary column
tokenized_rt_X_testc = tokenized_rt_X_testb.remove_columns(["SentenceId"])

# Makes sure the columns are titled in the way that the model expects
tokenized_rt_X_testd = tokenized_rt_X_testc.rename_column("Sentiment", "labels")

tokenized_rt_X_teste = tokenized_rt_X_testd.remove_columns(["__index_level_0__"])


# Sets the format of the dataset to return PyTorch tensors instead of lists
tokenized_rt_X_teste.set_format("torch")


In [96]:
# Creating the DataLoader that will be used to test the model against the 
rt_eval_dataloader = DataLoader(tokenized_rt_X_teste, batch_size=8)

In [98]:
# evaluating the model with the Rotten Tomatoes dataset
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in rt_eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.5489016736401674}