# Read the dataset from a CSV file.

In [2]:
import pandas as pd

In [8]:
data = pd.read_csv('/content/IMDB Dataset.csv')

In [9]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Process the Data

In [10]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
def process_data(row):

    text = row['review']
    text = str(text)
    text = ' '.join(text.split())

    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=512)

    label = 0
    if row['sentiment'] == 'positive':
        label += 1

    encodings['label'] = label
    encodings['text'] = text

    return encodings

In [12]:
processed_data = []

sample_data = pd.concat([
    data[data['sentiment'] == 'positive'].sample(frac=0.2),
    data[data['sentiment'] == 'negative'].sample(frac=0.2)
])

for i in range(len(sample_data)):
    processed_data.append(process_data(sample_data.iloc[i]))

# Generate the Dataset

In [13]:
from sklearn.model_selection import train_test_split

new_df = pd.DataFrame(processed_data)

train_df, valid_df = train_test_split(
    new_df,
    test_size=0.2,
    random_state=2022
)

In [16]:
import pyarrow as pa
from datasets import Dataset

train_hg = Dataset(pa.Table.from_pandas(train_df))
valid_hg = Dataset(pa.Table.from_pandas(valid_df))

# Create a Model

In [17]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./result",
    evaluation_strategy="epoch",
    optim="adamw_torch",
    report_to="none",
    no_cuda=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hg,
    eval_dataset=valid_hg,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.329,0.299659
2,0.16,0.369388
3,0.0456,0.484703


TrainOutput(global_step=3000, training_loss=0.19540439542134602, metrics={'train_runtime': 2619.0983, 'train_samples_per_second': 9.163, 'train_steps_per_second': 1.145, 'total_flos': 6314665328640000.0, 'train_loss': 0.19540439542134602, 'epoch': 3.0})

In [20]:
trainer.evaluate()

{'eval_loss': 0.4847026467323303,
 'eval_runtime': 56.0767,
 'eval_samples_per_second': 35.665,
 'eval_steps_per_second': 4.458,
 'epoch': 3.0}

# Save the Model

In [23]:
model.save_pretrained('/Users/theelusivegerbilfish/Python_Projects/IMDB_ratings/model/')

# Load the model

In [25]:
import torch

In [26]:
from transformers import AutoModelForSequenceClassification

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

new_model = AutoModelForSequenceClassification.from_pretrained('/Users/theelusivegerbilfish/Python_Projects/IMDB_ratings/model/').to(device)


In [27]:
from transformers import AutoTokenizer

new_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Get Predictions

In [33]:
import torch
import numpy as np

def get_prediction(text):
    encoding = new_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

    outputs = new_model(**encoding)

    logits = outputs.logits
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    probs = probs.detach().numpy()
    label = np.argmax(probs, axis=-1)

    if label == 1:
        return {
            'sentiment': 'Positive',
            'probability': probs[1]
        }
    else:
        return {
            'sentiment': 'Negative',
            'probability': probs[0]
        }

In [37]:
get_prediction('I love this movie!')['sentiment']

'Positive'

In [47]:
from sklearn.metrics import confusion_matrix,classification_report

In [38]:
y_pred = data['review'].apply(lambda x: get_prediction(x)['sentiment']).to_numpy()

In [40]:
y_true = data['sentiment'].to_numpy()

In [41]:
y_true

array(['positive', 'positive', 'positive', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [44]:
y_pred = np.array([word.lower() for word in y_pred])

In [46]:
confusion_matrix(y_true,y_pred)

array([[21565,  3435],
       [ 2888, 22112]])

In [49]:
print(classification_report(y_true,y_pred))

              precision    recall  f1-score   support

    negative       0.88      0.86      0.87     25000
    positive       0.87      0.88      0.87     25000

    accuracy                           0.87     50000
   macro avg       0.87      0.87      0.87     50000
weighted avg       0.87      0.87      0.87     50000

