In [1]:
!pip install transformers datasets torch scikit-learn pandas tqdm


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_6

In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from tqdm import tqdm
import random
import tarfile
import os
import requests


In [3]:
reviews_url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/scale_whole_review.tar.gz"
ratings_url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/scale_data.tar.gz"

# Download the datasets
!wget -O scale_whole_review.tar.gz $reviews_url
!wget -O scale_data.tar.gz $ratings_url

--2024-07-02 15:10:16--  http://www.cs.cornell.edu/people/pabo/movie-review-data/scale_whole_review.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.53
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.53|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8853204 (8.4M) [application/x-gzip]
Saving to: ‘scale_whole_review.tar.gz’


2024-07-02 15:10:16 (24.3 MB/s) - ‘scale_whole_review.tar.gz’ saved [8853204/8853204]

--2024-07-02 15:10:16--  http://www.cs.cornell.edu/people/pabo/movie-review-data/scale_data.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.53
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.53|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4029756 (3.8M) [application/x-gzip]
Saving to: ‘scale_data.tar.gz’


2024-07-02 15:10:17 (17.9 MB/s) - ‘scale_data.tar.gz’ saved [4029756/4029756]



In [4]:
os.makedirs("reviews", exist_ok=True)
os.makedirs("ratings", exist_ok=True)

In [5]:
with tarfile.open("scale_whole_review.tar.gz", "r:gz") as tar:
    tar.extractall(path="reviews")

with tarfile.open("scale_data.tar.gz", "r:gz") as tar:
    tar.extractall(path="ratings")

In [6]:
!ls reviews
!ls ratings
!ls ratings/scaledata

scaledata.README.1.0.txt  scale_whole_review
scaledata  scaledata.README.1.0.txt
Dennis+Schwartz  James+Berardinelli  Scott+Renshaw  Steve+Rhodes


In [7]:
reviews = []
ratings = []

reviews_base_path = "reviews/scale_whole_review"
ratings_base_path = "ratings/scaledata"

# List the author directories for reviews and ratings
review_authors = [d for d in os.listdir(reviews_base_path) if os.path.isdir(os.path.join(reviews_base_path, d))]
rating_authors = [d for d in os.listdir(ratings_base_path) if os.path.isdir(os.path.join(ratings_base_path, d))]

for author in rating_authors:
    # Load the ratings
    with open(f'{ratings_base_path}/{author}/rating.{author}', 'r', encoding='latin1') as f:
        ratings += [float(line.strip()) for line in f.readlines()]

    # Load the reviews
    review_author_path = f'{reviews_base_path}/{author}/txt.parag'
    review_files = [f for f in os.listdir(review_author_path) if os.path.isfile(os.path.join(review_author_path, f))]
    for review_file in review_files:
        with open(f'{review_author_path}/{review_file}', 'r', encoding='latin1') as f:
            reviews.append(f.read().replace('\n', ' '))

# Convert to df
reviews_df = pd.DataFrame(reviews, columns=["review"])
ratings_df = pd.DataFrame(ratings, columns=["rating"])

# Ensure that the lengths of reviews and ratings match
assert len(reviews_df) == len(ratings_df), "Mismatch between number of reviews and ratings"


In [13]:
print(reviews_df)

                                                 review
0     Cast: Andre Eisermann, Dana Vavrova, Ben Becke...
1     Cast: John Travolta, Gene Hackman, Rene Russo,...
2     Cast: Sylvester Stallone, Harvey Keitel, Ray L...
3     United States, 1998 U.S. Release Date: beginni...
4     Cast: Eddie Mills, Hilary Swank, Jason Robards...
...                                                 ...
5001  TOMORROW NEVER DIES (United Artists) Starring:...
5002  MAFIA!, like its predecessors AIRPLANE!, THE N...
5003  (New Line) Starring: Jackie Chan, Jackson Lou,...
5004  (Paramount) Starring: Andy Garcia, Ian Holm, L...
5005  (Columbia) Starring: Harrison Ford, Brad Pitt,...

[5006 rows x 1 columns]


In [20]:
print(reviews_df.iloc[5004]['review'])


(Paramount) Starring: Andy Garcia, Ian Holm, Lena Olin, Richard Dreyfuss, Ron Liebman, James Gandolfini. Screenplay: Sidney Lumet, based on the novel "Tainted Evidence" by Robert Daley. Producers: Thom Mount and Josh Kramer. Director: Sidney Lumet. MPAA Rating: R (profanity, violence, adult themes) Running Time: 115 minutes. Reviewed by Scott Renshaw. There are a couple of things you can generally count on finding in a Sidney Lumet film: people in positions of power or influence who become corrupted, and the noble crusaders who oppose them. In a career spanning six decades, Lumet has trained his camera on the back-room workings of power in America, creating gems like TWELVE ANGRY MEN, SERPICO and NETWORK in the process. Recent years, however, have found Lumet telling the same cynical story of perverted criminal justice over and over: PRINCE OF THE CITY, Q&A, GUILTY AS SIN. NIGHT FALLS ON MANHATTAN is not a bad film. It's just a terribly familiar one, one which Lumet races through so qu

In [22]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)


In [23]:
class MovieReviewsDataset(Dataset):
    def __init__(self, reviews, ratings, tokenizer, max_len):
        self.reviews = reviews
        self.ratings = ratings
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = str(self.reviews[idx])
        rating = self.ratings[idx]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'rating': torch.tensor(rating, dtype=torch.float)
        }


In [24]:
reviews = reviews_df['review'].tolist()
ratings = ratings_df['rating'].tolist()

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

max_len = 512  # Model's max length
dataset = MovieReviewsDataset(reviews, ratings, tokenizer, max_len)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [25]:
reviews = reviews_df['review'].tolist()
ratings = ratings_df['rating'].tolist()

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

max_len = 512  # Model's max length
dataset = MovieReviewsDataset(reviews, ratings, tokenizer, max_len)

# Split dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)


In [26]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_dataloader) * 3  # Number of epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
loss_fn = torch.nn.MSELoss().to('cuda' if torch.cuda.is_available() else 'cpu')


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def train_epoch(model, dataloader, loss_fn, optimizer, device, scheduler):
    model = model.train()
    losses = []

    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        ratings = batch['rating'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze(-1)

        loss = loss_fn(logits, ratings)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    return np.mean(losses)

def eval_model(model, dataloader, loss_fn, device):
    model = model.eval()
    losses = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            ratings = batch['rating'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze(-1)

            loss = loss_fn(logits, ratings)
            losses.append(loss.item())

    return np.mean(losses)


In [28]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 3

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_loss = train_epoch(model, train_dataloader, loss_fn, optimizer, device, scheduler)
    val_loss = eval_model(model, val_dataloader, loss_fn, device)

    print(f'Train loss: {train_loss:.4f}')
    print(f'Validation loss: {val_loss:.4f}')


Epoch 1/3


100%|██████████| 1001/1001 [03:22<00:00,  4.94it/s]
100%|██████████| 251/251 [00:32<00:00,  7.82it/s]


Train loss: 0.0366
Validation loss: 0.0326
Epoch 2/3


100%|██████████| 1001/1001 [03:34<00:00,  4.67it/s]
100%|██████████| 251/251 [00:32<00:00,  7.66it/s]


Train loss: 0.0334
Validation loss: 0.0323
Epoch 3/3


100%|██████████| 1001/1001 [03:34<00:00,  4.66it/s]
100%|██████████| 251/251 [00:32<00:00,  7.77it/s]

Train loss: 0.0321
Validation loss: 0.0325





In [29]:
from sklearn.metrics import mean_absolute_error, r2_score

def evaluate_model(model, dataloader, device):
    model = model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            ratings = batch['rating'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze(-1)

            all_labels.extend(ratings.cpu().numpy())
            all_predictions.extend(logits.cpu().numpy())

    return all_labels, all_predictions

# Get predictions and true values
true_labels, predictions = evaluate_model(model, val_dataloader, device)

# Compute evaluation metrics
mae = mean_absolute_error(true_labels, predictions)
r2 = r2_score(true_labels, predictions)

print(f'Mean Absolute Error: {mae:.4f}')
print(f'R^2 Score: {r2:.4f}')


100%|██████████| 251/251 [00:32<00:00,  7.69it/s]

Mean Absolute Error: 0.1472
R^2 Score: 0.0158





In [30]:
# Define the review
new_review = "The film was a thrilling adventure with stunning visuals and a compelling storyline. The performances were top-notch, especially from the lead actor. However, the pacing could have been better in some parts."

# Tokenize the review
inputs = tokenizer.encode_plus(
    new_review,
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)


In [31]:
# Put the model in evaluation mode
model.eval()

# Make prediction
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits.squeeze(-1)
    predicted_rating = logits.item()

print(f'Predicted Rating: {predicted_rating:.4f}')


Predicted Rating: 0.6157


In [34]:
# Define the review
new_review = "The best movie I ever seen, wow, just incredible!"

# Tokenize the review
inputs = tokenizer.encode_plus(
    new_review,
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)


In [35]:
# Put the model in evaluation mode
model.eval()

# Make prediction
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits.squeeze(-1)
    predicted_rating = logits.item()

print(f'Predicted Rating: {predicted_rating:.4f}')


Predicted Rating: 0.6045
