In [1]:
import pandas as pd

In [3]:
# Load the combined, preprocessed review data from CSV into a DataFrame.
df = pd.read_csv("data/preprocessed/combined.csv")

In [5]:
# Preview the first five rows to check the columns that were loaded.
df.head()

Unnamed: 0,app_id,app_name,review_text,review_score
0,311210,Call of Duty: Black Ops III,bored.,0
1,206420,Saints Row IV,due to you the fact means that this is just th...,0
2,222880,Insurgency,"i didnt play up it much, and my friend is shar...",0
3,348020,,"this game is absolutely awful, the controls ar...",0
4,253510,Warmachine Tactics,really are poor work from the devs who just se...,0


In [7]:
# Class balance: number of reviews for each label value.
df['review_score'].value_counts()

review_score
1    13730
0     9484
Name: count, dtype: int64

In [9]:
# Count missing values per column.
df.isnull().sum()

app_id            0
app_name        793
review_text       0
review_score      0
dtype: int64

In [11]:
# Number of rows whose review text repeats that of an earlier row.
df.duplicated(subset=['review_text']).sum()

58

In [13]:
# Keep only the first occurrence of each distinct review text.
df = df.drop_duplicates(subset=['review_text'])

In [15]:
# Hugging Face checkpoint identifier; used below to load both the tokenizer and the model.
model_name = "bert-base-uncased"

In [17]:
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoModel
import numpy as np

In [18]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [20]:
# Keep only the two columns the model needs, under the names used from here on.
# Selecting them explicitly (rather than dropping the others) pins the column order
# that ReviewDataset's positional lookups depend on, and makes this cell safe to
# re-run: rename ignores labels that are already renamed, whereas dropping
# already-removed columns would raise a KeyError.
df = df.rename(columns={"review_text": "Review", "review_score": "Rating"})[["Review", "Rating"]]

In [21]:
# Confirm the renamed two-column layout.
df.head()

Unnamed: 0,Review,Rating
0,bored.,0
1,due to you the fact means that this is just th...,0
2,"i didnt play up it much, and my friend is shar...",0
3,"this game is absolutely awful, the controls ar...",0
4,really are poor work from the devs who just se...,0


In [22]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item for sequence classification.

    Args:
        df: DataFrame whose first column holds the review text and whose
            second column holds the integer class label.
        tokenizer: Hugging Face tokenizer (any callable that accepts the
            keyword arguments used in ``__getitem__`` and returns a mapping
            with ``input_ids`` and ``attention_mask`` tensors).
        max_length: Every encoded sequence is truncated or padded to exactly
            this many tokens.
    """

    def __init__(self, df, tokenizer, max_length):
        self.dataset = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (reviews) in the wrapped DataFrame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the encoded review and its label at positional index ``idx``.

        Returns:
            dict with keys ``review_text`` (the raw string), ``input_ids`` and
            ``attention_mask`` (1-D tensors of length ``max_length``) and
            ``labels`` (0-dim ``torch.long`` tensor).
        """
        # Positional access: column 0 is the review text, column 1 the label.
        review_text = self.dataset.iloc[idx, 0]
        labels = self.dataset.iloc[idx, 1]

        # Call the tokenizer directly; this is the supported replacement for the
        # deprecated `encode_plus` and accepts the same keyword arguments.
        encoding = self.tokenizer(
            review_text,
            add_special_tokens=True,      # wrap the text in the model's special tokens
            max_length=self.max_length,
            padding='max_length',         # pad every sample to the same fixed length
            truncation=True,              # cut reviews longer than max_length
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review_text,
            # return_tensors='pt' adds a leading batch axis of size 1; drop it so
            # the DataLoader / collator can stack samples into (batch, max_length).
            'input_ids': encoding['input_ids'].squeeze(0),
            # Padding mask (1 = real token, 0 = padding), not self-attention weights.
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Convert the pandas/numpy scalar to a plain int before building the tensor.
            'labels': torch.tensor(int(labels), dtype=torch.long)
        }

In [23]:
# Tokenizer matching the chosen checkpoint, and the dataset that applies it per item.
tokenizer = AutoTokenizer.from_pretrained(model_name)
review_dataset = ReviewDataset(df=df, tokenizer=tokenizer, max_length=512)

In [24]:
# Inspect one encoded sample: raw text, padded token ids, attention mask and label.
review_dataset[0]

{'review_text': 'bored.',
 'input_ids': tensor([  101, 11471,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             

In [32]:
# Decode the ids back to text to see the special tokens and padding added by the tokenizer.
tokenizer.decode(review_dataset[0]['input_ids'])

'[CLS] bored. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [34]:
# Hold out 20% of the reviews for evaluation; the remaining 80% is used for training.
# Sizes are derived from the dataset that is actually being split.
train_size = int(0.8 * len(review_dataset))
val_size = len(review_dataset) - train_size
# Seed the split so the same reviews land in each subset on every run; without a
# generator the membership (and therefore the reported accuracy) changes per run.
train_dataset, test_dataset = random_split(
    review_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

In [36]:
# Sizes of the training and held-out splits.
len(train_dataset), len(test_dataset)

(18524, 4632)

In [38]:
from torch.utils.data import DataLoader
# Batched iterators over the two splits; only the training loader shuffles.
# Note: the Trainer below is given the datasets directly, so within the visible
# cells only `test_loader` is consumed (by the manual evaluation loop).
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [40]:
# Load the pretrained checkpoint named by `model_name` (bert-base-uncased here) with a
# 2-class sequence-classification head; the head weights are newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Names of every parameter that currently receives gradient updates.
trainable_params = [
    param_name
    for param_name, tensor in model.named_parameters()
    if tensor.requires_grad
]
print("Trainable parameters:", trainable_params)

Trainable parameters: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.lay

In [44]:
# Freeze the pretrained encoder so that only the classification head is trained.
# `base_model` resolves to the backbone of whichever checkpoint `model_name` selects
# (for bert-base-uncased that is `model.bert`), so this cell keeps working if the
# checkpoint is swapped for another architecture.
for param in model.base_model.parameters():
    param.requires_grad = False

In [46]:
# Re-list the trainable parameters to verify that freezing left only the classifier head.
trainable_params = [
    param_name
    for param_name, tensor in model.named_parameters()
    if tensor.requires_grad
]
print("Trainable parameters:", trainable_params)

Trainable parameters: ['classifier.weight', 'classifier.bias']


In [49]:
from sklearn.metrics import accuracy_score
from transformers import Trainer, TrainingArguments
import numpy as np

# Define a custom compute_metrics function
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The logits are reduced with an
            arg-max over axis 1 to obtain the predicted class ids.

    Returns:
        dict: ``{"accuracy": <score>}``.
    """
    scores, targets = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(targets, predicted_classes)}

In [51]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory for model checkpoints
    eval_strategy = "epoch",         # run evaluation once per epoch
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size = 8,  # batch size per device during evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # interval, in steps, between training-log entries
    # NOTE(review): confirm that None disables logging integrations on the installed
    # transformers version; the documented opt-out value is the string "none".
    report_to = None
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training split (torch Subset of ReviewDataset from random_split)
    eval_dataset=test_dataset,           # held-out split, evaluated according to eval_strategy
    compute_metrics=compute_metrics      # adds accuracy to the evaluation metrics
)

In [53]:
# Optional baseline: uncomment to evaluate the model before any training has happened.
# trainer.evaluate()

In [55]:
# Run training with the arguments configured above; evaluation runs after every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6203,0.595368,0.707254
2,0.5774,0.573177,0.676166
3,0.5673,0.543202,0.719128
4,0.5374,0.527631,0.73532
5,0.5015,0.520909,0.736615
6,0.6047,0.513543,0.741364
7,0.5688,0.508595,0.74525
8,0.583,0.506652,0.743955
9,0.5655,0.508714,0.741796
10,0.5053,0.503973,0.748057


TrainOutput(global_step=23160, training_loss=0.5507758456396521, metrics={'train_runtime': 13889.8891, 'train_samples_per_second': 13.336, 'train_steps_per_second': 1.667, 'total_flos': 4.87386918948864e+16, 'train_loss': 0.5507758456396521, 'epoch': 10.0})

In [60]:
from tqdm import tqdm

In [62]:
# Manual accuracy check over the held-out split, independent of the Trainer.
# No earlier cell moves the model explicitly, so make its device placement explicit
# here to match the device the batches are sent to.
model.to(device)
model.eval()  # disable dropout for deterministic predictions
total_correct = 0
total = 0

for batch in tqdm(test_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)  # integer class ids, shape (batch_size,)

    with torch.inference_mode():
        out = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = out.logits  # one logit per class: shape (batch_size, num_labels)

    # Predicted class is the index of the largest logit; no sigmoid/threshold is involved.
    predictions = torch.argmax(logits, dim=1)

    # Both tensors are integer class ids of shape (batch_size,), so compare them directly.
    total_correct += (predictions == labels).sum().item()
    total += labels.size(0)  # Total samples in the batch

accuracy = total_correct / total
print(f'Test Accuracy: {accuracy:.4f}')


100%|██████████| 290/290 [04:13<00:00,  1.15it/s]

Test Accuracy: 0.7481



