# Homework 2: Rating Prediction from Review Text

In [2]:
YELP_TRAIN_DIR = r"C:\Users\Nebiyou Hailemariam\Desktop\development\Machine-Learning-with-Graphs\Assignment 2\data\yelp_reviews_train.json"
YELP_DEV_DIR = r"C:\Users\Nebiyou Hailemariam\Desktop\development\Machine-Learning-with-Graphs\Assignment 2\data\yelp_reviews_dev.json"
YELP_TEST_DIR = r"C:\Users\Nebiyou Hailemariam\Desktop\development\Machine-Learning-with-Graphs\Assignment 2\data\yelp_reviews_test.json"
STOP_WORD_DIR = r"C:\Users\Nebiyou Hailemariam\Desktop\development\Machine-Learning-with-Graphs\Assignment 2\data\stopword.list"

In [3]:
import json

def load_json_file(file_path):
    try:
        with open(file_path, 'r') as file:
            json_data = [json.loads(line) for line in file]
            return json_data
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error: {e}")
        return None

def get_stop_words(file_path):
    stop_words = {}
    with open(file_path, 'r') as file:
        for line_number, word in enumerate(file, start=1):
            word = word.strip()  # Remove leading and trailing whitespaces
            stop_words[word] = True
    return stop_words

train_data = load_json_file(YELP_TRAIN_DIR)

stop_words = get_stop_words(STOP_WORD_DIR)

In [4]:
def star_distribution(reviews):
    stars = {}

    for review in reviews:
        stars[review["stars"]] = 1 + stars.get(review["stars"], 0)

    total = sum(stars.values())
    distribution = [{"stars": key, "dist": stars[key] / total} for key in sorted(stars.keys())]

    return stars, distribution

star_distribution(train_data)

({5: 463084, 2: 112547, 4: 373469, 3: 178215, 1: 128038},
 [{'stars': 1, 'dist': 0.10199362251095907},
  {'stars': 2, 'dist': 0.0896536671358574},
  {'stars': 3, 'dist': 0.14196405313883823},
  {'stars': 4, 'dist': 0.29750118094273087},
  {'stars': 5, 'dist': 0.36888747627161445}])

In [5]:
import string

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def preprocess_text(text):
    sentence = remove_punctuation(text.lower()).split(" ")
    return [word for word in sentence if word != '' and word not in stop_words and all(char.isalpha() for char in word)]

In [6]:
def build_vocab(reviews):
    vocab = {}
    for review in reviews:
         words = preprocess_text(review['text'])

         for word in words:
            vocab[word] = 1 + vocab.get(word, 0)

    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
    top_2000_vocab = dict(sorted_vocab[:2000])

    return top_2000_vocab

In [7]:
vocab = build_vocab(train_data)

In [8]:
import numpy as np

def preprocess_dataset_ctf(reviews):
    vocab_index = list(vocab.keys())
    features = np.zeros((len(reviews), 2000), dtype=np.int32)
    targets = []

    for review_idx in range(len(reviews)):

        sentence = preprocess_text(reviews[review_idx]['text'])
        targets.append(reviews[review_idx]["stars"])

        reviews[review_idx] = None
        
        for word in sentence:
            if word in vocab and features[review_idx][vocab_index.index(word)] == 0:
                num_of_word_count = sentence.count(word)
                features[review_idx][vocab_index.index(word)] = num_of_word_count

    return features, targets

def preprocess_dataset_ctf_test(reviews):
    vocab_index = list(vocab.keys())
    features = np.zeros((len(reviews), 2000), dtype=np.int32)

    for review_idx in range(len(reviews)):
        sentence = preprocess_text(reviews[review_idx]['text'])

        reviews[review_idx] = None
        
        for word in sentence:
            if word in vocab and features[review_idx][vocab_index.index(word)] == 0:
                num_of_word_count = sentence.count(word)
                features[review_idx][vocab_index.index(word)] = num_of_word_count

    return features

In [9]:
train_features, train_targets = preprocess_dataset_ctf(train_data)
train_features, train_targets = np.array(train_features), np.array(train_targets)
train_targets -= 1

In [10]:
import torch
from torch.utils.data import Dataset

class YelpDataset(Dataset):
    def __init__(self, features, targets):
        self.features = torch.tensor(features, dtype=torch.int)
        self.targets = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return self.features[index], self.targets[index]

train_dataset = YelpDataset(train_features, train_targets)

In [11]:
train_features, train_targets = [], []

In [12]:
import torch
from torch.utils.data import Dataset

class YelpTestDataset(Dataset):
    def __init__(self, features):
        self.features = torch.tensor(features, dtype=torch.int)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return self.features[index]

## CNN

In [30]:
from torch import nn

class SentimentCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, context_size):
        super(SentimentCNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv_1d = torch.nn.Conv1d(1, out_channels=50, kernel_size=50, stride=embed_dim, padding="valid")
        self.max_pooling_1d = torch.nn.MaxPool1d(100, stride=embed_dim)
        self.relu_1 = nn.ReLU()
        self.init_weights()

        self.fc_1 = nn.Linear(1000, 5)
        self.softmax = nn.Softmax(dim=1)

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)

    def forward(self, sentence):
        out = self.embedding(sentence)
        out = out.unsqueeze(1)
        out = out.flatten(2,3)
        out = self.conv_1d(out)
        out = torch.flatten(out, start_dim=1)
        out = self.max_pooling_1d(out)
        out = self.relu_1(out)
        out = self.fc_1(out)
        out = self.softmax(out)

        return out

In [35]:
from torch.optim import lr_scheduler
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SentimentCNN(len(vocab), embed_dim=100, context_size=100).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

In [36]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)

In [37]:
def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0

    for batch in train_loader:
        inputs, labels = batch[0].to(device), batch[1].to(device)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Calculate training accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    average_loss = total_loss / len(train_loader)
    accuracy_train = correct_train / total_train
    return average_loss, accuracy_train

In [38]:
def get_hard_predictions(model, test_loader, device):
    model.eval()
    all_predictions = []

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch.to(device)
            outputs = model(inputs)
            
            _, predicted = torch.max(outputs.data, 1)
            all_predictions.extend(predicted.cpu().numpy())

    return all_predictions

def get_soft_predictions(model, test_loader, device):
    model.eval()
    all_predictions = []

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch.to(device)
            outputs = model(inputs)
            outputs.data = outputs.data * np.array([[1,2,3,4,5]])
            
            all_predictions.extend(torch.sum(outputs.data, 1).cpu().numpy())

    return all_predictions

In [39]:
epochs = 100
train_losses = []
train_accuracies = []

for epoch in range(epochs):
    train_loss, train_accuracy = train_model(model, train_loader, optimizer, criterion, device)

    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)
    # Update the scheduler
    scheduler.step()

    print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, '
          f'Train Accuracy: {train_accuracy * 100:.2f}%')

Epoch 1/100, Train Loss: 1.4954, Train Accuracy: 37.83%
Epoch 2/100, Train Loss: 1.4944, Train Accuracy: 37.91%
Epoch 3/100, Train Loss: 1.4947, Train Accuracy: 37.81%
Epoch 4/100, Train Loss: 1.4945, Train Accuracy: 37.82%
Epoch 5/100, Train Loss: 1.4945, Train Accuracy: 37.81%


KeyboardInterrupt: 

In [19]:
dev_data = load_json_file(YELP_DEV_DIR)
dev_features = preprocess_dataset_ctf_test(dev_data)
dev_features = np.array(dev_features)

In [20]:
dev_dataset = YelpTestDataset(dev_features)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=True)

In [21]:
def merge_and_save_vectors(vector1, vector2, output_file_path):
    merged_vector = np.column_stack((vector1, vector2))
    np.savetxt(output_file_path, merged_vector, fmt=['%d', '%.3f'], delimiter=' ')

In [40]:
merge_and_save_vectors(get_hard_predictions(model, dev_loader, device), get_soft_predictions(model, dev_loader, device), "./dev-predictions.txt")

In [23]:
test_data = load_json_file(YELP_TEST_DIR)
test_features = preprocess_dataset_ctf_test(test_data)
test_features = np.array(test_features)

In [24]:
test_dataset = YelpTestDataset(test_features)
test_loader = DataLoader(dev_dataset, batch_size=32, shuffle=True)

In [41]:
merge_and_save_vectors(get_hard_predictions(model, test_loader, device), get_soft_predictions(model, test_loader, device), "./dev-predictions.txt")