# Homework 2: Rating Prediction from Review Text

In [1]:
YELP_TRAIN_DIR = r"C:\Users\Nebiyou Hailemariam\Desktop\development\Machine-Learning-with-Graphs\Assignment 2\data\yelp_reviews_train.json"
YELP_DEV_DIR = r"C:\Users\Nebiyou Hailemariam\Desktop\development\Machine-Learning-with-Graphs\Assignment 2\data\yelp_reviews_dev.json"
YELP_TEST_DIR = r"C:\Users\Nebiyou Hailemariam\Desktop\development\Machine-Learning-with-Graphs\Assignment 2\data\yelp_reviews_test.json"
STOP_WORD_DIR = r"C:\Users\Nebiyou Hailemariam\Desktop\development\Machine-Learning-with-Graphs\Assignment 2\data\stopword.list"

In [2]:
import json

def load_json_file(file_path):
    try:
        with open(file_path, 'r') as file:
            json_data = [json.loads(line) for line in file]
            return json_data
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error: {e}")
        return None

def get_stop_words(file_path):
    stop_words = {}
    with open(file_path, 'r') as file:
        for line_number, word in enumerate(file, start=1):
            word = word.strip()  # Remove leading and trailing whitespaces
            stop_words[word] = True
    return stop_words

train_data = load_json_file(YELP_TRAIN_DIR)

stop_words = get_stop_words(STOP_WORD_DIR)

In [3]:
def star_distribution(reviews):
    stars = {}

    for review in reviews:
        stars[review["stars"]] = 1 + stars.get(review["stars"], 0)

    total = sum(stars.values())
    distribution = [{"stars": key, "dist": stars[key] / total} for key in sorted(stars.keys())]

    return stars, distribution

star_distribution(train_data)

({5: 463084, 2: 112547, 4: 373469, 3: 178215, 1: 128038},
 [{'stars': 1, 'dist': 0.10199362251095907},
  {'stars': 2, 'dist': 0.0896536671358574},
  {'stars': 3, 'dist': 0.14196405313883823},
  {'stars': 4, 'dist': 0.29750118094273087},
  {'stars': 5, 'dist': 0.36888747627161445}])

In [4]:
import string

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def preprocess_text(text):
    sentence = remove_punctuation(text.lower()).split(" ")
    return [word for word in sentence if word != '' and word not in stop_words and all(char.isalpha() for char in word)]

In [5]:
def build_vocab_df(reviews):
    vocab = {}
    for review in reviews:
        document_frequency  = {}
        words = preprocess_text(review['text'])

        for word in words:
            document_frequency[word] = 1

        for word in document_frequency.keys():
            vocab[word] = document_frequency[word] + vocab.get(word, 0)

    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
    top_2000_vocab = dict(sorted_vocab[:2000])

    return top_2000_vocab

In [6]:
vocab = build_vocab_df(train_data)

In [7]:
vocab

{'place': 475992,
 'good': 466831,
 'food': 436360,
 'great': 406643,
 'like': 362248,
 'just': 354065,
 'service': 328811,
 'time': 314013,
 'really': 272247,
 'nice': 209372,
 'best': 201462,
 'got': 192849,
 'love': 180207,
 'little': 179572,
 'try': 165054,
 'friendly': 163309,
 'went': 162996,
 'staff': 156961,
 'pretty': 156204,
 'people': 156202,
 'came': 148329,
 'ordered': 147887,
 'restaurant': 146482,
 'come': 146469,
 'better': 144490,
 'make': 143636,
 'definitely': 142901,
 'going': 141009,
 'know': 140605,
 'way': 138764,
 'order': 137512,
 'menu': 134698,
 'vegas': 134470,
 'think': 132739,
 'right': 129031,
 'delicious': 128271,
 'want': 126131,
 'say': 125927,
 'night': 125003,
 'experience': 122477,
 'amazing': 119805,
 'sure': 117854,
 'chicken': 114938,
 'day': 114559,
 'eat': 113993,
 'bar': 113734,
 'new': 112967,
 'wait': 112091,
 'bit': 104981,
 'times': 103998,
 'bad': 103427,
 'fresh': 103130,
 'said': 102578,
 'wasnt': 99931,
 'recommend': 99221,
 'lot': 988

In [33]:
import numpy as np

def preprocess_dataset_ctf(reviews):
    vocab_index = list(vocab.keys())
    features = np.zeros((len(reviews), 2000), dtype=np.int32)
    targets = []

    for review_idx in range(len(reviews)):

        sentence = preprocess_text(reviews[review_idx]['text'])
        targets.append(reviews[review_idx]["stars"])

        reviews[review_idx] = None
        
        for word in sentence:
            if word in vocab and features[review_idx][vocab_index.index(word)] == 0:
                num_of_word_count = sentence.count(word)
                features[review_idx][vocab_index.index(word)] = num_of_word_count

    return features, targets

def preprocess_dataset_ctf_test(reviews):
    vocab_index = list(vocab.keys())
    features = np.zeros((len(reviews), 2000), dtype=np.int32)

    for review_idx in range(len(reviews)):
        sentence = preprocess_text(reviews[review_idx]['text'])

        reviews[review_idx] = None
        
        for word in sentence:
            if word in vocab and features[review_idx][vocab_index.index(word)] == 0:
                num_of_word_count = sentence.count(word)
                features[review_idx][vocab_index.index(word)] = num_of_word_count

    return features

def preprocess_dataset_ctf_test(reviews):
    vocab_index = list(vocab.keys())
    features = np.zeros((len(reviews), 2000), dtype=np.int32)

    for review_idx in range(len(reviews)):
        sentence = preprocess_text(reviews[review_idx]['text'])

        reviews[review_idx] = None
        
        for word in sentence:
            if word in vocab and features[review_idx][vocab_index.index(word)] == 0:
                num_of_word_count = sentence.count(word)
                features[review_idx][vocab_index.index(word)] = num_of_word_count

    return features

In [8]:
train_features, train_targets = preprocess_dataset_ctf(train_data)
train_features, train_targets = np.array(train_features), np.array(train_targets)

In [9]:
train_features, train_targets = np.array(train_features), np.array(train_targets)

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_features, train_targets, test_size=0.05, random_state=42)

## SVM

In [15]:
from sklearn.model_selection import train_test_split
from sklearn import svm

clf = svm.LinearSVC(penalty='l2', loss='squared_hinge', random_state=0)

clf.fit(X_train, y_train)



In [16]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.574878919194494


# Logistic Regression

In [102]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def softmax(pred):
    return np.exp(pred) / np.sum(np.exp(pred), axis=0, keepdims=True)

class LogisticRegression:
    def __init__(self, dim, classes, lr, lambda_reg=1):
        self.lr = lr
        self.lambda_reg = lambda_reg
        self.classes = classes
        self.W = np.zeros((dim, self.classes))
        self.X = None

    def forward(self, X):
        self.X = X
        return softmax(np.matmul(self.W.T, X))

    def backward(self, output, label):
        dl_dw = np.matmul(label - output, self.X.T).T - self.lambda_reg * self.W
        self.W = self.W + self.lr * (dl_dw / self.X.shape[1])

In [None]:
intercept_column = np.ones((len(train_data), 1))
X_with_intercept = np.hstack((intercept_column, train_features))
num_classes = len(np.unique(train_targets))
label = np.eye(num_classes)[np.array(train_targets)-1]

X_train, X_test, Y_train, Y_test = train_test_split(X_with_intercept, label, test_size=0.0001, random_state=42)

In [128]:
model = LogisticRegression(dim=X_with_intercept.shape[1], classes=num_classes, lr=0.1, lambda_reg=1)
batch_size = 512

epochs = 0
while epochs < 10:
    for i in range(0, len(X_train), batch_size):
        X_batch = X_train[i:i+batch_size].T
        Y_batch = Y_train[i:i+batch_size].T
        pred = model.forward(X_batch)
        model.backward(output=pred, label=Y_batch)
    epochs += 1
    print(f"Epoch #{epochs}")

Epoch #1
Epoch #2
Epoch #3
Epoch #4
Epoch #5
Epoch #6
Epoch #7
Epoch #8
Epoch #9
Epoch #10


: 

In [119]:
test_pred = np.argmax(model.forward(X_test.T), axis=0)
accuracy = np.mean(test_pred == np.argmax(Y_test.T, axis=0)) * 100
print("Accuracy: {:.2f}%".format(accuracy))

Accuracy: 57.14%


In [120]:
def get_hard_prediction(model, test):
    return np.argmax(model.forward(test.T), axis=0)

def get_soft_prediction(model, test):
    return np.sum(model.forward(test.T) * np.array([[1], [2], [3], [4], [5]]), axis=0)

In [121]:
def merge_and_save_vectors(vector1, vector2, output_file_path):
    merged_vector = np.column_stack((vector1, vector2))
    np.savetxt(output_file_path, merged_vector, fmt=['%d', '%.3f'], delimiter=' ')

### Predict on Dev Data

In [None]:
dev_data = load_json_file(YELP_DEV_DIR)
dev_features = preprocess_dataset_ctf_test(dev_data)
dev_features = np.array(dev_features)
intercept_column = np.ones((len(dev_features), 1))
dev_features = np.hstack((intercept_column, dev_features))

In [126]:
merge_and_save_vectors(get_hard_prediction(model, dev_features), get_soft_prediction(model, dev_features), "./dev-predictions.txt")

### Predict on Test Data

In [57]:
test_data = load_json_file(YELP_TEST_DIR)
test_features = preprocess_dataset_ctf_test(test_data)
test_features = np.array(test_features)
intercept_column = np.ones((len(test_features), 1))
test_features = np.hstack((intercept_column, test_features))

In [127]:
merge_and_save_vectors(get_hard_prediction(model, test_features), get_soft_prediction(model, test_features), "./test-predictions.txt")