In [1]:
import os
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from bs4 import BeautifulSoup # used for HTML parsing
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/niccolosottile/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niccolosottile/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/niccolosottile/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/niccolosottile/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
def read_reviews(folder_path):
    """Reads the reviews in the folder path, storing label (derived from star rating) and content in a dictionary."""
    reviews = {}

    # Retrieve all review files in directory
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Verify file path is valid before reading
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                # Remove HTML tags (e.g. <br></br>)
                soup = BeautifulSoup(content, "html.parser")
                content = soup.get_text()

                # Extract star rating from filename (id_star.txt format)
                star_rating = int(filename.split('_')[-1].split('.')[0])
                # Assign labels given extracted star rating
                label = 1 if star_rating >= 7 else 0

                # Assign each review a dictionary of label and content
                reviews[filename] = {'content': content, 'label': label}

    return reviews

# Read positive and negative reviews
pos_reviews = read_reviews('../data/pos') 
neg_reviews = read_reviews('../data/neg') 

# Merge them in single dictionary
all_reviews = {}
all_reviews.update(pos_reviews)
all_reviews.update(neg_reviews)

  soup = BeautifulSoup(content, "html.parser")


In [3]:
def get_wordnet_pos(treebank_tag):
    """Converts the POS naming scheme from the Penn Treebank tag to a WordNet tag."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # Wordnet lemmatize function will default to NOUN anyways
        return wordnet.NOUN

stoplist = set(stopwords.words('english'))

def verify_stopwords_punctuation(token, a_stopwords, a_punctuation):
    """Verifies whether a token is a stopword or part of punctuation given filtering criteria."""
    return (not a_stopwords or not token in stoplist) and (not a_punctuation or not token in string.punctuation)

def preprocess_reviews(contents, a_stopwords = False, a_punctuation = False, a_stemming = False, a_lemmatization = False):
    """Applies tokenization, n-gram generation, and further preprocessing based on supplied criteria."""
    # Initialisation
    stemmer = LancasterStemmer()
    lemmatizer = WordNetLemmatizer()
    n_gram_size = 2 # not in parameters as not part of feature selection
    
    preprocessed_contents = []

    for content in contents:
        # Apply tokenization
        tokens = nltk.word_tokenize(content)

        # Apply preprocessing based on criteria supplied
        if a_stemming:
            preprocessed_tokens = [stemmer.stem(token.lower()) for token in tokens if verify_stopwords_punctuation(token, a_stopwords, a_punctuation)]
        elif a_lemmatization:
            pos_tags = nltk.pos_tag(tokens)
            preprocessed_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(pos)).lower() for token, pos in pos_tags if verify_stopwords_punctuation(token, a_stopwords, a_punctuation)]
        else:
            preprocessed_tokens = [token.lower() for token in tokens if verify_stopwords_punctuation(token, a_stopwords, a_punctuation)]

        # Generate n-grams (treating them as units due to _)
        preprocessed_n_grams = []

        for i in range(1, n_gram_size + 1):
            n_grams = list(ngrams(preprocessed_tokens, i))

            for n_gram in n_grams:
                preprocessed_n_grams.append('_'.join(n_gram))

        preprocessed_contents.append(' '.join(preprocessed_n_grams))
    
    return preprocessed_contents 

In [4]:
import math
import numpy as np
from collections import defaultdict

def calcualate_tfs(contents):
    """Calculates each n-gram's frequency in each review."""
    doc_terms = defaultdict(dict)
    doc_count = defaultdict(int)  # To store document count for each term
    
    # Assuming 'contents' is a list of review contents
    for i, content in enumerate(contents):
        terms = defaultdict(int)
        seen_terms = set()  # Keep track of terms seen in this document

        # Assuming 'content' is a string containing n-grams separated by white space
        for term in content.split():
            terms[term] += 1

            if term not in seen_terms:
                doc_count[term] += 1
                seen_terms.add(term)
    
        doc_terms[i] = terms

    return doc_terms, doc_count

def calculate_idfs(doc_count, num_docs):
    """Calculates idf for each n-gram."""
    idfs = {}

    for term, count in doc_count.items():
        # Calculate the idf for the given n-gram
        idfs[term] = math.log(num_docs / (1 + count), 10) # Logarithm with base 10

    return idfs


def calculate_tfidfs(contents, idfs, term_to_index):
    """Calculates tf-idf scores for each review."""
    # Initialize an empty array for storing tf-idf scores
    num_docs = len(contents)
    tfidf = np.zeros((num_docs, len(idfs)))

    # Assuming 'contents' is a list of review contents
    for i, content in enumerate(contents):
        terms = defaultdict(int)

        # Count tf for each term if it is in fitted feature space
        for term in content.split():
            if term in term_to_index:
                terms[term] += 1

        # Calculate tfidfs using fitted idfs
        for term, freq in terms.items():
            col_index = term_to_index[term]
            tfidf[i, col_index] = freq * idfs[term]

    return tfidf

In [5]:
def select_top_features_tfidf(tfidf, top_n=10000):
    """Decreases the feature space by selecting the top_n features with higher tf-idf scores."""
    # Calculate the average tf-idf score for each feature
    avg_scores = np.mean(tfidf, axis=0)

    # Get indices of top features
    top_indices = np.argsort(avg_scores)[::-1][:top_n]
    
    return tfidf[:, top_indices]

def select_top_features_bow(bow, top_n=10000):
    """Decreases the feature space by selecting the top_n features with higher counts."""
    # Sum feature occurrences
    sums = np.array(bow.sum(axis=0)).ravel()

    # Get indices of top features
    top_indices = np.argsort(sums)[::-1][:top_n]

    # Select only columns corresponding to top features
    return bow[:, top_indices]

In [6]:
from collections import Counter
from sklearn.feature_extraction import DictVectorizer

# Global variables to store fitted data
fitted_bow_vectorizer = None
fitted_idfs = None
fitted_term_to_index = None

def generate_features(contents, params, fit = True):
    """Generates a set of features using preprocessing and tf-idf or BoW, either fits (e.g. train set) or not (e.g. dev and test sets)."""
    global fitted_bow_vectorizer, fitted_idfs, fitted_term_to_index

    a_tfidf = params[-1]
    a_bow = params[-2]

    preprocessed_contents = preprocess_reviews(contents, *params[:-2])

    # Choose whether to use BoW or tf-idf
    if a_bow:
        # Calculate counts for each review
        bow = [Counter(content.split()) for content in preprocessed_contents]

        if fit:
            # Fits feature space
            fitted_bow_vectorizer = DictVectorizer()
            fitted_bow_vectorizer.fit(bow)
        
        # Transform using fitted (just now or previously) BoW
        bow = fitted_bow_vectorizer.transform(bow)
        # Select top_n features (removed because it underperformed)
        sparse_vector = bow #select_top_features_bow(bow, 50000)

    elif a_tfidf:
        if fit: 
            # Calculate tfs over each review and whole vocabulary
            doc_terms, doc_count = calcualate_tfs(preprocessed_contents)

            # Fit idfs
            num_docs = len(preprocessed_contents)
            fitted_idfs = calculate_idfs(doc_count, num_docs)

            # Create a sorted list of all unique terms
            all_terms = sorted(set(term for terms in doc_terms.values() for term in terms))

            # Map from terms to column indices
            fitted_term_to_index = {term: index for index, term in enumerate(all_terms)}  
            
        # Transform using fitted (just now or previously) tf-idf
        tfidfs = calculate_tfidfs(preprocessed_contents, fitted_idfs, fitted_term_to_index)
        # Select top_n features (removed because it underperformed)
        sparse_vector = tfidfs #select_top_features_tfidf(tfidfs, 50000)

    return sparse_vector

In [7]:
# Implementing a Naive Bayes classifier
from scipy import sparse

class ImplementedNB:
    """
    Class which implements a simplified Naive Bayes classifier:
        - It has binary class prediction (for neg or pos)
        - The prior probabilities are equal (0.5) for each class, which 
        works due to equal splitting of data (maintained by stratified sampling)
    """
    def __init__(self):
        """Initiates the simple NB classifier."""
        self.feature_log_prob = None

    def fit(self, X, y):
        """Trains the simple NB classifier using the data and labels provided."""
        # Define array to store counts for each feature (separated by class)
        feature_count = np.zeros((2, X.shape[1]))

        # Calculate counts for neg and pos reviews
        feature_count[0, :] = np.sum(X[y == 0, :], axis=0)
        feature_count[1, :] = np.sum(X[y == 1, :], axis=0)

        # Adding Laplace smoothing
        feature_count += 1

        # Calculate counts among each class
        class_count = np.sum(feature_count, axis=1)

        # Calculate the log probability of each feature given a class
        self.feature_log_prob = np.log(feature_count / class_count[:, np.newaxis])

    def predict(self, X):
        """Predicts the values for the simple NB classifier using the data provided."""
        # Ensure that X is in Compressed Sparse Row (CSR) format for efficient dot product
        if not sparse.isspmatrix_csr(X):
            X = sparse.csr_matrix(X)

        # Calculate the class probabilities for each review
        # Since we are maximising, and the prior probabilities are both 0.5, they can be ignored
        class_probabilities = X.dot(self.feature_log_prob.T)
        
        # Choose the class with higher probability for each review (indices correspond with class)
        # Dot product between CSR matrix and array results in array (np can be applied)
        predictions = np.argmax(class_probabilities, axis=1)

        return predictions

In [8]:
# Sets of features 
# params: remove stopwords, remove punctuation, stemming, lemmatization, BoW, TF-IDF
params_1 = [False, True, False, False, False, True] # 1: punctuation, TFIDF (accuracy 0.801)
params_2 = [True, True, False, False, False, True] # 2: stopwords, punctuation, TFIDF (accuracy 0.805)
params_3 = [True, True, False, False, True, False] # 3: stopwords, punctuation, BoW (accuracy 0.821)
params_4 = [True, True, True, False, False, True] # 4: stopwords, punctuation, stemming, TFIDF (accuracy 0.798)
params_5 = [True, True, False, True, True, False] # 5: stopwords, punctuation, lemmatization, BoW (accuracy 0.835)
all_params = [params_1, params_2, params_3, params_4, params_5]
# Further optimisations: 
# remove HTML content? (done) 

from sklearn.model_selection import train_test_split

# Splitting reviews and labels up
X = np.array([review['content'] for review in all_reviews.values()])
y = np.array([review['label'] for review in all_reviews.values()])

# Separate out the train, dev, and test sets
X_train_dev, X_test, y_train_dev, y_test = train_test_split(X, y, test_size = 0.15, stratify = y, random_state = 31)
X_train, X_dev, y_train, y_dev = train_test_split(X_train_dev, y_train_dev, test_size = 0.15 / 0.85, stratify = y_train_dev, random_state = 31)

In [157]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

for i, params in enumerate(all_params, start=1):
    # Generating features for set
    train_features = generate_features(X_train, params)

    # Generate features for dev and test sets without fitting (applies feature space of train set)
    dev_features = generate_features(X_dev, params, fit = False)
    test_features = generate_features(X_test, params, fit = False)

    print(f"\nResults for feature set {i}:")
    rows, columns = train_features.shape
    print(f"Feature set size: {columns}\n")

    ###########################################

    # Implemented Naive Bayes classifier
    print("Implemented Naive Bayes Classifier")
    classifier = ImplementedNB()
    classifier.fit(train_features, y_train)

    # Evaluate on the development set
    y_dev_pred = classifier.predict(dev_features)
    dev_accuracy = accuracy_score(y_dev, y_dev_pred)
    print("Development set accuracy:", dev_accuracy)
    print("Development set classification report:\n", classification_report(y_dev, y_dev_pred))

    ###########################################

    # Multinomial Naive Bayes classifier
    print("Multinomial Naive Bayes Classifier")
    classifier = MultinomialNB()
    classifier.fit(train_features, y_train)

    # Evaluate on the development set
    y_dev_pred = classifier.predict(dev_features)
    dev_accuracy = accuracy_score(y_dev, y_dev_pred)
    print("Development set accuracy:", dev_accuracy)
    print("Development set classification report:\n", classification_report(y_dev, y_dev_pred))

    ###########################################

    # SGD Logistic Regression
    print("SGD Logistic Regression")
    sgd_logistic = SGDClassifier(loss='log_loss', random_state=31)
    sgd_logistic.fit(train_features, y_train)

    # Evaluate on the development set
    y_dev_pred_sgd = sgd_logistic.predict(dev_features)
    dev_accuracy_sgd = accuracy_score(y_dev, y_dev_pred_sgd)
    print("Development set accuracy:", dev_accuracy_sgd)
    print("Development set classification report:\n", classification_report(y_dev, y_dev_pred_sgd))

    ###########################################

    # SVM Classifier
    print("Linear SVC Classifier")
    svm_classifier = LinearSVC(dual=False, random_state=31)
    svm_classifier.fit(train_features, y_train)

    # Evaluate on the development set
    y_dev_pred_svm = svm_classifier.predict(dev_features)
    dev_accuracy_svm = accuracy_score(y_dev, y_dev_pred_svm)
    print("Development set accuracy:", dev_accuracy_svm)
    print("Development set classification report:\n", classification_report(y_dev, y_dev_pred_svm))



Results for feature set 1:
Feature set size: 325693

Implemented Naive Bayes Classifier
Development set accuracy: 0.8283333333333334
Development set classification report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83       300
           1       0.84      0.81      0.82       300

    accuracy                           0.83       600
   macro avg       0.83      0.83      0.83       600
weighted avg       0.83      0.83      0.83       600

Multinomial Naive Bayes Classifier
Development set accuracy: 0.8283333333333334
Development set classification report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83       300
           1       0.84      0.81      0.82       300

    accuracy                           0.83       600
   macro avg       0.83      0.83      0.83       600
weighted avg       0.83      0.83      0.83       600

SGD Logistic Regression
Development set accuracy: 0.



Development set accuracy: 0.83
Development set classification report:
               precision    recall  f1-score   support

           0       0.83      0.82      0.83       300
           1       0.83      0.84      0.83       300

    accuracy                           0.83       600
   macro avg       0.83      0.83      0.83       600
weighted avg       0.83      0.83      0.83       600


Results for feature set 2:
Feature set size: 332214

Implemented Naive Bayes Classifier
Development set accuracy: 0.805
Development set classification report:
               precision    recall  f1-score   support

           0       0.78      0.85      0.81       300
           1       0.83      0.76      0.80       300

    accuracy                           0.81       600
   macro avg       0.81      0.80      0.80       600
weighted avg       0.81      0.81      0.80       600

Multinomial Naive Bayes Classifier
Development set accuracy: 0.805
Development set classification report:
        

KeyboardInterrupt: 

In [11]:
# Run hyperparameter optimisation on the dev set 
# (using chosen best combination of method, and feature set)
# Linear SVC Classifier on feature set 1 (accuracy 0.858)

# Using hyperparameters:
# C -> smaller C indicates stronger regularisation (affects decision boundary)
# tol -> tolerance for stopping criteria
# max_iter -> maximum number of iterations allowed

from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC

# Define the hyperparameters to test
C_values = [0.01, 0.1, 1]
tol_values = [1e-4, 1e-3, 1e-2]
max_iter_values = [1000, 5000, 10000]

# Generating features for train, dev, and test sets
train_features = generate_features(X_train, params_1)
dev_features = generate_features(X_dev, params_1, fit=False)
test_features = generate_features(X_test, params_1, fit=False)

# Initialize variables to store the best hyperparameters and the corresponding accuracy
best_accuracy = 0
best_params = {}

# Iterate over each combination of hyperparameters
for C in C_values:
    for tol in tol_values:
        for max_iter in max_iter_values:
            # Create and train the LinearSVC model
            svm_classifier = LinearSVC(C=C, tol=tol, max_iter=max_iter, dual=False, random_state=31)
            svm_classifier.fit(train_features, y_train)

            # Evaluate on the development set
            y_dev_pred = svm_classifier.predict(dev_features)
            dev_accuracy = accuracy_score(y_dev, y_dev_pred)

            # Update the best hyperparameters if current model is better
            if dev_accuracy > best_accuracy:
                best_accuracy = dev_accuracy
                best_params = {'C': C, 'tol': tol, 'max_iter': max_iter}

# Train a new model using the best hyperparameters
best_svm = LinearSVC(**best_params, dual=False, random_state=31)
best_svm.fit(train_features, y_train)

# Evaluate the best model on the test set
y_test_pred = best_svm.predict(test_features)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("Best hyperparameters (based on development set):", best_params)
print("Test set accuracy with best hyperparameters:", test_accuracy)
print("Test set classification report:\n", test_report)

In [None]:
# BERT