## Reading reviews

Reads the reviews from each folder, then joins them together.

In [None]:
from read_and_preprocess import read_reviews, preprocess_reviews

# Load the reviews stored under each sentiment folder
pos_reviews = read_reviews('../data/pos')
neg_reviews = read_reviews('../data/neg')

# Combine both sets into one dictionary. Negative entries are applied last,
# so they would replace a positive entry that shares the same key.
# NOTE(review): presumably keys are unique across both folders - confirm.
all_reviews = dict(pos_reviews)
all_reviews.update(neg_reviews)

## Dimensionality reduction

Optional functions to reduce dimensionality of feature space (for BoW and TF-IDF)

In [None]:
import numpy as np

def select_top_features_tfidf(tfidf, top_n=10000):
    """Reduce the feature space to the top_n columns with the highest mean tf-idf.

    Args:
        tfidf: 2-D document-by-feature matrix, either a dense NumPy array or a
            SciPy sparse matrix.
        top_n: Maximum number of feature columns to keep.

    Returns:
        The input restricted to the selected columns, ordered by decreasing
        mean tf-idf score.
    """
    # Sparse matrices (and np.matrix) return a 2-D (1, n_features) result from
    # mean(axis=0). Flatten it so argsort ranks the features themselves rather
    # than sorting inside a single row.
    avg_scores = np.asarray(tfidf.mean(axis=0)).ravel()

    # Indices of the highest-scoring features, best first
    top_indices = np.argsort(avg_scores)[::-1][:top_n]

    return tfidf[:, top_indices]

def select_top_features_bow(bow, top_n=10000):
    """Keep only the top_n most frequent feature columns of a count matrix.

    Columns are ranked by their total count over all rows and returned in
    order of decreasing total.
    """
    # Total occurrences per feature, flattened to a 1-D array
    column_totals = np.asarray(bow.sum(axis=0)).ravel()

    # Rank features from most to least frequent, then truncate
    ranked = np.argsort(column_totals)[::-1]
    keep = ranked[:top_n]

    return bow[:, keep]

## Feature set generation

Generates feature sets in either fit or transform mode, applying preprocessing, feature generation, and (optionally) feature selection.

In [None]:
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from tfidf import calcualate_tfs, calculate_idfs, calculate_tfidfs

# Module-level state written by generate_features(..., fit=True) and reused by
# later fit=False calls so that every split shares the same feature space.
fitted_bow_vectorizer = None
fitted_idfs = None
fitted_term_to_index = None

def generate_features(contents, params, fit=True):
    """Preprocess the reviews and turn them into a BoW or tf-idf feature matrix.

    Args:
        contents: Review texts; forwarded to preprocess_reviews.
        params: Sequence of flags. params[-2] selects BoW and params[-1]
            selects tf-idf (BoW wins if both are set); all preceding entries
            are passed positionally to preprocess_reviews.
        fit: When True, (re)fit the feature space on contents and store it in
            the module-level fitted_* variables. When False, reuse the
            previously fitted feature space.

    Returns:
        The feature matrix for contents: the output of
        DictVectorizer.transform for BoW, or of calculate_tfidfs for tf-idf.

    Raises:
        ValueError: If neither the BoW nor the tf-idf flag is set.
        RuntimeError: If fit is False and the requested representation has
            not been fitted yet.
    """
    global fitted_bow_vectorizer, fitted_idfs, fitted_term_to_index

    a_tfidf = params[-1]
    a_bow = params[-2]

    # Validate before the (potentially slow) preprocessing step so that a bad
    # configuration fails immediately with a clear message.
    if not (a_bow or a_tfidf):
        raise ValueError(
            "params must enable either BoW (params[-2]) or TF-IDF (params[-1])"
        )

    if not fit:
        if a_bow:
            not_fitted = fitted_bow_vectorizer is None
        else:
            not_fitted = fitted_idfs is None or fitted_term_to_index is None
        if not_fitted:
            raise RuntimeError(
                "generate_features must be called with fit=True before it can "
                "be used with fit=False for this representation"
            )

    preprocessed_contents = preprocess_reviews(contents, *params[:-2])

    if a_bow:
        # Per-review token counts
        bow = [Counter(content.split()) for content in preprocessed_contents]

        if fit:
            # Learn the vocabulary (feature space) from these reviews
            fitted_bow_vectorizer = DictVectorizer()
            fitted_bow_vectorizer.fit(bow)

        # Transform using the fitted (just now or previously) vectorizer.
        # Top-n feature selection (select_top_features_bow) is intentionally
        # not applied: it ranks columns from the matrix it is given, so using
        # it here would pick different columns for each split.
        return fitted_bow_vectorizer.transform(bow)

    if fit:
        # Term frequencies per review and document counts per term
        doc_terms, doc_count = calcualate_tfs(preprocessed_contents)

        # Fit idfs on this collection
        num_docs = len(preprocessed_contents)
        fitted_idfs = calculate_idfs(doc_count, num_docs)

        # Sorted vocabulary gives a deterministic term -> column mapping
        all_terms = sorted({term for terms in doc_terms.values() for term in terms})
        fitted_term_to_index = {term: index for index, term in enumerate(all_terms)}

    # Transform using the fitted (just now or previously) idfs and vocabulary.
    # Top-n feature selection (select_top_features_tfidf) is intentionally not
    # applied, for the same reason as in the BoW branch.
    return calculate_tfidfs(preprocessed_contents, fitted_idfs, fitted_term_to_index)

## Model training and evaluation

#### Uses Implemented NB, Multinomial NB, SGD Logistic Regression, and Linear SVC

Trains and evaluates each model on each generated feature set.

In [None]:
# Feature-set configurations.
# Flag order in every list:
#   remove stopwords, remove punctuation, stemming, lemmatization, BoW, TF-IDF

# 1: punctuation, TFIDF
params_1 = [
    False,  # remove stopwords
    True,   # remove punctuation
    False,  # stemming
    False,  # lemmatization
    False,  # BoW
    True,   # TF-IDF
]

# 2: stopwords, punctuation, TFIDF
params_2 = [
    True,   # remove stopwords
    True,   # remove punctuation
    False,  # stemming
    False,  # lemmatization
    False,  # BoW
    True,   # TF-IDF
]

# 3: stopwords, punctuation, BoW
params_3 = [
    True,   # remove stopwords
    True,   # remove punctuation
    False,  # stemming
    False,  # lemmatization
    True,   # BoW
    False,  # TF-IDF
]

# 4: stopwords, punctuation, stemming, TFIDF
params_4 = [
    True,   # remove stopwords
    True,   # remove punctuation
    True,   # stemming
    False,  # lemmatization
    False,  # BoW
    True,   # TF-IDF
]

# 5: stopwords, punctuation, lemmatization, BoW
params_5 = [
    True,   # remove stopwords
    True,   # remove punctuation
    False,  # stemming
    True,   # lemmatization
    True,   # BoW
    False,  # TF-IDF
]

all_params = [params_1, params_2, params_3, params_4, params_5]

from sklearn.model_selection import train_test_split

# Pull the text and the label out of every review record
review_records = list(all_reviews.values())
X = np.array([record['content'] for record in review_records])
y = np.array([record['label'] for record in review_records])

# First hold out 15% of all reviews as the test set (stratified by label)
X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    X,
    y,
    test_size = 0.15,
    stratify = y,
    random_state = 31,
)

# Then carve the dev set out of the remaining 85%; 0.15 / 0.85 of the remainder
# corresponds to 15% of the full data set
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev,
    y_train_dev,
    test_size = 0.15 / 0.85,
    stratify = y_train_dev,
    random_state = 31,
)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from naive_bayes import ImplementedNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

def _fit_and_report_on_dev(label, model, train_features, y_train, dev_features, y_dev):
    """Fit model on the training split and print its development-set metrics.

    Args:
        label: Heading printed before the results.
        model: Estimator exposing fit(X, y) and predict(X).
        train_features: Feature matrix for the training split.
        y_train: Labels for the training split.
        dev_features: Feature matrix for the development split.
        y_dev: Labels for the development split.
    """
    print(label)
    model.fit(train_features, y_train)

    # Evaluate on the development set
    y_dev_pred = model.predict(dev_features)
    print("Development set accuracy:", accuracy_score(y_dev, y_dev_pred))
    print("Development set classification report:\n", classification_report(y_dev, y_dev_pred))


for i, params in enumerate(all_params, start=1):
    # Fit the feature space on the training split only
    train_features = generate_features(X_train, params)

    # Project the dev split into the feature space fitted on the train split.
    # Test features are deliberately not generated here: model selection only
    # uses the dev split, and each extra call repeats the full preprocessing.
    dev_features = generate_features(X_dev, params, fit = False)

    print(f"\nResults for feature set {i}:")
    print(f"Feature set size: {train_features.shape[1]}\n")

    # Fresh, unfitted estimators for every feature set
    models = [
        ("Implemented Naive Bayes Classifier", ImplementedNB()),
        ("Multinomial Naive Bayes Classifier", MultinomialNB()),
        ("SGD Logistic Regression", SGDClassifier(loss='log_loss', random_state=31)),
        ("Linear SVC Classifier", LinearSVC(dual=False, random_state=31)),
    ]

    for label, model in models:
        _fit_and_report_on_dev(label, model, train_features, y_train, dev_features, y_dev)


## Hyperparameter tuning

#### Performs hyperparameter tuning using grid search on Linear SVC model. 

Using hyperparameters:

* C -> smaller C indicates stronger regularisation

* tol -> tolerance for stopping criteria

* max_iter -> maximum number of iterations allowed

In [None]:
# Run hyperparameter optimisation on the dev set 
# (using chosen best combination of method, and feature set)
# Linear SVC Classifier on feature set 1 (accuracy 0.858)

from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC

# Candidate values for each hyperparameter of the grid
C_values = [0.01, 0.1, 1]
tol_values = [1e-4, 1e-3, 1e-2]
max_iter_values = [1000, 5000, 10000]

# Build the feature matrices for feature set 1: fit on train, then reuse the
# fitted feature space for dev and test
train_features = generate_features(X_train, params_1)
dev_features = generate_features(X_dev, params_1, fit=False)
test_features = generate_features(X_test, params_1, fit=False)

# Best dev accuracy seen so far and the hyperparameters that produced it
best_accuracy = 0
best_params = {}

# Walk the full grid in the order C -> tol -> max_iter
for C, tol, max_iter in (
    (c, t, m) for c in C_values for t in tol_values for m in max_iter_values
):
    # Fit a LinearSVC with this combination
    svm_classifier = LinearSVC(C=C, tol=tol, max_iter=max_iter, dual=False, random_state=31)
    svm_classifier.fit(train_features, y_train)

    # Score it on the development set
    y_dev_pred = svm_classifier.predict(dev_features)
    dev_accuracy = accuracy_score(y_dev, y_dev_pred)

    # Only a strictly better score replaces the current best, so the first
    # combination wins ties
    if dev_accuracy <= best_accuracy:
        continue
    best_accuracy = dev_accuracy
    best_params = dict(C=C, tol=tol, max_iter=max_iter)

# Refit a fresh model with the winning hyperparameters
best_svm = LinearSVC(dual=False, random_state=31, **best_params)
best_svm.fit(train_features, y_train)

# Final, one-off evaluation on the held-out test set
y_test_pred = best_svm.predict(test_features)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("Best hyperparameters (based on development set):", best_params)
print("Test set accuracy with best hyperparameters:", test_accuracy)
print("Test set classification report:\n", test_report)