In [None]:
import pandas as pd  # Importing the pandas library for data manipulation and analysis
import numpy as np  # Importing the numpy library for numerical operations
import string  # Importing string library which contains a number of functions to process standard python strings
import re  # Importing re library for regular expression operations
import nltk  # Importing nltk library to work with human language data (texts)
import tensorflow as tf  # Importing TensorFlow to build machine learning models

# Importing various functions and classes to process text and build machine learning models
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    roc_auc_score,
    precision_recall_curve,
    confusion_matrix,
    matthews_corrcoef,
    log_loss,
)
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense

# Downloading the NLTK resources used by the text preprocessing step
nltk.download('punkt')  # Punkt tokenizer models (read by word_tokenize on older NLTK releases)
nltk.download('punkt_tab')  # Tabular Punkt data that newer NLTK releases load instead of 'punkt'
nltk.download('stopwords')  # Stop-word lists read through nltk.corpus.stopwords

In [None]:
# Read the review dataset into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer='../resources/dataset.csv')

In [None]:
# Review text as strings. Missing reviews are replaced with '' before the cast,
# because astype(str) on an object column would otherwise turn NaN into the
# literal text 'nan', which would then be treated as a real word downstream.
X = data['review_text'].fillna('').astype(str)

# Recode the label -1 as 0 (written back to the DataFrame, as before).
# NOTE(review): presumably the raw scores are only -1 and 1, giving a {0, 1} target - confirm against the dataset.
data['review_score'] = data['review_score'].replace(-1, 0)

# Target variable used by the classifiers
y = data['review_score']

In [None]:
# Resources shared by every preprocess_text call. They are built lazily on first
# use so the stop-word file is read once per language and the stemmer is created
# once, instead of once per review.
_PREPROCESS_RESOURCES = {}


def _stop_words_for(language):
    """Return the NLTK stop-word set for `language`, loading it on first use."""
    key = ('stopwords', language)
    if key not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES[key] = frozenset(stopwords.words(language))
    return _PREPROCESS_RESOURCES[key]


def _shared_stemmer():
    """Return the single PorterStemmer instance reused across calls."""
    if 'stemmer' not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text, language='vietnamese'):
    """Clean one review and return it as a space-separated string of stems.

    Steps: lower-case, tokenize with ``word_tokenize``, keep purely alphabetic
    tokens, drop stop words for ``language``, then Porter-stem what remains.

    Parameters
    ----------
    text : object
        The raw review. Anything that is not a ``str`` yields ``''``.
    language : str, default 'vietnamese'
        Name of the NLTK stop-word list to apply.

    Returns
    -------
    str
        The processed tokens joined by single spaces (possibly empty).

    Notes
    -----
    NOTE(review): confirm that the installed NLTK stopwords corpus actually
    provides a 'vietnamese' list; ``stopwords.words`` fails for a language it
    does not have. The Porter stemmer is an English stemming algorithm, so
    verify that the stop-word language matches the language of the reviews.
    """
    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text.lower())
    stop_words = _stop_words_for(language)
    stemmer = _shared_stemmer()

    # isalpha() discards punctuation, numbers and mixed tokens before the
    # stop-word lookup; only the surviving words are stemmed.
    return ' '.join(
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

# Run every review through preprocess_text, element by element, and keep the
# cleaned strings under the same name
X = X.map(preprocess_text)

In [None]:
# Turn the cleaned reviews into TF-IDF features
# (CountVectorizer, also imported, is the bag-of-words alternative).
# NOTE(review): the vectorizer is fitted on all of X, i.e. before any train/test
# split, so held-out reviews contribute to the vocabulary and IDF weights -
# consider fitting on the training portion only.
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(raw_documents=X)  # learn the vocabulary and vectorize in one pass

In [None]:
# Hold out 20% of the vectorized reviews for evaluation;
# the fixed random_state keeps the split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate classifiers to train and compare, keyed by display name.
# The estimators whose training involves randomness get the same fixed seed as
# the train/test split, so repeated runs report the same scores.
models = {
    'LogisticRegression': LogisticRegression(),
    'SVM': SVC(probability=True, random_state=42),  # probability=True enables predict_proba
    'RandomForest': RandomForestClassifier(random_state=42),
    'GBM': GradientBoostingClassifier(random_state=42),
    'NaiveBayes': MultinomialNB()
}

results = {}  # model name -> dict of evaluation metrics

# np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever this NumPy provides
_trapezoid = getattr(np, 'trapezoid', None) or np.trapz

# Train each model on the training split and score it on the held-out split
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)  # hard class labels

    # Continuous scores for the ranking metrics: positive-class probability when
    # the model offers it, otherwise the raw decision_function margin
    has_proba = hasattr(model, "predict_proba")
    y_proba = model.predict_proba(X_test)[:, 1] if has_proba else model.decision_function(X_test)

    # Label-based metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    # Score-based metrics
    roc_auc = roc_auc_score(y_test, y_proba)
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_proba)
    # Area under the PR curve integrates precision (y) over recall (x).
    # precision_recall_curve returns recall in decreasing order, so both arrays
    # are reversed to integrate left to right and obtain a positive area.
    auc_pr = _trapezoid(precision_curve[::-1], recall_curve[::-1])
    # Log loss is only defined for probabilities, not for decision_function margins
    cross_entropy_loss = log_loss(y_test, y_proba) if has_proba else float('nan')

    # Storing the calculated metrics in the results dictionary
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC-ROC': roc_auc,
        'AUC-PR': auc_pr,
        'Confusion Matrix': conf_matrix,
        'MCC': mcc,
        'Log Loss': cross_entropy_loss,
    }

# Report every stored metric, one block of lines per model
for name, scores in results.items():
    print(f"{name} results:")
    for label, value in scores.items():
        print(f"{label}: {value}")
    print("\n")

In [None]:
# Prepare the neural network input: anything exposing toarray() (e.g. a sparse
# matrix) is expanded to a dense array, everything else passes through unchanged
X_train_dense, X_test_dense = (
    part.toarray() if hasattr(part, 'toarray') else part
    for part in (X_train, X_test)
)

# Feed-forward binary classifier built with Keras:
# two ReLU hidden layers (64 and 32 units) followed by a single sigmoid output unit
n_features = X_train_dense.shape[1]
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(n_features,)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit for 100 epochs with mini-batches of 10 samples; verbose=0 silences the progress log
nn_model.fit(X_train_dense, y_train, batch_size=10, epochs=100, verbose=0)

# Evaluating the neural network model.
# Predict once and derive both the hard labels and the flat score vector from
# the same output instead of running inference twice.
nn_scores = nn_model.predict(X_test_dense)
y_pred_nn = (nn_scores > 0.5).astype(int)  # class 1 when the sigmoid output exceeds 0.5
y_proba_nn = nn_scores.flatten()

# Area under the precision-recall curve, so this entry has the same keys as the
# other models in `results`. precision_recall_curve returns recall in decreasing
# order, hence the reversal before integrating precision over recall.
precision_curve_nn, recall_curve_nn, _ = precision_recall_curve(y_test, y_proba_nn)
_trapezoid_nn = getattr(np, 'trapezoid', None) or np.trapz  # np.trapz was renamed in NumPy 2.0
auc_pr_nn = _trapezoid_nn(precision_curve_nn[::-1], recall_curve_nn[::-1])

# Calculating performance metrics for the neural network model
nn_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_nn),
    'Precision': precision_score(y_test, y_pred_nn),
    'Recall': recall_score(y_test, y_pred_nn),
    'F1 Score': f1_score(y_test, y_pred_nn),
    'AUC-ROC': roc_auc_score(y_test, y_proba_nn),
    'AUC-PR': auc_pr_nn,
    'Confusion Matrix': confusion_matrix(y_test, y_pred_nn),
    'MCC': matthews_corrcoef(y_test, y_pred_nn),
    'Log Loss': log_loss(y_test, y_proba_nn),
}

results['NeuralNetwork'] = nn_metrics  # Adding neural network metrics to the results dictionary

# Show the neural network's metrics in the same layout used for the other models
print("Neural Network results:")
for label, value in nn_metrics.items():
    print(f"{label}: {value}")
print("\n")