In [34]:
import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier

In [None]:
# Fetch only the NLTK resources this notebook uses, rather than
# nltk.download("all"), which pulls every corpus and model NLTK distributes.
#   - "punkt" / "punkt_tab": tokenizer data behind nltk.word_tokenize
#     (newer NLTK releases look for "punkt_tab", older ones for "punkt")
#   - "stopwords": the English stop-word list used during preprocessing
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

In [17]:
# Load the review sample that keeps its natural class imbalance.
# Rationale: some models tolerate imbalance, and the other dataset ended up
# imbalanced anyway after category selection, so the naturally imbalanced
# sample is the one used here.
yelp_data = pd.read_csv(filepath_or_buffer="yelp_true_sample_100k.csv")

In [22]:
def _english_stop_words():
    """Return NLTK's English stop words as a set, built once and then reused."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Lowercase and tokenize a piece of text, then drop English stop words.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float ``NaN`` are treated as empty text
        instead of raising on ``.lower()``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # missing values carry no tokens, so they map to an empty document
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # tokenizing text; lowercasing first makes the processing case-insensitive
    tokens = nltk.word_tokenize(text.lower())

    # removing stop words (the set is cached so it is not rebuilt on every call)
    stop_words = _english_stop_words()
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # joining filtered tokens back into a string
    return " ".join(filtered_tokens)


# overwrite each entry of the 'text' column with its preprocessed form
yelp_data["text"] = yelp_data["text"].map(preprocess_text)

In [23]:
# binary sentiment label derived from the star rating:
# a rating of 2 or lower becomes 0, anything else becomes 1
yelp_data["sentiment"] = yelp_data["stars"].le(2).map({True: 0, False: 1})

In [45]:
# positive class rate, as a percentage of all rows
# (the denominator is the actual row count rather than a hard-coded 100000,
# so the figure stays correct if the sample size changes)
(yelp_data["sentiment"].sum() / len(yelp_data)) * 100

76.912

In [46]:
# negative class rate, as a percentage of all rows
# (complement of the positive rate; the denominator is the actual row count
# rather than a hard-coded 100000)
100 - (yelp_data["sentiment"].sum() / len(yelp_data)) * 100

23.087999999999994

In [30]:
# TF-IDF vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# 80/20 train/test partition of the review text and its sentiment label;
# the fixed random_state makes the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    yelp_data["text"],
    yelp_data["sentiment"],
    random_state=123,
    test_size=0.2,
)

# Fit the vectorizer on the training text only, then reuse that fitted
# vectorizer for the test text so nothing from the test set leaks into it
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [31]:
# Initialize classifier
xgb_clf = XGBClassifier()

# Define hyperparameters to tune using grid search
param_grid = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.5],
    "n_estimators": [50, 100, 200],
}

# Define grid search with 5-fold cross-validation.
# refit=True (the default, spelled out here) retrains the best parameter
# combination on the whole training set once the search finishes.
grid_search = GridSearchCV(
    estimator=xgb_clf, param_grid=param_grid, cv=5, n_jobs=-1, refit=True
)

# Fit grid search to the training data
grid_search.fit(X_train_tfidf, y_train)

# Get best hyperparameters from the grid search
best_params = grid_search.best_params_

# Reuse the estimator the grid search already refit with the best
# hyperparameters on the full training data, instead of constructing and
# training an identical model a second time
best_xgb_clf = grid_search.best_estimator_

# Predict sentiment of the testing data with model
y_pred = best_xgb_clf.predict(X_test_tfidf)

In [37]:
# show the hyperparameter combination the grid search selected
print(f"{best_params}")

{'learning_rate': 0.5, 'max_depth': 7, 'n_estimators': 200}


In [32]:
# Raw accuracy: compare the test labels against the model's predictions
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.91395


In [35]:
# Per-class precision / recall / F1 for the test predictions.
# With imbalanced classes, a per-class breakdown shows how each class is
# handled, which a single accuracy figure cannot.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.85      0.77      0.81      4669
           1       0.93      0.96      0.94     15331

    accuracy                           0.91     20000
   macro avg       0.89      0.86      0.88     20000
weighted avg       0.91      0.91      0.91     20000



In [36]:
# Predicted probabilities for the test data; column 1 of predict_proba's
# output is kept as the score for class 1
y_pred_proba = best_xgb_clf.predict_proba(X=X_test_tfidf)[:, 1]

# AUC-ROC from those scores
# (a useful complement to accuracy given the class imbalance)
auc_roc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"AUC-ROC: {auc_roc}")

AUC-ROC: 0.9608737032193948


In [39]:
# Dump the fitted artifacts to files that can be reloaded.
# The classifier was trained on the output of tfidf_vectorizer, so the fitted
# vectorizer is saved as well: new text has to go through that same
# vectorizer before the reloaded model can score it.
joblib.dump(tfidf_vectorizer, "tfidf_vectorizerv1.pkl")

# The model is dumped last so the cell still displays ['xgboostv1.pkl']
joblib.dump(best_xgb_clf, "xgboostv1.pkl")

['xgboostv1.pkl']