In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
# run this just in case
nltk.download("all")
nltk.download("stopwords")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/summerlong/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/summerlong/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/summerlong/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/summerlong/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/summerlong/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    |

True

In [3]:
yelp_data = pd.read_csv("yelp_true_sample_100k.csv")

In [4]:
def preprocess_text(text):
    # tokenizing test, ensuring it is not case insensitive
    tokens = nltk.word_tokenize(text.lower())

    # removing stop words
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # joining filtered tokens back into a string
    filtered_text = " ".join(filtered_tokens)

    return filtered_text


# apply function to 'text' column in yelp_data
yelp_data["text"] = yelp_data["text"].apply(preprocess_text)

In [5]:
# create binary category based on star rating
yelp_data["sentiment"] = yelp_data["stars"].apply(lambda x: 0 if x <= 2 else 1)

In [6]:
# Splitting into train/test with 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    yelp_data["text"], yelp_data["sentiment"], test_size=0.2, random_state=123
)

# Define TF-IDF vectorizer and fit to the training data
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform testing data using the same vectorizer to prevent data leakage
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [16]:
# Initialize random forest classifier
rf_clf = RandomForestClassifier()

# Define hyperparameters to tune using grid search
param_grid = {
    "max_depth": [3, 5, 7],
    "n_estimators": [10, 50, 100],
    "max_features": ["sqrt", "log2"],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Define grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit grid search to the training data
grid_search.fit(X_train_tfidf, y_train)

# Get best hyperparameters from the grid search
best_params = grid_search.best_params_

# Use best hyperparameters to create the best random forest model
best_rf_clf = RandomForestClassifier(
    max_depth=best_params["max_depth"],
    n_estimators=best_params["n_estimators"],
    max_features=best_params["max_features"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
)

# Fit the best random forest model to the training data
best_rf_clf.fit(X_train_tfidf, y_train)

# Predict sentiment of the testing data with model
y_pred = best_rf_clf.predict(X_test_tfidf)

In [20]:
print(best_params)

{'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10}


In [21]:
# Calculate the raw accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7689


In [22]:
# create and print classification report of model
# this allows us to evaluate the performance of both classes
# this is important due to class imbalance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.01      0.02      4669
           1       0.77      1.00      0.87     15331

    accuracy                           0.77     20000
   macro avg       0.86      0.51      0.44     20000
weighted avg       0.81      0.77      0.67     20000



In [23]:
# predict probabilities for the test data using the XGBoost classifier
y_pred_proba = best_rf_clf.predict_proba(X_test_tfidf)[:, 1]

# calculate the AUC-ROC metric
# this is valuable due to class imbalance
auc_roc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC: {auc_roc}")

AUC-ROC: 0.80943130985827


In [24]:
# dump model to file that can be reloaded

import joblib

joblib.dump(best_rf_clf, "randomforestv1.pkl")

['randomforestv1.pkl']

In [25]:
# attempt to improve with oversampling
from imblearn.over_sampling import SMOTE

# Create SMOTE object
smote = SMOTE()

# Oversample minority class
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

grid_searchv2 = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit grid search to the training data
grid_searchv2.fit(X_train_resampled, y_train_resampled)

# Get best hyperparameters from the grid search
best_paramsv2 = grid_search.best_params_

# Use best hyperparameters to create the best random forest model
best_rf_clfv2 = RandomForestClassifier(
    max_depth=best_paramsv2["max_depth"],
    n_estimators=best_paramsv2["n_estimators"],
    max_features=best_paramsv2["max_features"],
    min_samples_split=best_paramsv2["min_samples_split"],
    min_samples_leaf=best_paramsv2["min_samples_leaf"],
)

# Fit the best random forest model to the training data
best_rf_clfv2.fit(X_train_resampled, y_train_resampled)

# Predict sentiment of the testing data with model
y_predv2 = best_rf_clfv2.predict(X_test_tfidf)

In [26]:
# Calculate the raw accuracy of the model
accuracyv2 = accuracy_score(y_test, y_predv2)
print(f"Accuracy: {accuracyv2}")

Accuracy: 0.78855


In [32]:
print(best_paramsv2)

{'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10}


In [27]:
# create and print classification report of model
# this allows us to evaluate the performance of both classes
# this is important due to class imbalance
print(classification_report(y_test, y_predv2))

              precision    recall  f1-score   support

           0       0.55      0.50      0.53      4669
           1       0.85      0.88      0.86     15331

    accuracy                           0.79     20000
   macro avg       0.70      0.69      0.69     20000
weighted avg       0.78      0.79      0.78     20000



In [29]:
# predict probabilities for the test data using the XGBoost classifier
y_pred_probav2 = best_rf_clfv2.predict_proba(X_test_tfidf)[:, 1]

# calculate the AUC-ROC metric
# this is valuable due to class imbalance
auc_rocv2 = roc_auc_score(y_test, y_pred_probav2)
print(f"AUC-ROC: {auc_rocv2}")

AUC-ROC: 0.7853279161922995


In [31]:
import joblib

joblib.dump(best_rf_clfv2, "randomforestv2.pkl")

['randomforestv2.pkl']