## Yelp Data: Support Vector Machines


In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

#suppress warnings
import sys
import warnings
import os

# Silence Python-level warnings for the rest of the notebook, in particular the
# ConvergenceWarning emitted when max_iter terminates a solver before total
# model convergence.
# https://stackoverflow.com/questions/53784971/how-to-disable-convergencewarning-using-sklearn
if not sys.warnoptions:
    # Only applied when no -W option was passed on the command line.
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

from sklearn.exceptions import ConvergenceWarning

# Instantiating the class (ConvergenceWarning('ignore')) would only build an
# unused exception object; registering a filter is what suppresses the warning.
warnings.simplefilter("ignore", category=ConvergenceWarning)
# Blanket filter: hides every remaining warning category as well.
warnings.filterwarnings('ignore')

In [None]:
#ALL PIPELINE CODE BELOW BY: Summer Long
#Added and modified print statements somewhat to fit into single cell

# Fetch the NLTK resources used by the pre-processing cell; downloads are
# skipped when the data is already up to date locally.
# To grab every NLTK resource instead, run this just in case:
#nltk.download('all')
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
nltk.download('punkt')      # Punkt tokenizer models needed by nltk.word_tokenize
# NOTE(review): newer NLTK releases look up 'punkt_tab' instead of 'punkt';
# on older releases this call just reports an unknown package and returns False.
nltk.download('punkt_tab')

### Pre-process and run TF-IDF

In [6]:

# Summer Long's code, continued.

# Load the naturally imbalanced sample: some models are tolerant of class
# imbalance, and the other dataset is now imbalanced anyway due to category
# selection, so the natural distribution might as well be used.
YELP_CSV_PATH = "yelp_true_sample_100k.csv"
yelp_data = pd.read_csv(YELP_CSV_PATH)

# Stop-word sets already loaded from NLTK, keyed by language name.
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words_for(language):
    """Return the NLTK stop words for `language` as a frozenset, loading them once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Lower-case and tokenize a piece of text, then drop English stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # tokenizing text; lower-casing first makes the stop-word match case-insensitive
    tokens = nltk.word_tokenize(text.lower())

    # removing stop words (the set is cached so it is not rebuilt for every row)
    stop_words = _stop_words_for('english')
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # joining filtered tokens back into a string
    return ' '.join(filtered_tokens)

# apply function to 'text' column in yelp_data
print("Pre-processing yelp data text...")
yelp_data['text'] = yelp_data['text'].apply(preprocess_text)

# create binary category based on star rating:
# 1-2 stars -> 0 (negative), 3 stars and above -> 1 (positive)
print("Creating binary category based on star rating...")
yelp_data['sentiment'] = yelp_data['stars'].apply(lambda x: 0 if x <= 2 else 1)
print("Binary category complete")

# positive class rate, as a percentage of the rows actually loaded
# (derived from the frame's length rather than an assumed row count)
positive_class_rate = (yelp_data['sentiment'].sum() / len(yelp_data)) * 100
print(f"\nThe positive class rate is: {positive_class_rate}")

# negative class rate is the complement, since sentiment is binary
negative_class_rate = 100 - positive_class_rate
print(f"The negative class rate is: {negative_class_rate}\n")

# Splitting into train/test with 80/20 split
TEST_SIZE = 0.2
print("Splitting into train and test data...")
print(f"Training data = {1 - TEST_SIZE}, test data = {TEST_SIZE}")
X_train, X_test, y_train, y_test = train_test_split(yelp_data['text'],
                                                    yelp_data['sentiment'],
                                                    test_size=TEST_SIZE,
                                                    random_state=123)  # fixed seed: same split on every run

print("Applying TF-IDF vectorizer to training data...")
# Define TF-IDF vectorizer and fit to the training data
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

print("Applying TF-IDF vector to test data...")
# Transform testing data using the same vectorizer to prevent data leakage
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Pipeline complete")


Pre-processing yelp data text...
Creating binary category based on star rating...
Binary category complete

The positive class rate is: 76.912
The negative class rate is: 23.087999999999994

Splitting into train and test data...
Training data = 0.8, test data = 0.2
Applying TF-IDF vectorizer to training data...
Applying TF-IDF vector to test data...
Pipeline complete


In [39]:
# Checking the size of the vectorized test set: as the cell's last expression,
# its repr is displayed (shape, dtype and number of stored elements)

X_test_tfidf

<20000x65910 sparse matrix of type '<class 'numpy.float64'>'
	with 899405 stored elements in Compressed Sparse Row format>

In [7]:
# Use best hyperparameters from grid search (other file)
# to create the best SVM classifier.
# Manually inputted so you can skip the grid search.
best_svc = SVC(C=1.0,
               gamma='scale',
               kernel='rbf',
               max_iter=10000,    # hard cap on solver iterations
               probability=True,  # enables predict_proba, used for AUC-ROC below
               random_state=123,  # seeds the shuffling behind the probability estimates so reruns match
               verbose=True)

# Fit the best SVC to the training data
best_svc.fit(X_train_tfidf, y_train)

# Predict sentiment of the testing data with model
y_pred = best_svc.predict(X_test_tfidf)

# Calculate the raw accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# create and print classification report of model
# this allows us to evaluate the performance of both classes
# this is important due to class imbalance
print(classification_report(y_test, y_pred))

# predict probabilities for the test data using the SVC classifier
# (column 1 is the probability of the positive class, label 1)
#TODO: Determine whether this is apposite or another method is needed
# NOTE(review): roc_auc_score also accepts non-thresholded scores such as
# best_svc.decision_function(X_test_tfidf), which would not require
# probability=True -- confirm nothing downstream needs predict_proba before switching.
y_pred_proba = best_svc.predict_proba(X_test_tfidf)[:, 1]

# calculate the AUC-ROC metric
# this is valuable due to class imbalance
auc_roc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC: {auc_roc}")

#RUNTIME: 51m 49s

[LibSVM].........WARN: libsvm Solver reached max_iter
optimization finished, #iter = 10000
obj = -9051.592134, rho = -0.550468
nSV = 17756, nBSV = 10578
Total nSV = 17756
.........WARN: libsvm Solver reached max_iter
optimization finished, #iter = 10000
obj = -9077.576950, rho = -0.544317
nSV = 17807, nBSV = 10718
Total nSV = 17807
.........WARN: libsvm Solver reached max_iter
optimization finished, #iter = 10000
obj = -8975.722721, rho = -0.562294
nSV = 17710, nBSV = 10451
Total nSV = 17710
.........WARN: libsvm Solver reached max_iter
optimization finished, #iter = 10000
obj = -9103.715108, rho = -0.541533
nSV = 17816, nBSV = 10747
Total nSV = 17816
.........WARN: libsvm Solver reached max_iter
optimization finished, #iter = 10000
obj = -9034.102587, rho = -0.327828
nSV = 17735, nBSV = 10534
Total nSV = 17735
.........WARN: libsvm Solver reached max_iter
optimization finished, #iter = 10000
obj = -10710.578558, rho = 0.505837
nSV = 19022, nBSV = 13951
Total nSV = 19022
Accuracy: 0.92

In [9]:
# Persist the fitted classifier to disk so it can be reloaded later
# without refitting
import joblib

MODEL_PATH = 'best_svm.pkl'
joblib.dump(best_svc, MODEL_PATH)

['best_svm.pkl']

### Full model readout:


In [None]:

#Hyperparameters:
#{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 10000}

# Accuracy: 0.92375
#               precision    recall  f1-score   support

#            0       0.86      0.81      0.83      4669
#            1       0.94      0.96      0.95     15331

#     accuracy                           0.92     20000
#    macro avg       0.90      0.88      0.89     20000
# weighted avg       0.92      0.92      0.92     20000

# AUC-ROC: 0.9680602266214099