In [2]:
# import statements:
import pandas as pd
import numpy as np
import nltk
import joblib
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

In [3]:
# run this just in case
nltk.download('all')
nltk.download('stopwords')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/rohitkandala/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/rohitkandala/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/rohitkandala/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/rohitkandala/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/rohitkandala/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_

True

In [4]:
# reading in csv as dataframe:
yelp_data = pd.read_csv("yelp_true_sample_100k.csv")

In [5]:
# applying Summer's preprocess_text() function:
def preprocess_text(text):
    # tokenizing test, ensuring it is not case insensitive
    tokens = nltk.word_tokenize(text.lower())
    
    # removing stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # joining filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)
    
    return filtered_text

# apply function to 'text' column in yelp_data
yelp_data['text'] = yelp_data['text'].apply(preprocess_text)

In [6]:
# binary classification for yelp data based on star: 
yelp_data['sentiment'] = yelp_data['stars'].apply(lambda x: 0 if x < 3 else 1)

In [7]:
# Setting up features & target variable:
X = yelp_data['text']
y = yelp_data['sentiment']

In [8]:
# Splitting the data:
# 80% training data and 20% testing data; random state of 42--can pick any integer:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Converting text data to numerical data using tfidf_vectorizer:
tfidf_vectorizer = TfidfVectorizer()

# Fit TF-IDF vectorizer and fit to the training data
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform testing data using the same vectorizer to prevent data leakage
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [10]:
# Initialize logistic regression
logistic_clf = LogisticRegression(max_iter = 10000)

# Define hyperparameters to tune using grid search
param_grid = {
    'C': [0.1, 0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 5.0, 10.0],
    'solver': ['lbfgs', 'liblinear'],
    'penalty': ['l1', 'l2', 'None'], 
    'class_weight': ['None', 'balanced']
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=logistic_clf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit grid search to the training data
grid_search.fit(X_train_tfidf, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_

# Create a new logistic regression model using the best parameters
best_logistic_clf = LogisticRegression(C=best_params['C'], solver=best_params['solver'])

# Fit the best logistic regression model to the training data
best_logistic_clf.fit(X_train_tfidf, y_train)

# Use the best logistic regression model to predict on the testing data
y_pred = best_logistic_clf.predict(X_test_tfidf)

360 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rohitkandala/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rohitkandala/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/rohitkandala/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'non

In [11]:
print(best_params)

{'C': 3.0, 'class_weight': 'None', 'penalty': 'l2', 'solver': 'lbfgs'}


In [12]:
# checking accuracy:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.92755


In [13]:
# Calculating AUC Score:
auc_roc = roc_auc_score(y_test, y_pred)
print('AUC-ROC:', auc_roc)

AUC-ROC: 0.8861804797030562


In [14]:
# dump model into file that can be reloaded:
joblib.dump(best_logistic_clf, 'best_logistic_clf.pkl')

['best_logistic_clf.pkl']