## Cleaning Data

In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk

# Load and Clean Steam Data
steam_data = pd.read_csv("Datasets/steam.csv")
steam_data = steam_data.dropna(subset=['review_text'])  # Drop rows with missing reviews
steam_data = steam_data[['review_text', 'review_score']]  # Keep only relevant columns
steam_data = steam_data.rename(columns={'review_text': 'review', 'review_score': 'sentiment'})
steam_data['sentiment'] = steam_data['sentiment'].apply(lambda x: 'positive' if x > 0 else 'negative')

# Load and Clean Yelp Data
yelp_data = pd.read_csv("Datasets/yelp.csv", header=None)  # Load Yelp Data without a header
yelp_data = yelp_data.iloc[:, :2]  # Select the first two columns
yelp_data.columns = ['review', 'sentiment']  # Rename columns
yelp_data['sentiment'] = pd.to_numeric(yelp_data['sentiment'], errors='coerce')  # Convert sentiment to numeric
valid_sentiments = {0: 'negative', 1: 'positive'}  # Map sentiment values
yelp_data['sentiment'] = yelp_data['sentiment'].map(valid_sentiments)
yelp_data = yelp_data.dropna(subset=['review', 'sentiment'])  # Drop rows with missing or invalid values

# Load and Clean IMDb Data
imdb_data = pd.read_csv("Datasets/imdb.csv")
imdb_data = imdb_data[['review', 'sentiment']]  # Keep only relevant columns

# Load and Clean Amazon Data
amazon_data = pd.read_csv("Datasets/amazon.csv")
amazon_data = amazon_data[['Text', 'Score']]  # Keep only relevant columns
amazon_data = amazon_data.rename(columns={'Text': 'review'})  # Rename columns
amazon_data['sentiment'] = amazon_data['Score'].apply(lambda x: 'positive' if x > 3 else 'negative')
amazon_data = amazon_data.drop(columns=['Score'])  # Drop Score after processing

# Save Cleaned Datasets
steam_data.to_csv("Datasets/cleaned_steam.csv", index=False)
yelp_data.to_csv("Datasets/cleaned_yelp.csv", index=False)
imdb_data.to_csv("Datasets/cleaned_imdb.csv", index=False)
amazon_data.to_csv("Datasets/cleaned_amazon.csv", index=False)


# Tokenization

In [5]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk

# Tokenization Function with Progress Logging
def tokenize_reviews_nltk(df, review_column):
    total_rows = len(df)
    print(f"Starting tokenization for {total_rows} rows with NLTK...")
    for idx in range(total_rows):
        if idx % 1000 == 0 or idx == total_rows - 1:  # Progress log every 1000 rows or on the last row
            print(f"Tokenizing row {idx + 1} of {total_rows}...")
        df.loc[idx, review_column] = word_tokenize(str(df.loc[idx, review_column]).lower())
    return df

# Tokenize Cleaned Steam Data
print("Loading cleaned Steam data...")
steam_data = pd.read_csv("Datasets/cleaned_steam.csv")
print("Tokenizing Steam data with NLTK...")
steam_data = tokenize_reviews_nltk(steam_data, 'review')
print("Steam data tokenization complete!")

# Tokenize Cleaned Yelp Data
print("Loading cleaned Yelp data...")
yelp_data = pd.read_csv("Datasets/cleaned_yelp.csv")
print("Tokenizing Yelp data with NLTK...")
yelp_data = tokenize_reviews_nltk(yelp_data, 'review')
print("Yelp data tokenization complete!")

# Tokenize Cleaned IMDb Data
print("Loading cleaned IMDb data...")
imdb_data = pd.read_csv("Datasets/cleaned_imdb.csv")
print("Tokenizing IMDb data with NLTK...")
imdb_data = tokenize_reviews_nltk(imdb_data, 'review')
print("IMDb data tokenization complete!")

# Tokenize Cleaned Amazon Data
print("Loading cleaned Amazon data...")
amazon_data = pd.read_csv("Datasets/cleaned_amazon.csv")
print("Tokenizing Amazon data with NLTK...")
amazon_data = tokenize_reviews_nltk(amazon_data, 'review')
print("Amazon data tokenization complete!")

# Save Tokenized Data to CSV
print("Saving tokenized data...")
steam_data.to_csv("Datasets/tokenized_steam.csv", index=False)
yelp_data.to_csv("Datasets/tokenized_yelp.csv", index=False)
imdb_data.to_csv("Datasets/tokenized_imdb.csv", index=False)
amazon_data.to_csv("Datasets/tokenized_amazon.csv", index=False)
print("All tokenized data saved!")


Loading cleaned Steam data...
Tokenizing Steam data with NLTK...
Starting tokenization for 6409801 rows with NLTK...
Tokenizing row 1 of 6409801...
Tokenizing row 1001 of 6409801...
Tokenizing row 2001 of 6409801...
Tokenizing row 3001 of 6409801...
Tokenizing row 4001 of 6409801...
Tokenizing row 5001 of 6409801...
Tokenizing row 6001 of 6409801...
Tokenizing row 7001 of 6409801...
Tokenizing row 8001 of 6409801...
Tokenizing row 9001 of 6409801...
Tokenizing row 10001 of 6409801...
Tokenizing row 11001 of 6409801...
Tokenizing row 12001 of 6409801...
Tokenizing row 13001 of 6409801...
Tokenizing row 14001 of 6409801...
Tokenizing row 15001 of 6409801...
Tokenizing row 16001 of 6409801...
Tokenizing row 17001 of 6409801...
Tokenizing row 18001 of 6409801...
Tokenizing row 19001 of 6409801...
Tokenizing row 20001 of 6409801...
Tokenizing row 21001 of 6409801...
Tokenizing row 22001 of 6409801...
Tokenizing row 23001 of 6409801...
Tokenizing row 24001 of 6409801...
Tokenizing row 25001 

## TF-IDF Matrices and Vectorizers

In [5]:
#####################
####################
##TF-ID##
################
############
import pandas as pd
# Load tokenized datasets
print("Loading tokenized Steam data...")
steam_data = pd.read_csv("Datasets/tokenized_steam.csv")
print("Loading tokenized Yelp data...")
yelp_data = pd.read_csv("Datasets/tokenized_yelp.csv")
print("Loading tokenized IMDb data...")
imdb_data = pd.read_csv("Datasets/tokenized_imdb.csv")
print("Loading tokenized Amazon data...")
amazon_data = pd.read_csv("Datasets/tokenized_amazon.csv")

print("Converting tokenized reviews to strings...")
# steam_data['review'] = steam_data['review'].apply(lambda tokens: ' '.join(eval(tokens)))
yelp_data['review'] = yelp_data['review'].apply(lambda tokens: ' '.join(eval(tokens)))
imdb_data['review'] = imdb_data['review'].apply(lambda tokens: ' '.join(eval(tokens)))
amazon_data['review'] = amazon_data['review'].apply(lambda tokens: ' '.join(eval(tokens)))

from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_reviews(data, review_column, max_features=5000):
    print(f"Applying TF-IDF vectorization (max_features={max_features})...")
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(data[review_column])
    print("TF-IDF vectorization complete!")
    return tfidf_matrix, vectorizer

# Vectorize each dataset
print("Vectorizing Steam data...")
steam_tfidf, steam_vectorizer = vectorize_reviews(steam_data, 'review')

print("Vectorizing Yelp data...")
yelp_tfidf, yelp_vectorizer = vectorize_reviews(yelp_data, 'review')

print("Vectorizing IMDb data...")
imdb_tfidf, imdb_vectorizer = vectorize_reviews(imdb_data, 'review')

print("Vectorizing Amazon data...")
amazon_tfidf, amazon_vectorizer = vectorize_reviews(amazon_data, 'review')

import joblib

# Save TF-IDF matrices and vectorizers
print("Saving TF-IDF matrices and vectorizers...")
joblib.dump(steam_tfidf, "Datasets/steam_tfidf.pkl")
joblib.dump(steam_vectorizer, "Datasets/steam_vectorizer.pkl")
joblib.dump(yelp_tfidf, "Datasets/yelp_tfidf.pkl")
joblib.dump(yelp_vectorizer, "Datasets/yelp_vectorizer.pkl")
joblib.dump(imdb_tfidf, "Datasets/imdb_tfidf.pkl")
joblib.dump(imdb_vectorizer, "Datasets/imdb_vectorizer.pkl")
joblib.dump(amazon_tfidf, "Datasets/amazon_tfidf.pkl")
joblib.dump(amazon_vectorizer, "Datasets/amazon_vectorizer.pkl")
print("TF-IDF data saved!")


Loading tokenized Steam data...
Loading tokenized Yelp data...
Loading tokenized IMDb data...
Loading tokenized Amazon data...
Converting tokenized reviews to strings...
Vectorizing Steam data...
Applying TF-IDF vectorization (max_features=5000)...
TF-IDF vectorization complete!
Vectorizing Yelp data...
Applying TF-IDF vectorization (max_features=5000)...
TF-IDF vectorization complete!
Vectorizing IMDb data...
Applying TF-IDF vectorization (max_features=5000)...
TF-IDF vectorization complete!
Vectorizing Amazon data...
Applying TF-IDF vectorization (max_features=5000)...
TF-IDF vectorization complete!
Saving TF-IDF matrices and vectorizers...
TF-IDF data saved!


## Data Split

In [1]:
#####################
####################
##DATA SPLIT##
################
############

from sklearn.model_selection import train_test_split
import joblib
import pandas as pd

# Load TF-IDF matrix and sentiment labels
print("Loading TF-IDF matrix and labels for Steam data...")
steam_tfidf = joblib.load("Datasets/steam_tfidf.pkl")
steam_data = pd.read_csv("Datasets/tokenized_steam.csv")
steam_labels = steam_data['sentiment'].map({'positive': 1, 'negative': 0})  # Map labels to binary values

# Split Steam Data
print("Splitting Steam data into training and testing sets...")
X_train_steam, X_test_steam, y_train_steam, y_test_steam = train_test_split(
    steam_tfidf, steam_labels, test_size=0.2, random_state=42
)
print(f"Steam data split: {X_train_steam.shape[0]} training rows, {X_test_steam.shape[0]} testing rows.")

# Repeat for Yelp Data
print("Loading TF-IDF matrix and labels for Yelp data...")
yelp_tfidf = joblib.load("Datasets/yelp_tfidf.pkl")
yelp_data = pd.read_csv("Datasets/tokenized_yelp.csv")
yelp_labels = yelp_data['sentiment'].map({'positive': 1, 'negative': 0})

print("Splitting Yelp data into training and testing sets...")
X_train_yelp, X_test_yelp, y_train_yelp, y_test_yelp = train_test_split(
    yelp_tfidf, yelp_labels, test_size=0.2, random_state=42
)
print(f"Yelp data split: {X_train_yelp.shape[0]} training rows, {X_test_yelp.shape[0]} testing rows.")

# Repeat for IMDb Data
print("Loading TF-IDF matrix and labels for IMDb data...")
imdb_tfidf = joblib.load("Datasets/imdb_tfidf.pkl")
imdb_data = pd.read_csv("Datasets/tokenized_imdb.csv")
imdb_labels = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})

print("Splitting IMDb data into training and testing sets...")
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(
    imdb_tfidf, imdb_labels, test_size=0.2, random_state=42
)
print(f"IMDb data split: {X_train_imdb.shape[0]} training rows, {X_test_imdb.shape[0]} testing rows.")

# Repeat for Amazon Data
print("Loading TF-IDF matrix and labels for Amazon data...")
amazon_tfidf = joblib.load("Datasets/amazon_tfidf.pkl")
amazon_data = pd.read_csv("Datasets/tokenized_amazon.csv")
amazon_labels = amazon_data['sentiment'].map({'positive': 1, 'negative': 0})

print("Splitting Amazon data into training and testing sets...")
X_train_amazon, X_test_amazon, y_train_amazon, y_test_amazon = train_test_split(
    amazon_tfidf, amazon_labels, test_size=0.2, random_state=42
)
print(f"Amazon data split: {X_train_amazon.shape[0]} training rows, {X_test_amazon.shape[0]} testing rows.")


Loading TF-IDF matrix and labels for Steam data...
Splitting Steam data into training and testing sets...
Steam data split: 5127840 training rows, 1281961 testing rows.
Loading TF-IDF matrix and labels for Yelp data...
Splitting Yelp data into training and testing sets...
Yelp data split: 562 training rows, 141 testing rows.
Loading TF-IDF matrix and labels for IMDb data...
Splitting IMDb data into training and testing sets...
IMDb data split: 40000 training rows, 10000 testing rows.
Loading TF-IDF matrix and labels for Amazon data...
Splitting Amazon data into training and testing sets...
Amazon data split: 454763 training rows, 113691 testing rows.


# Model Training

## Logistic Regression

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train and Evaluate a Model
def train_and_evaluate_model(X_train, X_test, y_train, y_test, dataset_name):
    print(f"Training Logistic Regression model for {dataset_name}...")
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    print("Evaluating model...")
    y_pred = model.predict(X_test)

    # Model Performance Metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{dataset_name} Accuracy: {accuracy:.4f}")

    print(f"Classification Report for {dataset_name}:\n", classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {dataset_name}:\n", confusion_matrix(y_test, y_pred))

    return model

# Train and Evaluate on Steam Data
print("Processing Steam data...")
steam_model = train_and_evaluate_model(X_train_steam, X_test_steam, y_train_steam, y_test_steam, "Steam")

# Train and Evaluate on Yelp Data
print("Processing Yelp data...")
yelp_model = train_and_evaluate_model(X_train_yelp, X_test_yelp, y_train_yelp, y_test_yelp, "Yelp")

# Train and Evaluate on IMDb Data
print("Processing IMDb data...")
imdb_model = train_and_evaluate_model(X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb, "IMDb")

# Train and Evaluate on Amazon Data
print("Processing Amazon data...")
amazon_model = train_and_evaluate_model(X_train_amazon, X_test_amazon, y_train_amazon, y_test_amazon, "Amazon")


Processing Steam data...
Training Logistic Regression model for Steam...
Evaluating model...
Steam Accuracy: 0.8846
Classification Report for Steam:
               precision    recall  f1-score   support

           0       0.78      0.50      0.61    230912
           1       0.90      0.97      0.93   1051049

    accuracy                           0.88   1281961
   macro avg       0.84      0.74      0.77   1281961
weighted avg       0.88      0.88      0.87   1281961

Confusion Matrix for Steam:
 [[ 115795  115117]
 [  32782 1018267]]
Processing Yelp data...
Training Logistic Regression model for Yelp...
Evaluating model...
Yelp Accuracy: 0.7943
Classification Report for Yelp:
               precision    recall  f1-score   support

           0       0.81      0.77      0.79        71
           1       0.78      0.81      0.80        70

    accuracy                           0.79       141
   macro avg       0.79      0.79      0.79       141
weighted avg       0.79      0.79    

## Naive Bayes

In [3]:

from sklearn.naive_bayes import MultinomialNB

# Train and Evaluate a Naive Bayes Model
def train_and_evaluate_naive_bayes(X_train, X_test, y_train, y_test, dataset_name):
    print(f"Training Naive Bayes model for {dataset_name}...")
    model = MultinomialNB()
    model.fit(X_train, y_train)

    print("Evaluating model...")
    y_pred = model.predict(X_test)

    # Model Performance Metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{dataset_name} Naive Bayes Accuracy: {accuracy:.4f}")

    print(f"Classification Report for {dataset_name}:\n", classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {dataset_name}:\n", confusion_matrix(y_test, y_pred))

    return model

# Train and Evaluate on Steam Data
print("Processing Steam data with Naive Bayes...")
steam_nb_model = train_and_evaluate_naive_bayes(X_train_steam, X_test_steam, y_train_steam, y_test_steam, "Steam")

# Train and Evaluate on Yelp Data
print("Processing Yelp data with Naive Bayes...")
yelp_nb_model = train_and_evaluate_naive_bayes(X_train_yelp, X_test_yelp, y_train_yelp, y_test_yelp, "Yelp")

# Train and Evaluate on IMDb Data
print("Processing IMDb data with Naive Bayes...")
imdb_nb_model = train_and_evaluate_naive_bayes(X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb, "IMDb")

# Train and Evaluate on Amazon Data
print("Processing Amazon data with Naive Bayes...")
amazon_nb_model = train_and_evaluate_naive_bayes(X_train_amazon, X_test_amazon, y_train_amazon, y_test_amazon, "Amazon")


Processing Steam data with Naive Bayes...
Training Naive Bayes model for Steam...
Evaluating model...
Steam Naive Bayes Accuracy: 0.8496
Classification Report for Steam:
               precision    recall  f1-score   support

           0       0.86      0.20      0.32    230912
           1       0.85      0.99      0.92   1051049

    accuracy                           0.85   1281961
   macro avg       0.86      0.59      0.62   1281961
weighted avg       0.85      0.85      0.81   1281961

Confusion Matrix for Steam:
 [[  45403  185509]
 [   7284 1043765]]
Processing Yelp data with Naive Bayes...
Training Naive Bayes model for Yelp...
Evaluating model...
Yelp Naive Bayes Accuracy: 0.7518
Classification Report for Yelp:
               precision    recall  f1-score   support

           0       0.79      0.69      0.74        71
           1       0.72      0.81      0.77        70

    accuracy                           0.75       141
   macro avg       0.76      0.75      0.75      

## SVM

In [4]:
from sklearn.svm import LinearSVC

# Train and Evaluate an SVM Model
def train_and_evaluate_svm(X_train, X_test, y_train, y_test, dataset_name):
    print(f"Training SVM model for {dataset_name}...")
    model = LinearSVC(random_state=42, max_iter=1000)
    model.fit(X_train, y_train)

    print("Evaluating model...")
    y_pred = model.predict(X_test)

    # Model Performance Metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{dataset_name} SVM Accuracy: {accuracy:.4f}")

    print(f"Classification Report for {dataset_name}:\n", classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {dataset_name}:\n", confusion_matrix(y_test, y_pred))

    return model

# Train and Evaluate on Steam Data
print("Processing Steam data with SVM...")
steam_svm_model = train_and_evaluate_svm(X_train_steam, X_test_steam, y_train_steam, y_test_steam, "Steam")

# Train and Evaluate on Yelp Data
print("Processing Yelp data with SVM...")
yelp_svm_model = train_and_evaluate_svm(X_train_yelp, X_test_yelp, y_train_yelp, y_test_yelp, "Yelp")

# Train and Evaluate on IMDb Data
print("Processing IMDb data with SVM...")
imdb_svm_model = train_and_evaluate_svm(X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb, "IMDb")

# Train and Evaluate on Amazon Data
print("Processing Amazon data with SVM...")
amazon_svm_model = train_and_evaluate_svm(X_train_amazon, X_test_amazon, y_train_amazon, y_test_amazon, "Amazon")


Processing Steam data with SVM...
Training SVM model for Steam...
Evaluating model...
Steam SVM Accuracy: 0.8850
Classification Report for Steam:
               precision    recall  f1-score   support

           0       0.79      0.49      0.61    230912
           1       0.90      0.97      0.93   1051049

    accuracy                           0.89   1281961
   macro avg       0.84      0.73      0.77   1281961
weighted avg       0.88      0.89      0.87   1281961

Confusion Matrix for Steam:
 [[ 113570  117342]
 [  30039 1021010]]
Processing Yelp data with SVM...
Training SVM model for Yelp...
Evaluating model...
Yelp SVM Accuracy: 0.7943
Classification Report for Yelp:
               precision    recall  f1-score   support

           0       0.83      0.75      0.79        71
           1       0.77      0.84      0.80        70

    accuracy                           0.79       141
   macro avg       0.80      0.79      0.79       141
weighted avg       0.80      0.79      0.79

# Evaluate Results

In [5]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Function to evaluate and store metrics
def evaluate_model(model, X_test, y_test, dataset_name, model_name):
    print(f"Evaluating {model_name} on {dataset_name} data...")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)  # Get as dictionary

    return {
        "Dataset": dataset_name,
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision (0)": report['0']['precision'],
        "Recall (0)": report['0']['recall'],
        "F1-Score (0)": report['0']['f1-score'],
        "Precision (1)": report['1']['precision'],
        "Recall (1)": report['1']['recall'],
        "F1-Score (1)": report['1']['f1-score']
    }

# Collect results for all models and datasets
results = []

# Evaluate Steam Models
results.append(evaluate_model(steam_model, X_test_steam, y_test_steam, "Steam", "Logistic Regression"))
results.append(evaluate_model(steam_nb_model, X_test_steam, y_test_steam, "Steam", "Naive Bayes"))
results.append(evaluate_model(steam_svm_model, X_test_steam, y_test_steam, "Steam", "SVM"))

# Evaluate Yelp Models
results.append(evaluate_model(yelp_model, X_test_yelp, y_test_yelp, "Yelp", "Logistic Regression"))
results.append(evaluate_model(yelp_nb_model, X_test_yelp, y_test_yelp, "Yelp", "Naive Bayes"))
results.append(evaluate_model(yelp_svm_model, X_test_yelp, y_test_yelp, "Yelp", "SVM"))

# Evaluate IMDb Models
results.append(evaluate_model(imdb_model, X_test_imdb, y_test_imdb, "IMDb", "Logistic Regression"))
results.append(evaluate_model(imdb_nb_model, X_test_imdb, y_test_imdb, "IMDb", "Naive Bayes"))
results.append(evaluate_model(imdb_svm_model, X_test_imdb, y_test_imdb, "IMDb", "SVM"))

# Evaluate Amazon Models
results.append(evaluate_model(amazon_model, X_test_amazon, y_test_amazon, "Amazon", "Logistic Regression"))
results.append(evaluate_model(amazon_nb_model, X_test_amazon, y_test_amazon, "Amazon", "Naive Bayes"))
results.append(evaluate_model(amazon_svm_model, X_test_amazon, y_test_amazon, "Amazon", "SVM"))

# Create a summary table
results_df = pd.DataFrame(results)
print("\nModel Performance Summary:")
print(results_df)

# Save results to CSV for future reference
results_df.to_csv("Model_Performance_Summary.csv", index=False)

#The F1 score is a performance metric that combines precision and recall into a single value.
# $It is particularly useful when you want a balance between the two
#1 Score ranges from 0 to 1:
#1: Perfect balance between precision and recall.
#0: No true positives at all.


Evaluating Logistic Regression on Steam data...
Evaluating Naive Bayes on Steam data...
Evaluating SVM on Steam data...
Evaluating Logistic Regression on Yelp data...
Evaluating Naive Bayes on Yelp data...
Evaluating SVM on Yelp data...
Evaluating Logistic Regression on IMDb data...
Evaluating Naive Bayes on IMDb data...
Evaluating SVM on IMDb data...
Evaluating Logistic Regression on Amazon data...
Evaluating Naive Bayes on Amazon data...
Evaluating SVM on Amazon data...

Model Performance Summary:
   Dataset                Model  Accuracy  Precision (0)  Recall (0)  \
0    Steam  Logistic Regression  0.884631       0.779360    0.501468   
1    Steam          Naive Bayes  0.849611       0.861750    0.196625   
2    Steam                  SVM  0.885035       0.790828    0.491832   
3     Yelp  Logistic Regression  0.794326       0.808824    0.774648   
4     Yelp          Naive Bayes  0.751773       0.790323    0.690141   
5     Yelp                  SVM  0.794326       0.828125    0.7

# Experimenting with Hyperparameters

## Optimizing Logistic Regression

In [8]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # regularization limit
    'solver': ['liblinear', 'lbfgs', 'saga'],  # solvers that support regularization
    'max_iter': [100, 500, 1000]  # convergence limit
}

#### Steam

In [3]:
from sklearn.linear_model import LogisticRegression
# grid search for steam
model = LogisticRegression(random_state=42)
gs_steam = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
gs_steam.fit(X_train_steam, y_train_steam)

print("-----------STEAM LOGISTIC REGRESSION-----------")
print("Best Parameters: ", gs_steam.best_params_)
print("Best Score: ", gs_steam.best_score_)

NameError: name 'X_train_steam' is not defined

#### Yelp

In [9]:
# grid search for yelp
model = LogisticRegression(random_state=42)
gs_yelp = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
gs_yelp.fit(X_train_yelp, y_train_yelp)

print("-----------YELP LOGISTIC REGRESSION-----------")
print("Best Parameters: ", gs_yelp.best_params_)
print("Best Score: ", gs_yelp.best_score_)

-----------YELP LOGISTIC REGRESSION-----------
Best Parameters:  {'C': 100, 'max_iter': 100, 'solver': 'saga'}
Best Score:  0.7775442477876107




#### IMDB

In [35]:
# grid search for imdb
model = LogisticRegression(random_state=42)
gs_imdb = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
gs_imdb.fit(X_train_imdb, y_train_imdb)

print("-----------IMDB LOGISTIC REGRESSION-----------")
print("Best Parameters: ", gs_imdb.best_params_)
print("Best Score: ", gs_imdb.best_score_)

-----------IMDB LOGISTIC REGRESSION-----------
Best Parameters:  {'C': 1, 'max_iter': 100, 'solver': 'lbfgs'}
Best Score:  0.8881499999999999


#### Amazon

In [38]:
# grid search for amazon
model = LogisticRegression(random_state=42)
gs_amazon = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
gs_amazon.fit(X_train_amazon, y_train_amazon)

print("-----------AMAZON LOGISTIC REGRESSION-----------")
print("Best Parameters: ", gs_amazon.best_params_)
print("Best Score: ", gs_amazon.best_score_)

-----------AMAZON LOGISTIC REGRESSION-----------
Best Parameters:  {'C': 100, 'max_iter': 100, 'solver': 'liblinear'}
Best Score:  0.8967792021336269


## Optimizing Naive Bayes

In [2]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [1e-2, 1e-1, 1, 1e1, 1e2], # smoothing parameter
    'fit_prior': [True, False], # use or ignore class priors
}

#### Steam

In [None]:
model = MultinomialNB(random_state=42)
grid_search_steam = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search_steam.fit(X_train_steam, y_train_steam)

print("-----------STEAM NB-----------")
print("Best Parameters Steam: ", grid_search_steam.best_params_)
print("Best Score: ", grid_search_steam.best_score_)

#### Yelp

In [41]:
model = MultinomialNB()
grid_search_yelp = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search_yelp.fit(X_train_yelp, y_train_yelp)

print("-----------YELP NB-----------")
print("Best Parameters Yelp: ", grid_search_yelp.best_params_)
print("Best Score: ", grid_search_yelp.best_score_)

-----------YELP NB-----------
Best Parameters Yelp:  {'alpha': 1, 'fit_prior': True}
Best Score:  0.7721713021491783


#### IMDB

In [42]:
model = MultinomialNB()
grid_search_imdb = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search_imdb.fit(X_train_imdb, y_train_imdb)

print("-----------IMDB NB-----------")
print("Best Parameters IMDb: ", grid_search_imdb.best_params_)
print("Best Score: ", grid_search_imdb.best_score_)

-----------IMDB NB-----------
Best Parameters IMDb:  {'alpha': 1, 'fit_prior': False}
Best Score:  0.8533999999999999


#### Amazon

In [43]:
model = MultinomialNB()
grid_search_amazon = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search_amazon.fit(X_train_amazon, y_train_amazon)

print("-----------AMAZON NB-----------")
print("Best Parameters Amazon: ", grid_search_amazon.best_params_)
print("Best Score: ", grid_search_amazon.best_score_)

-----------AMAZON NB-----------
Best Parameters Amazon:  {'alpha': 10.0, 'fit_prior': False}
Best Score:  0.8390612234941415


## Optimizing SVM

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # regularization limit
    'penalty': ['l1', 'l2'],
    'loss': ['squared_hinge'], # loss function
    'dual': [True, False],
    'max_iter': [100, 500, 1000, 5000]  # convergence limit
}

#### Steam

In [None]:
model = LinearSVC(random_state=42)
grid_search_steam = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search_steam.fit(X_train_steam, y_train_steam)

print("-----------STEAM SVM-----------")
print("Best Parameters Steam: ", grid_search_steam.best_params_)
print("Best Score: ", grid_search_steam.best_score_)

#### Yelp

In [14]:
model = LinearSVC(random_state=42)
grid_search_yelp = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, error_score=0.0)
grid_search_yelp.fit(X_train_yelp, y_train_yelp)

print("-----------YELP SVM-----------")
print("Best Parameters Yelp: ", grid_search_yelp.best_params_)
print("Best Score: ", grid_search_yelp.best_score_)

-----------YELP SVM-----------
Best Parameters Yelp:  {'C': 1, 'dual': True, 'loss': 'squared_hinge', 'max_iter': 100, 'penalty': 'l2'}
Best Score:  0.7757743362831858


150 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to 0.0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 317, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/svm/_base.py", line 1214, in _fi

#### IMDB

In [15]:
model = LinearSVC(random_state=42)
grid_search_imdb = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search_imdb.fit(X_train_imdb, y_train_imdb)

print("-----------IMDB SVM-----------")
print("Best Parameters IMDb: ", grid_search_imdb.best_params_)
print("Best Score: ", grid_search_imdb.best_score_)

150 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 317, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/svm/_base.py", line 1214, in _fi

-----------IMDB SVM-----------
Best Parameters IMDb:  {'C': 0.1, 'dual': False, 'loss': 'squared_hinge', 'max_iter': 100, 'penalty': 'l2'}
Best Score:  0.8888


#### Amazon

In [17]:
model = LinearSVC(random_state=42)
grid_search_amazon = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search_amazon.fit(X_train_amazon, y_train_amazon)

print("-----------AMAZON SVM-----------")
print("Best Parameters Amazon: ", grid_search_amazon.best_params_)
print("Best Score: ", grid_search_amazon.best_score_)

120 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 317, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/home/jcgarza/.local/lib/python3.10/site-packages/sklearn/svm/_base.py", line 1214, in _fi

-----------AMAZON SVM-----------
Best Parameters Amazon:  {'C': 1, 'dual': False, 'loss': 'squared_hinge', 'max_iter': 500, 'penalty': 'l1'}
Best Score:  0.8967572126577895
