## Support Vector Machine (SVM)

#### Bag of Words

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the tokenized review sample. `reviewTextTotal` is the text fed to the
# vectorizer and `overall` is the target label.
df = pd.read_csv('../../../data/processed/tokenized_data_sample_with_custom_stopwords.csv')

# Replace missing review text with empty strings *before* splitting.
# Filling the split results with `inplace=True` mutates objects derived from
# `df` and can trigger pandas' SettingWithCopyWarning; filling once up front
# yields the same data without in-place mutation and keeps the cell idempotent.
review_text = df['reviewTextTotal'].fillna('')

# Same split as before: identical row count and random_state give identical
# train/test indices.
X_train, X_test, y_train, y_test = train_test_split(
    review_text, df['overall'], test_size=0.2, random_state=42
)

# Initialize and fit the CountVectorizer to transform text data into bag-of-words vectors.
# The vocabulary is learned from the training split only, then reused on the
# test split so no test-set information leaks into the features.
vectorizer = CountVectorizer()

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Initialize and train the SVM model (linear kernel)
model = SVC(kernel='linear')
model.fit(X_train_bow, y_train)

# Predict the target variable for test data
y_pred = model.predict(X_test_bow)

# Evaluate the model: per-class precision / recall / F1 on the held-out split
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         1.0       0.70      0.78      0.74      1236
         2.0       0.50      0.47      0.48       759
         3.0       0.51      0.56      0.54      1082
         4.0       0.59      0.52      0.55      1669
         5.0       0.88      0.89      0.88      5254

    accuracy                           0.74     10000
   macro avg       0.64      0.64      0.64     10000
weighted avg       0.74      0.74      0.74     10000



#### Word2Vec

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Input locations: the labelled review sample and its precomputed Word2Vec
# feature matrix.
REVIEWS_CSV = '../../../data/processed/tokenized_data_sample_with_custom_stopwords.csv'
WORD2VEC_NPY = '../../../data/processed/sample_vectorized_features_with_custom_stopwords_word2vec.npy'

# Read the review table (source of the target labels) and the saved feature array.
df = pd.read_csv(REVIEWS_CSV)
vectorized_features = np.load(WORD2VEC_NPY)

# Wrap the array in a DataFrame whose columns are named
# vec_feature_0, vec_feature_1, ...
feature_vectors = pd.DataFrame(vectorized_features).add_prefix('vec_feature_')

# Features and target
X = feature_vectors
y = df['overall']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear-kernel SVM on the training split (`fit` returns the estimator itself).
model = SVC(kernel='linear').fit(X_train, y_train)

# Score the held-out rows and print the per-class report.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

         1.0       0.75      0.35      0.48      1236
         2.0       0.84      0.19      0.31       759
         3.0       0.69      0.25      0.37      1082
         4.0       0.86      0.19      0.31      1669
         5.0       0.61      0.99      0.75      5254

    accuracy                           0.63     10000
   macro avg       0.75      0.39      0.45     10000
weighted avg       0.70      0.63      0.57     10000

