## Dataload

In [1]:
from dataloader import dataloader_steam as steam_loader

train_data, dev_data, test_data = steam_loader.load_data()


print(train_data.head())
train_data.shape, dev_data.shape, test_data.shape

                                                text  label
0  Review: Nice coop game, Playtime: 19, Voted Up...      0
1  Review: It's actually pretty good i recommend ...      1
2  Review: This game is glitchy as sh!t. It's fun...      1
3  Review: This game is all I hoped for and more....      1
4  Review: Fun discoveries and challenge at every...      1


((1168, 2), (146, 2), (147, 2))

## MultinomialNB

In [2]:
from sklearn.pipeline import Pipeline
import os
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

### result

In [3]:
model = Pipeline([
    ('vectorizer',TfidfVectorizer()),  # Step 1: Text data transformation
    ('nb', MultinomialNB())  # Step 2: Classification using Naive Bayes
])

x_train = train_data['text']
y_train = train_data['label']

x_test = test_data['text']
y_test = test_data['label']


model.fit(x_train, y_train)

y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

roc_auc_train = roc_auc_score(y_train, y_pred_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print("\nTrain ROC AUC:", roc_auc_train)
print("Test ROC AUC:", roc_auc_test)
print("\nTrain Accuracy:", acc_score_train)
print("Test Accuracy:", acc_score_test)
print("\nTrain Classification Report:\n", classification_report(y_test, y_pred_test))


Train ROC AUC: 0.8908766103059581
Test ROC AUC: 0.7918956043956044

Train Accuracy: 0.916095890410959
Test Accuracy: 0.8231292517006803

Train Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.92      0.87        91
           1       0.84      0.66      0.74        56

    accuracy                           0.82       147
   macro avg       0.83      0.79      0.80       147
weighted avg       0.83      0.82      0.82       147



### save log

In [4]:
# Create logs directory if it doesn't exist
if not os.path.exists('logs'):
    os.makedirs('logs')

# Define the log file name
log_file_name = f"MultinomialNB-steam.log"
log_file_path = os.path.join('logs', log_file_name)

# Save the results to the log file
with open(log_file_path, 'w') as log_file:
    log_file.write(f"{datetime.now().strftime('%Y%m%d-%H%M%S')}\n")
    log_file.write(f"Train ROC AUC: {roc_auc_train}\n")
    log_file.write(f"Test ROC AUC: {roc_auc_test}\n")
    log_file.write(f"Train Accuracy: {acc_score_train}\n")
    log_file.write(f"Test Accuracy: {acc_score_test}\n")
    log_file.write(f"Train Classification Report:\n{classification_report(y_train, y_pred_train)}\n")
    log_file.write(f"Test Classification Report:\n{classification_report(y_test, y_pred_test)}\n")


## LinearMVC

In [5]:
model = Pipeline([
    ('vectorizer', TfidfVectorizer()), 
    ('LinearSVC', LinearSVC(random_state=42)) 
])

model.fit(x_train, y_train)

y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

roc_auc_train = roc_auc_score(y_train, y_pred_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print("\nTrain ROC AUC:", roc_auc_train)
print("Test ROC AUC:", roc_auc_test)
print("\nTrain Accuracy:", acc_score_train)
print("Test Accuracy:", acc_score_test)
print("\nTrain Classification Report:\n", classification_report(y_test, y_pred_test))


Train ROC AUC: 1.0
Test ROC AUC: 0.7671703296703296

Train Accuracy: 1.0
Test Accuracy: 0.7755102040816326

Train Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.82        91
           1       0.69      0.73      0.71        56

    accuracy                           0.78       147
   macro avg       0.76      0.77      0.76       147
weighted avg       0.78      0.78      0.78       147



In [6]:
# Create logs directory if it doesn't exist
if not os.path.exists('logs'):
    os.makedirs('logs')

# Define the log file name
log_file_name = f"LinearMVC-steam.log"
log_file_path = os.path.join('logs', log_file_name)

# Save the results to the log file
with open(log_file_path, 'w') as log_file:
    log_file.write(f"{datetime.now().strftime('%Y%m%d-%H%M%S')}\n")
    log_file.write(f"Train ROC AUC: {roc_auc_train}\n")
    log_file.write(f"Test ROC AUC: {roc_auc_test}\n")
    log_file.write(f"Train Accuracy: {acc_score_train}\n")
    log_file.write(f"Test Accuracy: {acc_score_test}\n")
    log_file.write(f"Train Classification Report:\n{classification_report(y_train, y_pred_train)}\n")
    log_file.write(f"Test Classification Report:\n{classification_report(y_test, y_pred_test)}\n")


## Logistic Regression

In [None]:
import torch
from model.logistic_regression_model import LogisticRegressionModel
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
cannot import name 'default_hp_search_backend' from 'transformers.integrations' (d:\Anaconda\envs\DASC7606\lib\site-packages\transformers\integrations\__init__.py)

In [None]:
train_text_list = train_data['text'].tolist()
dev_text_list = dev_data['text'].tolist()
test_text_list = test_data['text'].tolist()

train_text_embeddings = get_texts_embedding(train_text_list)
dev_text_embeddings = get_texts_embedding(dev_text_list)
test_text_embeddings = get_texts_embedding(test_text_list)

train_text_embeddings = torch.tensor(train_text_embeddings)
dev_text_embeddings = torch.tensor(dev_text_embeddings)
test_text_embeddings = torch.tensor(test_text_embeddings)

print(train_text_embeddings.shape)
print(dev_text_embeddings.shape)
print(test_text_embeddings.shape)

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
cannot import name 'is_torchvision_v2_available' from 'transformers.utils.import_utils' (d:\Anaconda\envs\DASC7606\lib\site-packages\transformers\utils\import_utils.py)

In [None]:
train_y_true = torch.tensor(y_train.values).float().unsqueeze(1)
dev_y_true = torch.tensor(dev_data['label'].values).float().unsqueeze(1)
test_y_true = torch.tensor(test_data['label'].values).float().unsqueeze(1)

train_dataloader = TensorDataset(train_text_embeddings, train_y_true)
train_dataloader = DataLoader(train_dataloader, batch_size=32, shuffle=True)
dev_dataloader = TensorDataset(dev_text_embeddings, dev_y_true)
dev_dataloader = DataLoader(dev_dataloader, batch_size=32, shuffle=False)
test_dataloader = TensorDataset(test_text_embeddings, test_y_true)
test_dataloader = DataLoader(test_dataloader, batch_size=32, shuffle=False)



In [None]:
model = LogisticRegressionModel(384)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Eval:::Epoch [1/3000], Loss: 0.6489, Dev Accuracy: 0.6507, Dev F1 Score: 0.0377
Eval:::Epoch [2/3000], Loss: 0.6178, Dev Accuracy: 0.6575, Dev F1 Score: 0.0741
Eval:::Epoch [3/3000], Loss: 0.5642, Dev Accuracy: 0.6712, Dev F1 Score: 0.1724
Eval:::Epoch [4/3000], Loss: 0.6113, Dev Accuracy: 0.6781, Dev F1 Score: 0.2034
Eval:::Epoch [5/3000], Loss: 0.5733, Dev Accuracy: 0.6918, Dev F1 Score: 0.2623
Eval:::Epoch [6/3000], Loss: 0.4711, Dev Accuracy: 0.7260, Dev F1 Score: 0.4118
Eval:::Epoch [7/3000], Loss: 0.5796, Dev Accuracy: 0.7260, Dev F1 Score: 0.4286
Eval:::Epoch [8/3000], Loss: 0.5081, Dev Accuracy: 0.7260, Dev F1 Score: 0.4444
Eval:::Epoch [9/3000], Loss: 0.4653, Dev Accuracy: 0.7329, Dev F1 Score: 0.4800
Eval:::Epoch [10/3000], Loss: 0.3912, Dev Accuracy: 0.7397, Dev F1 Score: 0.5128
Eval:::Epoch [11/3000], Loss: 0.5104, Dev Accuracy: 0.7466, Dev F1 Score: 0.5432
Eval:::Epoch [12/3000], Loss: 0.3475, Dev Accuracy: 0.7534, Dev F1 Score: 0.5714
Eval:::Epoch [13/3000], Loss: 0.4656,

## 