## Dataload

In [3]:
from dataloader import dataloader_steam as steam_loader

train_data, dev_data, test_data = steam_loader.load_data()


print(train_data.head())
train_data.shape, dev_data.shape, test_data.shape

                                                text  label
0  Review: Nice coop game, Playtime: 19, Voted Up...      0
1  Review: It's actually pretty good i recommend ...      1
2  Review: This game is glitchy as sh!t. It's fun...      1
3  Review: This game is all I hoped for and more....      1
4  Review: Fun discoveries and challenge at every...      1


((1168, 2), (146, 2), (147, 2))

## MultinomialNB

In [4]:
from sklearn.pipeline import Pipeline
import os
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

### result

In [5]:
model = Pipeline([
    ('vectorizer',TfidfVectorizer()),  # Step 1: Text data transformation
    ('nb', MultinomialNB())  # Step 2: Classification using Naive Bayes
])

x_train = train_data['text']
y_train = train_data['label']

x_test = test_data['text']
y_test = test_data['label']


model.fit(x_train, y_train)

y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

roc_auc_train = roc_auc_score(y_train, y_pred_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print("\nTrain ROC AUC:", roc_auc_train)
print("Test ROC AUC:", roc_auc_test)
print("\nTrain Accuracy:", acc_score_train)
print("Test Accuracy:", acc_score_test)
print("\nTrain Classification Report:\n", classification_report(y_test, y_pred_test))


Train ROC AUC: 0.8908766103059581
Test ROC AUC: 0.7918956043956044

Train Accuracy: 0.916095890410959
Test Accuracy: 0.8231292517006803

Train Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.92      0.87        91
           1       0.84      0.66      0.74        56

    accuracy                           0.82       147
   macro avg       0.83      0.79      0.80       147
weighted avg       0.83      0.82      0.82       147



### save log

In [6]:
# Create logs directory if it doesn't exist
if not os.path.exists('logs'):
    os.makedirs('logs')

# Define the log file name
log_file_name = f"MultinomialNB-steam.log"
log_file_path = os.path.join('logs', log_file_name)

# Save the results to the log file
with open(log_file_path, 'w') as log_file:
    log_file.write(f"{datetime.now().strftime('%Y%m%d-%H%M%S')}\n")
    log_file.write(f"Train ROC AUC: {roc_auc_train}\n")
    log_file.write(f"Test ROC AUC: {roc_auc_test}\n")
    log_file.write(f"Train Accuracy: {acc_score_train}\n")
    log_file.write(f"Test Accuracy: {acc_score_test}\n")
    log_file.write(f"Train Classification Report:\n{classification_report(y_train, y_pred_train)}\n")
    log_file.write(f"Test Classification Report:\n{classification_report(y_test, y_pred_test)}\n")


## LinearMVC

In [7]:
model = Pipeline([
    ('vectorizer', TfidfVectorizer()), 
    ('LinearSVC', LinearSVC(random_state=42)) 
])

model.fit(x_train, y_train)

y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

roc_auc_train = roc_auc_score(y_train, y_pred_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print("\nTrain ROC AUC:", roc_auc_train)
print("Test ROC AUC:", roc_auc_test)
print("\nTrain Accuracy:", acc_score_train)
print("Test Accuracy:", acc_score_test)
print("\nTrain Classification Report:\n", classification_report(y_test, y_pred_test))


Train ROC AUC: 1.0
Test ROC AUC: 0.7671703296703296

Train Accuracy: 1.0
Test Accuracy: 0.7755102040816326

Train Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.82        91
           1       0.69      0.73      0.71        56

    accuracy                           0.78       147
   macro avg       0.76      0.77      0.76       147
weighted avg       0.78      0.78      0.78       147



In [8]:
# Create logs directory if it doesn't exist
if not os.path.exists('logs'):
    os.makedirs('logs')

# Define the log file name
log_file_name = f"LinearMVC-steam.log"
log_file_path = os.path.join('logs', log_file_name)

# Save the results to the log file
with open(log_file_path, 'w') as log_file:
    log_file.write(f"{datetime.now().strftime('%Y%m%d-%H%M%S')}\n")
    log_file.write(f"Train ROC AUC: {roc_auc_train}\n")
    log_file.write(f"Test ROC AUC: {roc_auc_test}\n")
    log_file.write(f"Train Accuracy: {acc_score_train}\n")
    log_file.write(f"Test Accuracy: {acc_score_test}\n")
    log_file.write(f"Train Classification Report:\n{classification_report(y_train, y_pred_train)}\n")
    log_file.write(f"Test Classification Report:\n{classification_report(y_test, y_pred_test)}\n")


## Logistic Regression

In [14]:
import torch
from model.logistic_regression_model import LogisticRegressionModel
from module.text_embed import get_texts_embedding
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

In [21]:
train_text_list = train_data['text'].tolist()
dev_text_list = dev_data['text'].tolist()
test_text_list = test_data['text'].tolist()

train_text_embeddings = get_texts_embedding(train_text_list)
dev_text_embeddings = get_texts_embedding(dev_text_list)
test_text_embeddings = get_texts_embedding(test_text_list)

train_text_embeddings = torch.tensor(train_text_embeddings)
dev_text_embeddings = torch.tensor(dev_text_embeddings)
test_text_embeddings = torch.tensor(test_text_embeddings)

print(train_text_embeddings.shape)
print(dev_text_embeddings.shape)
print(test_text_embeddings.shape)

torch.Size([1168, 384])
torch.Size([146, 384])
torch.Size([147, 384])


In [None]:
model = LogisticRegressionModel(384)

train_y_pred = model(text_embeddings)
train_y_true = torch.tensor(y_train.values).float().unsqueeze(1)



dataset = TensorDataset(text_embeddings, y_true_train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

model = LogisticRegressionModel(384)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:

log_file_name = f"LogisticRegression-steam.log"
log_file_path = os.path.join('logs', log_file_name)
with open(log_file_path, 'w') as log_file:
# Training loop
    num_epochs = 3000
    for epoch in range(num_epochs):
        model.train()
        for batch in dataloader:
            inputs, labels = batch
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        
        # Calculate accuracy and F1 score for training set
        model.eval()
        with torch.no_grad():
            y_pred_train = model(text_embeddings)
            y_pred_train_class = (y_pred_train >= 0.5).float()
            acc_score_train = accuracy_score(y_true_train, y_pred_train_class)
            f1_score_train = f1_score(y_true_train, y_pred_train_class)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Train Accuracy: {acc_score_train:.4f}, Train F1 Score: {f1_score_train:.4f}')

        log_file.write(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Train Accuracy: {acc_score_train:.4f}, Train F1 Score: {f1_score_train:.4f}/n')

Epoch [1/3000], Loss: 0.6739, Train Accuracy: 0.6301, Train F1 Score: 0.0000
Epoch [2/3000], Loss: 0.6704, Train Accuracy: 0.6378, Train F1 Score: 0.0408
Epoch [3/3000], Loss: 0.6217, Train Accuracy: 0.6550, Train F1 Score: 0.1258
Epoch [4/3000], Loss: 0.7023, Train Accuracy: 0.6678, Train F1 Score: 0.1917
Epoch [5/3000], Loss: 0.5523, Train Accuracy: 0.6952, Train F1 Score: 0.3074
Epoch [6/3000], Loss: 0.4313, Train Accuracy: 0.7158, Train F1 Score: 0.3897
Epoch [7/3000], Loss: 0.5632, Train Accuracy: 0.7329, Train F1 Score: 0.4545
Epoch [8/3000], Loss: 0.5243, Train Accuracy: 0.7406, Train F1 Score: 0.4873
Epoch [9/3000], Loss: 0.4652, Train Accuracy: 0.7551, Train F1 Score: 0.5446
Epoch [10/3000], Loss: 0.4373, Train Accuracy: 0.7680, Train F1 Score: 0.5863
Epoch [11/3000], Loss: 0.6292, Train Accuracy: 0.7740, Train F1 Score: 0.6024
Epoch [12/3000], Loss: 0.4977, Train Accuracy: 0.7834, Train F1 Score: 0.6285
Epoch [13/3000], Loss: 0.5622, Train Accuracy: 0.7877, Train F1 Score: 0.

## 