## Dataload

In [1]:
from dataloader import dataloader_steam as steam_loader

train_data, dev_data, test_data = steam_loader.load_data()


print(train_data.head())
train_data.shape, dev_data.shape, test_data.shape

                                                text  label
0  Review: Nice coop game, Playtime: 19, Voted Up...      0
1  Review: It's actually pretty good i recommend ...      1
2  Review: This game is glitchy as sh!t. It's fun...      1
3  Review: This game is all I hoped for and more....      1
4  Review: Fun discoveries and challenge at every...      1


((1168, 2), (146, 2), (147, 2))

## MultinomialNB

In [19]:
from sklearn.pipeline import Pipeline
import os
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

### result

In [21]:
model = Pipeline([
    ('vectorizer',TfidfVectorizer()),  # Step 1: Text data transformation
    ('nb', MultinomialNB())  # Step 2: Classification using Naive Bayes
])

x_train = train_data['text']
y_train = train_data['label']

x_test = test_data['text']
y_test = test_data['label']


model.fit(x_train, y_train)

y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

roc_auc_train = roc_auc_score(y_train, y_pred_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print("\nTrain ROC AUC:", roc_auc_train)
print("Test ROC AUC:", roc_auc_test)
print("\nTrain Accuracy:", acc_score_train)
print("Test Accuracy:", acc_score_test)
print("\nTrain Classification Report:\n", classification_report(y_test, y_pred_test))


Train ROC AUC: 0.8908766103059581
Test ROC AUC: 0.7918956043956044

Train Accuracy: 0.916095890410959
Test Accuracy: 0.8231292517006803

Train Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.92      0.87        91
           1       0.84      0.66      0.74        56

    accuracy                           0.82       147
   macro avg       0.83      0.79      0.80       147
weighted avg       0.83      0.82      0.82       147



### save log

In [23]:
# Create logs directory if it doesn't exist
if not os.path.exists('logs'):
    os.makedirs('logs')

# Define the log file name
log_file_name = f"MultinomialNB-steam.log"
log_file_path = os.path.join('logs', log_file_name)

# Save the results to the log file
with open(log_file_path, 'w') as log_file:
    log_file.write(f"{datetime.now().strftime('%Y%m%d-%H%M%S')}\n")
    log_file.write(f"Train ROC AUC: {roc_auc_train}\n")
    log_file.write(f"Test ROC AUC: {roc_auc_test}\n")
    log_file.write(f"Train Accuracy: {acc_score_train}\n")
    log_file.write(f"Test Accuracy: {acc_score_test}\n")
    log_file.write(f"Train Classification Report:\n{classification_report(y_train, y_pred_train)}\n")
    log_file.write(f"Test Classification Report:\n{classification_report(y_test, y_pred_test)}\n")


## LinearMVC

In [24]:
model = Pipeline([
    ('vectorizer', TfidfVectorizer()), 
    ('LinearSVC', LinearSVC(random_state=42)) 
])

model.fit(x_train, y_train)

y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

roc_auc_train = roc_auc_score(y_train, y_pred_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print("\nTrain ROC AUC:", roc_auc_train)
print("Test ROC AUC:", roc_auc_test)
print("\nTrain Accuracy:", acc_score_train)
print("Test Accuracy:", acc_score_test)
print("\nTrain Classification Report:\n", classification_report(y_test, y_pred_test))


Train ROC AUC: 1.0
Test ROC AUC: 0.7671703296703296

Train Accuracy: 1.0
Test Accuracy: 0.7755102040816326

Train Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.82        91
           1       0.69      0.73      0.71        56

    accuracy                           0.78       147
   macro avg       0.76      0.77      0.76       147
weighted avg       0.78      0.78      0.78       147



In [25]:
# Create logs directory if it doesn't exist
if not os.path.exists('logs'):
    os.makedirs('logs')

# Define the log file name
log_file_name = f"LinearMVC-steam.log"
log_file_path = os.path.join('logs', log_file_name)

# Save the results to the log file
with open(log_file_path, 'w') as log_file:
    log_file.write(f"{datetime.now().strftime('%Y%m%d-%H%M%S')}\n")
    log_file.write(f"Train ROC AUC: {roc_auc_train}\n")
    log_file.write(f"Test ROC AUC: {roc_auc_test}\n")
    log_file.write(f"Train Accuracy: {acc_score_train}\n")
    log_file.write(f"Test Accuracy: {acc_score_test}\n")
    log_file.write(f"Train Classification Report:\n{classification_report(y_train, y_pred_train)}\n")
    log_file.write(f"Test Classification Report:\n{classification_report(y_test, y_pred_test)}\n")


## Logistic Regression

In [27]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [None]:
# Preprocess the data
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
hashingTF = HashingTF(inputCol="filtered_tokens", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Build the logistic regression model
lr = LogisticRegression(labelCol="label")

# Construct the pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

# Train the model
model = pipeline.fit(train_data)

# Predict on the whole dataset
pred_train = model.transform(train_data)

# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="label")








## 