In [27]:
!pip install gensim #popular, open-source Python library for natural language processing (NLP)



In [28]:
from gensim.models import Word2Vec #Imports the Word2Vec class to train word embedding models.
import numpy as np
import re

In [29]:
from google.colab import files

uploaded = files.upload()  # Opens a file chooser

Saving steam_small.csv to steam_small (2).csv


In [30]:
# ----------------------
# Dataset
# ----------------------

import pandas as pd

# Load your Steam dataset
df = pd.read_csv("steam_small.csv")

# Remove neutral scores
df = df[df["review_score"].isin([1, 2, 4, 5])]

# Drop empty or NaN review_text
df = df.dropna(subset=["review_text"])
df = df[df["review_text"].str.strip() != ""]

# Create binary sentiment
df["sentiment"] = df["review_score"].apply(lambda x: 1 if x >= 4 else 0)

# Extract texts and labels
texts = df["review_text"].astype(str).tolist()
labels = df["sentiment"].tolist()

In [31]:
# ----------------------
# Preprocessing
# ----------------------
def tokenize(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return text.split()

#Converts the input text to a string to avoid errors if itâ€™s not a string.
#Converts all characters to lowercase to ensure uniformity ("Good" and "good" are treated the same).
#Uses re.sub(r"[^a-z\s]", "", text) to remove anything that is not a lowercase letter or space. This removes punctuation, numbers, and special characters.
#Splits the cleaned text into a list of words (tokens)

In [33]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)

#Splits the dataset df into training (80%) and testing (20%) sets.
#random_state=42 ensures reproducibility. Same split occurs each run.

In [34]:
corpus = [tokenize(t) for t in train_df["review_text"]]
print(corpus[:5])  # preview first 5 tokenized reviews

#Applies the tokenize function to every review in the training set.
#Creates a list of tokenized reviews (corpus) where each element is a list of words.
#print(corpus[:5]) shows the first 5 tokenized reviews as a quick check.

[['action', 'packed', 'but', 'the', 'combat', 'is', 'very', 'simple', 'just', 'alot', 'of', 'mass', 'clicking', 'very', 'good', 'story', 'and', 'cool', 'weaponsarmors'], ['never', 'get', 'tired', 'of', 'it'], ['cs', 'is', 'good'], ['cant', 'buy', 'skins', 'cases', 'keys', 'stickers', 'gaben', 'cant', 'steal', 'our', 'money', 'would', 'play', 'again'], ['this', 'game', 'is', 'actully', 'pretty', 'good', 'its', 'has', 'an', 'old', 'arcade', 'feeling', 'the', 'gameplay', 'is', 'good', 'the', 'soundtrack', 'for', 'this', 'is', 'awesome', 'and', 'there', 'an', 'zombie', 'mode', 'even', 'tho', 'its', 'doesnt', 'have', 'the', 'best', 'of', 'graphics', 'but', 'its', 'a', 'still', 'pretty', 'good', 'game', 'to', 'enjoy', 'i', 'really', 'recommend', 'this', 'game', 'for', 'people', 'who', 'grew', 'up', 'into', 'arcades']]


In [35]:
# ----------------------
# Train Word2Vec
# ----------------------
model = Word2Vec(corpus, vector_size=50, window=4, min_count=1, sg=1)

#	Trains a Word2Vec model on the tokenized reviews.
# Parameters:
# vector_size=50: Each word will be represented as a 50-dimensional vector.
# window=4: Context window size of 4 words on each side for training word embeddings.
# min_count=1: Include all words appearing at least once (no word is ignored).
# sg=1: Uses skip-gram model (predicts surrounding words from a target word), better for smaller datasets.

In [36]:
# ----------------------
# Sentence vector
# ----------------------
def sentence_vector(tokens):
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)

#Converts a list of tokens into a single vector representing the sentence.
#Looks up the vector for each word in the trained Word2Vec model.
#If no words are in the model (empty list), returns a zero vector.
#Otherwise, computes the mean of all word vectors to represent the entire sentence.

In [37]:
# X = np.array([sentence_vector(tokenize(text)) for text in texts])
# y = np.array(labels)

def build_xy(dataframe):
    X = []
    y = []

    for _, row in dataframe.iterrows():
        tokens = tokenize(row["review_text"])
        vec = sentence_vector(tokens)

        if vec is not None:
            X.append(vec)
            y.append(row["sentiment"])

    return np.array(X), np.array(y)

X_train, y_train = build_xy(train_df)
X_test, y_test = build_xy(test_df)

#Iterates over each row in the dataframe.
#Tokenizes the review text.
#Converts it into a sentence vector.
#Appends the vector to X and the corresponding label (sentiment) to y.
#Returns X and y as NumPy arrays for machine learning.

In [38]:
# ----------------------
# Simple sentiment prototypes
# ----------------------
pos_vec = np.mean(X_train[y_train == 1], axis=0)
neg_vec = np.mean(X_train[y_train == 0], axis=0)

#Calculates average sentence vector for positive (y=1) and negative (y=0) reviews.
#These act as prototypes for each class.

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


In [39]:
def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

#Computes cosine similarity between two vectors a and b.
#Measures how similar a vector is to another in direction.
#Output is between -1 (opposite) and 1 (identical).

def predict_vector(v):
    return 1 if cosine(v, pos_vec) > cosine(v, neg_vec) else 0

#Compares similarity of a sentence vector v to positive and negative prototypes.
#If closer to positive, predicts 1, else predicts 0.

In [40]:
# ----------------------
# Try it
# ----------------------
predictions = []

for v in X_test:
    predictions.append(predict_vector(v))

#Loops over all test sentence vectors.
#Predicts sentiment for each vector using the prototype comparison.

  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [41]:
# ----------------------
# Accuracy
# ----------------------
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy * 100, "%")

#Compares predicted labels with true labels.
#accuracy_score computes the proportion of correct predictions.
#Prints the accuracy as a percentage.

Accuracy: 100.0 %
