In [45]:
import pandas as pd


In [46]:
steam = pd.read_csv("Steam_Reviews.csv")
steam

Unnamed: 0,Game,Review,Recommended,Genre
0,Company of Heroes 3,I still believe CoH2 is the superior choice cu...,False,"Action, Strategy"
1,Company of Heroes 3,I Pre-ordered the premium edition version of t...,False,"Action, Strategy"
2,Company of Heroes 3,"An underwhelming, expensive cash grab. Very lo...",False,"Action, Strategy"
3,Company of Heroes 3,Here is my actual review. COH3 has the best Qo...,True,"Action, Strategy"
4,Company of Heroes 3,They listed my GPU as supported and then quiet...,False,"Action, Strategy"
...,...,...,...,...
40644,Baldur's Gate 3,It's not just a game! It's a whole story that ...,True,"Adventure, RPG, Strategy"
40645,Baldur's Gate 3,It's the closest you're going to get to playin...,True,"Adventure, RPG, Strategy"
40646,Baldur's Gate 3,verry gooood gammeeee. I slept on it for to lo...,True,"Adventure, RPG, Strategy"
40647,Baldur's Gate 3,Funny wild magic sorceror Durge run made every...,True,"Adventure, RPG, Strategy"


In [47]:
print("There are", steam["Game"].nunique(), "games and a total of", steam.__len__(),"reviews")

There are 97 games and a total of 40649 reviews


# Text Pre Processing

In [None]:
import os
import re
import csv
import sys
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer
from nltk.corpus import stopwords
import spacy
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm

from scipy.stats import pearsonr

# Function to download NLTK resources
def download_nltk_resources():
    required_resources = ['wordnet', 'stopwords', 'punkt']
    for resource in required_resources:
        try:
            nltk.data.find(f'tokenizers/{resource}' if resource == 'punkt' else f'corpora/{resource}')
        except LookupError:
            nltk.download(resource)

download_nltk_resources()

# Function to install and load spaCy model
def install_spacy_model(model_name):
    try:
        return spacy.load(model_name)
    except OSError:
        print(f"Downloading spaCy model: {model_name}")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])
        return spacy.load(model_name)

sp = install_spacy_model('en_core_web_sm')

# Enable tqdm for pandas
tqdm.pandas()

# Initialize stemmers and lemmatizer
porter = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()
STOP_WORDS = set(stopwords.words('english'))


[nltk_data] Downloading package wordnet to /Users/gergoar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [49]:
"""
This module provides helper functions for text preprocessing. 
Each function applies punctuation removal and stopword removal, and then one of three options:
    0: Lowercasing only.
    1: Lowercasing plus stemming.
    2: Lemmatizing (using spaCy; original casing is preserved).

The functions return a string of tokens separated by spaces.
"""


def preprocess_lower(text):
    """
    Preprocess text by:
       - Converting to lowercase.
       - Removing punctuation.
       - Tokenizing.
       - Removing stopwords.
    
    Returns:
        str: A string of filtered tokens separated by spaces.
    """
    text_lower = text.lower()
    text_no_punct = re.sub(r'[^\w\s]', '', text_lower)
    tokens = word_tokenize(text_no_punct)
    filtered_tokens = [token for token in tokens if token not in STOP_WORDS]
    return " ".join(filtered_tokens)

def preprocess_stem(text):
    """
    Preprocess text by performing all steps in preprocess_lower() and then applying stemming.
    
    Returns:
        str: A string of stemmed tokens separated by spaces.
    """
    tokens = preprocess_lower(text).split()
    ps = PorterStemmer()
    stemmed_tokens = [ps.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)


# Function to remove emojis and special characters
def clean_text(text):
    text = re.sub(r'\[.*?\]', '', text)  # Remove HTML-like tags [b], [i], etc.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Keeps letters, numbers, and spaces
    return text

# Updated preprocess_lemma function
def preprocess_lemma(text):
    """
    Preprocess text by:
       - Removing special characters, emojis, and HTML tags.
       - Lemmatizing the text using spaCy.
       - Keeping only alphabetic words (removes numbers & symbols).
    
    Returns:
        str: A string of lemmatized words separated by spaces.
    """
    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")
    text = clean_text(text)  # Remove special characters
    doc = nlp(text)  # Process text with spaCy
    lemmatized_tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]  # Keep only words
    return " ".join(lemmatized_tokens)



def tokenize(text, mode=0):
    """
    General tokenize function. Always applies punctuation and stopword removal and then:
    
      mode = 0: Applies lowercasing.
      mode = 1: Applies lowercasing and stemming.
      mode = 2: Applies lemmatization (without lowercasing the original text).
    
    Args:
        text (str): The input text to be processed.
        mode (int): Processing mode (0 for lowercasing; 1 for stemming; 2 for lemmatizing).

    Returns:
        str: A string of processed tokens separated by spaces.

    Raises:
        ValueError: If an invalid mode is provided.
    """
    if mode == 0:
        return preprocess_lower(text)
    elif mode == 1:
        return preprocess_stem(text)
    elif mode == 2:
        return preprocess_lemma(text)
    else:
        raise ValueError("Invalid mode. Please use 0 for lowercasing, 1 for stemming, or 2 for lemmatizing.")

In [53]:
# Select a specific review that has special characters
test_review = steam.loc[3076, "Review"]  # Row index we want to test

print("Original Review:")
print(test_review)

# Apply the function to one review
processed_test_review = preprocess_lemma(test_review)

print("\nProcessed Review:")
print(processed_test_review)



Original Review:
🌀 Portal 2 – A Timeless Masterpiece of Puzzle and Wit 🎭
🔹 Developer: Valve
🔹 Genre: Puzzle-Platformer
🔹 Release Date: April 18, 2011
🔹 Platforms: PC, PS3, Xbox 360, macOS, Linux

✨ The Cake May Be a Lie, But the Brilliance is Real!
Portal 2 isn’t just a game—it’s an intellectual playground that challenges your mind, tickles your funny bone, and immerses you in a world where physics, storytelling, and humor collide in perfect harmony. Valve took the innovative mechanics of the original Portal and expanded them into a fully fleshed-out masterpiece that still stands as one of the greatest games ever made.

🧩 Gameplay – A Brain-Bending Work of Art
At its core, Portal 2 is a first-person puzzle-platformer where you use the iconic Portal Gun to place two linked portals on surfaces, allowing for mind-bending traversal and puzzle-solving. But Valve went beyond simple portals:

✅ New Mechanics – Exciting additions like Gels (speed, bounce, and portal-friendly surfaces), Light B

In [50]:
#Getting the clean text column
mod=2 #Lemmatizing

# Pre-process the text column with progress tracking
try:
    steam["cleaned_review"] = steam["Review"].progress_apply(lambda x: tokenize(str(x), mod))
    print("Done processing text.")
except Exception as e:
    print(f"Error processing text column: {e}")
    sys.exit(1)


 34%|███▎      | 13705/40649 [2:01:31<1:48:36,  4.13it/s]    

In [40]:
steam

Unnamed: 0,Game,Review,Recommended,Genre,cleaned_review
0,Company of Heroes 3,I still believe CoH2 is the superior choice cu...,False,"Action, Strategy",believe CoH2 superior choice currently try get...
1,Company of Heroes 3,I Pre-ordered the premium edition version of t...,False,"Action, Strategy",Pre order premium edition version game put hun...
2,Company of Heroes 3,"An underwhelming, expensive cash grab. Very lo...",False,"Action, Strategy",underwhelming expensive cash grab low content ...
3,Company of Heroes 3,Here is my actual review. COH3 has the best Qo...,True,"Action, Strategy",actual review COH3 good qol franchise stop pla...
4,Company of Heroes 3,They listed my GPU as supported and then quiet...,False,"Action, Strategy",list GPU support quietly patch support year re...
...,...,...,...,...,...
40644,Baldur's Gate 3,It's not just a game! It's a whole story that ...,True,"Adventure, RPG, Strategy",game story completely immerse primarily thank ...
40645,Baldur's Gate 3,It's the closest you're going to get to playin...,True,"Adventure, RPG, Strategy",close go play DnD group friend willing sit pla...
40646,Baldur's Gate 3,verry gooood gammeeee. I slept on it for to lo...,True,"Adventure, RPG, Strategy",verry gooood gammeeee sleep long finish ton co...
40647,Baldur's Gate 3,Funny wild magic sorceror Durge run made every...,True,"Adventure, RPG, Strategy",funny wild magic sorceror Durge run turn cat t...


In [41]:
#Separating positive and negative reviews
steam_positive= steam[steam["Recommended"]== True]
steam_negative= steam[steam["Recommended"]== False]

# Aggregate liked and disliked reviews per game
game_reviews_positive = steam_positive.groupby("Game")["cleaned_review"].apply(lambda x: " ".join(x)).reset_index()
game_reviews_negative = steam_negative.groupby("Game")["cleaned_review"].apply(lambda x: " ".join(x)).reset_index()

# Rename columns
game_reviews_positive.columns = ["Game", "Positive_Reviews"]
game_reviews_negative.columns = ["Game", "Negative_Reviews"]

# Merge both into a single DataFrame
game_reviews = pd.merge(game_reviews_positive, game_reviews_negative, on="Game", how="outer")

# Fill NaN values with empty strings (some games might not have both positive & negative reviews)
game_reviews = game_reviews.fillna("")


In [42]:
game_reviews

Unnamed: 0,Game,Positive_Reviews,Negative_Reviews
0,0 Day,love game wait devs love game wait devs love g...,
1,100% Orange Juice,want mario party combat mechanic core audience...,play Chutes Ladders instead lot short annoying...
2,ARK: Survival Ascended,7.5/10 year brother buy ARK confusion play lea...,Wildcard game run Giant patch require 2x game ...
3,ARK: Survival Evolved,game shed tear beauty countless year memory pl...,game make want die play different device year ...
4,Age of Empires II: Definitive Edition,play 2000 hour definite edition come write rec...,game play internet connection single player mo...
...,...,...,...
92,Valheim,perfect perfect minute detail honestly grindy ...,game great mistland rock get 0 vision betcha m...
93,"Warhammer 40,000: Space Marine 2",play complete co op + campaign fully support c...,wonderful game worthy successor Space Marine 1...
94,Windblown,person enjoy roguelike personally game fun com...,game okay time review content replay value 25 ...
95,X-Morph: Defense,play co op great 2 player tower defense shoot ...,Game launch screen set main and/or default scr...
