In [1]:
import pandas as pd


In [3]:
steam = pd.read_csv("Steam_Reviews.csv")
steam

Unnamed: 0,Game,Review,Recommended,Genre
0,Company of Heroes 3,I still believe CoH2 is the superior choice cu...,False,"Action, Strategy"
1,Company of Heroes 3,I Pre-ordered the premium edition version of t...,False,"Action, Strategy"
2,Company of Heroes 3,"An underwhelming, expensive cash grab. Very lo...",False,"Action, Strategy"
3,Company of Heroes 3,Here is my actual review. COH3 has the best Qo...,True,"Action, Strategy"
4,Company of Heroes 3,They listed my GPU as supported and then quiet...,False,"Action, Strategy"
...,...,...,...,...
40644,Baldur's Gate 3,It's not just a game! It's a whole story that ...,True,"Adventure, RPG, Strategy"
40645,Baldur's Gate 3,It's the closest you're going to get to playin...,True,"Adventure, RPG, Strategy"
40646,Baldur's Gate 3,verry gooood gammeeee. I slept on it for to lo...,True,"Adventure, RPG, Strategy"
40647,Baldur's Gate 3,Funny wild magic sorceror Durge run made every...,True,"Adventure, RPG, Strategy"


In [4]:
print("There are", steam["Game"].nunique(), "games and a total of", steam.__len__(),"reviews")

There are 97 games and a total of 40649 reviews


# Text Pre Processing

In [5]:
import os
import re
import csv
import sys
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer
from nltk.corpus import stopwords
import spacy
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm

from scipy.stats import pearsonr

# Function to download NLTK resources
def download_nltk_resources():
    required_resources = ['wordnet', 'stopwords', 'punkt']
    for resource in required_resources:
        try:
            nltk.data.find(f'tokenizers/{resource}' if resource == 'punkt' else f'corpora/{resource}')
        except LookupError:
            nltk.download(resource)

download_nltk_resources()

# Function to install and load spaCy model
def install_spacy_model(model_name):
    try:
        return spacy.load(model_name)
    except OSError:
        print(f"Downloading spaCy model: {model_name}")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])
        return spacy.load(model_name)

sp = install_spacy_model('en_core_web_sm')

# Enable tqdm for pandas
tqdm.pandas()

# Initialize stemmers and lemmatizer
porter = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()
STOP_WORDS = set(stopwords.words('english'))


[nltk_data] Downloading package wordnet to /Users/gergoar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


 Module for text preprocessing

In [6]:
def preprocess_lower(text):
    """
    Preprocess text by:
       - Converting to lowercase.
       - Removing punctuation.
       - Tokenizing.
       - Removing stopwords.
    
    Returns:
        str: A string of filtered tokens separated by spaces.
    """
    text_lower = text.lower()
    text_no_punct = re.sub(r'[^\w\s]', '', text_lower)
    tokens = word_tokenize(text_no_punct)
    filtered_tokens = [token for token in tokens if token not in STOP_WORDS]
    return " ".join(filtered_tokens)

def preprocess_stem(text):
    """
    Preprocess text by performing all steps in preprocess_lower() and then applying stemming.
    
    Returns:
        str: A string of stemmed tokens separated by spaces.
    """
    tokens = preprocess_lower(text).split()
    ps = PorterStemmer()
    stemmed_tokens = [ps.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)


# Function to remove emojis and special characters
def clean_text(text):
    text = re.sub(r'\[.*?\]', '', text)  # Remove HTML-like tags [b], [i], etc.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Keeps letters, numbers, and spaces
    return text

def preprocess_lemma(text):
    """
    Preprocess text by:
       - Removing punctuation and stopwords using spaCy's token attributes.
       - Lemmatizing the text.
       - (Note: This function does NOT lowercase the text.)
    
    Returns:
        str: A string of lemmatized tokens separated by spaces.
    """
    doc = sp(text)
    lemmatized_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.lemma_.strip() != '']
    return " ".join(lemmatized_tokens)



def tokenize(text, mode=0):
    """
    General tokenize function. Always applies punctuation and stopword removal and then:
    
      mode = 0: Applies lowercasing.
      mode = 1: Applies lowercasing and stemming.
      mode = 2: Applies lemmatization (without lowercasing the original text).
    
    Args:
        text (str): The input text to be processed.
        mode (int): Processing mode (0 for lowercasing; 1 for stemming; 2 for lemmatizing).

    Returns:
        str: A string of processed tokens separated by spaces.

    Raises:
        ValueError: If an invalid mode is provided.
    """
    if mode == 0:
        return preprocess_lower(text)
    elif mode == 1:
        return preprocess_stem(text)
    elif mode == 2:
        return preprocess_lemma(text)
    else:
        raise ValueError("Invalid mode. Please use 0 for lowercasing, 1 for stemming, or 2 for lemmatizing.")

In [8]:
#Getting the clean text column
mod=2 #Lemmatizing

# Pre-process the text column with progress tracking
try:
    steam["cleaned_review"] = steam["Review"].progress_apply(lambda x: tokenize(str(x), mod))
    print("Done processing text.")
except Exception as e:
    print(f"Error processing text column: {e}")
    sys.exit(1)


100%|██████████| 40649/40649 [07:28<00:00, 90.62it/s] 

Done processing text.





Now that we have the lemmatized data we will split it into positive and negative reviews.

In [61]:
#Separating positive and negative reviews
steam_positive= steam[steam["Recommended"]== True]
steam_negative= steam[steam["Recommended"]== False]

# Aggregate liked and disliked reviews per game
game_reviews_positive = steam_positive.groupby("Game")["cleaned_review"].apply(lambda x: " ".join(x)).reset_index()
game_reviews_negative = steam_negative.groupby("Game")["cleaned_review"].apply(lambda x: " ".join(x)).reset_index()

# Rename columns
game_reviews_positive.columns = ["Game", "Positive_Reviews"]
game_reviews_negative.columns = ["Game", "Negative_Reviews"]

# Merge both into a single DataFrame
game_reviews = pd.merge(game_reviews_positive, game_reviews_negative, on="Game", how="outer")

# Fill NaN values with empty strings (some games might not have both positive & negative reviews)
game_reviews = game_reviews.fillna("")

In [62]:
# Drop row with index 14 because its in another language
game_reviews = game_reviews.drop(index=14)

# Reset index after dropping the row
game_reviews = game_reviews.reset_index(drop=True)

print("Row 14 has been removed successfully!")

# Drop row with index 0 because the review is empty
game_reviews = game_reviews.drop(index=0)

# Reset index after dropping the row
game_reviews = game_reviews.reset_index(drop=True)

print("Row 0 has been removed successfully!")



Row 14 has been removed successfully!
Row 0 has been removed successfully!


In [63]:
game_reviews

Unnamed: 0,Game,Positive_Reviews,Negative_Reviews
0,100% Orange Juice,want mario party combat mechanic core audience...,play Chutes Ladders instead lot short annoying...
1,ARK: Survival Ascended,7.5/10 year brother buy ARK confusion play lea...,Wildcard game run Giant patch require 2x game ...
2,ARK: Survival Evolved,game shed tear beauty countless year memory pl...,game make want die play different device year ...
3,Age of Empires II: Definitive Edition,play 2000 hour definite edition come write rec...,game play internet connection single player mo...
4,Age of Empires IV: Anniversary Edition,age Empires IV fantastic blend nostalgia moder...,fun way fail capture magic Age Empires II rock...
...,...,...,...
90,Valheim,perfect perfect minute detail honestly grindy ...,game great mistland rock get 0 vision betcha m...
91,"Warhammer 40,000: Space Marine 2",play complete co op + campaign fully support c...,wonderful game worthy successor Space Marine 1...
92,Windblown,person enjoy roguelike personally game fun com...,game okay time review content replay value 25 ...
93,X-Morph: Defense,play co op great 2 player tower defense shoot ...,Game launch screen set main and/or default scr...


As we realized some of the reviews have emojis and special characters that have not being cleaned, we will further process this more by taking them away.

In [64]:
# Function to remove emojis and special characters
def clean_text(text):
    text = re.sub(r'\[.*?\]', '', text)  # Remove HTML-like tags [b], [i], etc.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Keeps letters, numbers, and spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (keep words and spaces)
    text = re.sub(r'\n+', ' ', text)  # Replace multiple newlines (\n\n, \n\n\n) with a space
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space

    return text

In [65]:
# Apply cleaning function to both Positive and Negative reviews
game_reviews["Positive_Reviews"] = game_reviews["Positive_Reviews"].astype(str).apply(clean_text)
game_reviews["Negative_Reviews"] = game_reviews["Negative_Reviews"].astype(str).apply(clean_text)

print("Done cleaning all reviews!")


Done cleaning all reviews!


We will now add the genres to our data set to have it complete

In [67]:
# Keep only unique games and their genres from the steam dataset
unique_genres = steam[["Game", "Genre"]].drop_duplicates()

# Merge with game_reviews without creating duplicates
game_reviews = game_reviews.merge(unique_genres, on="Game", how="left")

print("Genres added")

game_reviews


Genres added


Unnamed: 0,Game,Positive_Reviews,Negative_Reviews,Genre
0,100% Orange Juice,want mario party combat mechanic core audience...,play Chutes Ladders instead lot short annoying...,"Indie, Strategy"
1,ARK: Survival Ascended,7510 year brother buy ARK confusion play lead ...,Wildcard game run Giant patch require 2x game ...,"Action, Adventure, Indie, Massively Multiplaye..."
2,ARK: Survival Evolved,game shed tear beauty countless year memory pl...,game make want die play different device year ...,"Action, Adventure, Indie, Massively Multiplaye..."
3,Age of Empires II: Definitive Edition,play 2000 hour definite edition come write rec...,game play internet connection single player mo...,Strategy
4,Age of Empires IV: Anniversary Edition,age Empires IV fantastic blend nostalgia moder...,fun way fail capture magic Age Empires II rock...,Strategy
...,...,...,...,...
90,Valheim,perfect perfect minute detail honestly grindy ...,game great mistland rock get 0 vision betcha m...,"Action, Adventure, Indie, RPG, Early Access"
91,"Warhammer 40,000: Space Marine 2",play complete co op campaign fully support co ...,wonderful game worthy successor Space Marine 1...,"Action, Adventure, RPG"
92,Windblown,person enjoy roguelike personally game fun com...,game okay time review content replay value 25 ...,"Action, Indie, Early Access"
93,X-Morph: Defense,play co op great 2 player tower defense shoot ...,Game launch screen set main andor default scre...,"Action, Indie, Strategy"


We now perform One Hot Encoding for using the genres in our variables.

In [68]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split the genres into separate values
game_reviews['Genre'] = game_reviews['Genre'].apply(lambda x: x.split(', '))

# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Perform one-hot encoding
genre_encoded = mlb.fit_transform(game_reviews['Genre'])

# Create a DataFrame with the one-hot encoded genres
genre_encoded_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

# Concatenate the original DataFrame with the one-hot encoded genres
game_reviews = pd.concat([game_reviews, genre_encoded_df], axis=1)

print("One-hot encoding completed!")

One-hot encoding completed!


In [60]:
game_reviews

Unnamed: 0,Game,Positive_Reviews,Negative_Reviews,Genre,Action,Adventure,Casual,Early Access,Education,Free To Play,Indie,Massively Multiplayer,RPG,Racing,Simulation,Sports,Strategy,Unknown Genre,Utilities
0,100% Orange Juice,want mario party combat mechanic core audience...,play Chutes Ladders instead lot short annoying...,"[Indie, Strategy]",0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,ARK: Survival Ascended,7510 year brother buy ARK confusion play lead ...,Wildcard game run Giant patch require 2x game ...,"[Action, Adventure, Indie, Massively Multiplay...",1,1,0,1,0,0,1,1,1,0,0,0,0,0,0
2,ARK: Survival Evolved,game shed tear beauty countless year memory pl...,game make want die play different device year ...,"[Action, Adventure, Indie, Massively Multiplay...",1,1,0,0,0,0,1,1,1,0,0,0,0,0,0
3,Age of Empires II: Definitive Edition,play 2000 hour definite edition come write rec...,game play internet connection single player mo...,[Strategy],0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Age of Empires IV: Anniversary Edition,age Empires IV fantastic blend nostalgia moder...,fun way fail capture magic Age Empires II rock...,[Strategy],0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,Valheim,perfect perfect minute detail honestly grindy ...,game great mistland rock get 0 vision betcha m...,"[Action, Adventure, Indie, RPG, Early Access]",1,1,0,1,0,0,1,0,1,0,0,0,0,0,0
91,"Warhammer 40,000: Space Marine 2",play complete co op campaign fully support co ...,wonderful game worthy successor Space Marine 1...,"[Action, Adventure, RPG]",1,1,0,0,0,0,0,0,1,0,0,0,0,0,0
92,Windblown,person enjoy roguelike personally game fun com...,game okay time review content replay value 25 ...,"[Action, Indie, Early Access]",1,0,0,1,0,0,1,0,0,0,0,0,0,0,0
93,X-Morph: Defense,play co op great 2 player tower defense shoot ...,Game launch screen set main andor default scre...,"[Action, Indie, Strategy]",1,0,0,0,0,0,1,0,0,0,0,0,1,0,0


In [69]:
game_reviews.to_csv("game_reviews.csv", index=False)