In [2]:
import steamreviews
import pandas as pd

request_params = dict()
# Reference: https://partner.steamgames.com/doc/store/localization#supported_languages
request_params['language'] = 'english'
request_params['filter'] = 'recent'
request_params['day_range'] = '28'
# Reference: https://partner.steamgames.com/doc/store/getreviews

appids = pd.read_csv('idlist.txt')

In [2]:
steamreviews.download_reviews_for_app_id_batch(chosen_request_params=request_params)

Loading idlist.txt
Loading idprocessed_on_20250131.txt
Skipping previously found appID = None
Skipping previously found appID = 10
Skipping previously found appID = 20
Skipping previously found appID = 40
Skipping previously found appID = 50
Skipping previously found appID = 60
Skipping previously found appID = 70
Skipping previously found appID = 80
Skipping previously found appID = 130
Skipping previously found appID = 220
Skipping previously found appID = 240
Skipping previously found appID = 280
Skipping previously found appID = 300
Skipping previously found appID = 320
Skipping previously found appID = 340
Skipping previously found appID = 360
Skipping previously found appID = 380
Skipping previously found appID = 400
Skipping previously found appID = 420
Skipping previously found appID = 440
Skipping previously found appID = 500
Skipping previously found appID = 550
Skipping previously found appID = 570
Skipping previously found appID = 620
Skipping previously found appID = 630
S

True

In [3]:
import pandas as pd
import json
import os
from tqdm import tqdm  # For progress bar

def process_json_files(data_dir="Data", output_file="all_reviews.csv"):
    # Initialize empty list to store DataFrames
    all_dfs = []
    
    # Get list of JSON files
    files = [f for f in os.listdir(data_dir) if f.startswith("review_") and f.endswith(".json")]
    
    for filename in tqdm(files, desc="Processing JSON files"):
        try:
            # Extract game ID from filename
            game_id = filename.split("_")[1].split(".")[0]
            
            # Load JSON data
            with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Skip files without reviews
            if 'reviews' not in data or not data['reviews']:
                continue
                
            # Convert reviews to DataFrame
            reviews_list = list(data['reviews'].values())
            df = pd.json_normalize(reviews_list, sep='_')
            
            # Add game ID column
            df['game_id'] = game_id
            
            # Append to list
            all_dfs.append(df)
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue
    
    # Combine all DataFrames
    combined_df = pd.concat(all_dfs, ignore_index=True)
    
    # Convert numeric columns
    numeric_cols = [
        'author_num_games_owned', 'author_num_reviews',
        'author_playtime_forever', 'author_playtime_last_two_weeks',
        'author_playtime_at_review', 'votes_up', 'votes_funny',
        'weighted_vote_score', 'comment_count'
    ]
    combined_df[numeric_cols] = combined_df[numeric_cols].apply(pd.to_numeric, errors='coerce')
    
    # Convert boolean columns
    bool_cols = [
        'voted_up', 'steam_purchase', 'received_for_free',
        'written_during_early_access', 'primarily_steam_deck'
    ]
    combined_df[bool_cols] = combined_df[bool_cols].astype(bool)
    
    # Convert timestamp columns
    time_cols = ['timestamp_created', 'timestamp_updated', 'author_last_played']
    for col in time_cols:
        combined_df[col] = pd.to_datetime(combined_df[col], unit='s', errors='coerce')
    
    # Save to CSV
    combined_df.to_csv(output_file, index=False)
    print(f"Saved {len(combined_df)} reviews to {output_file}")

if __name__ == "__main__":
    process_json_files()

Processing JSON files: 100%|██████████| 1005/1005 [00:19<00:00, 52.37it/s]


Saved 674334 reviews to all_reviews.csv


In [1]:
import pandas as pd
import numpy as np
import contractions

In [2]:
df = pd.read_csv("all_reviews.csv")
df.head()

  df = pd.read_csv("all_reviews.csv")


Unnamed: 0,recommendationid,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,...,author_num_games_owned,author_num_reviews,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,author_deck_playtime_at_review,game_id,timestamp_dev_responded,developer_response
0,186822035,russian,WWW\r\n,2025-01-31 13:33:57,2025-01-31 13:33:57,True,0,0,0.0,0,...,0,3,550,550,550.0,2025-01-31 13:33:49,,10,,
1,186820701,english,ts crazy af so worth it,2025-01-31 13:12:02,2025-01-31 13:12:02,True,0,0,0.0,0,...,47,17,112,112,112.0,2025-01-31 13:03:28,,10,,
2,186820524,english,best game ever,2025-01-31 13:09:11,2025-01-31 13:09:11,True,0,0,0.0,0,...,0,1,2730,985,2707.0,2025-01-31 14:09:25,,10,,
3,186818112,russian,хуйня для динозавров заходишь на сервер а там ...,2025-01-31 12:28:09,2025-01-31 12:28:09,False,0,0,0.0,0,...,28,6,23,0,23.0,2023-04-21 12:16:58,,10,,
4,186818009,english,A Classic LAN-game. Childhood memories.\r\nNow...,2025-01-31 12:26:07,2025-01-31 12:26:07,True,0,0,0.0,0,...,0,16,6683,0,6683.0,2024-10-18 21:37:25,,10,,


In [3]:
df_en = df[df['language'] == 'english']
df_en.shape

(468002, 25)

In [4]:
df['timestamp_created'].astype('datetime64[ns]')

0        2025-01-31 13:33:57
1        2025-01-31 13:12:02
2        2025-01-31 13:09:11
3        2025-01-31 12:28:09
4        2025-01-31 12:26:07
                 ...        
674329   2025-01-07 04:48:54
674330   2025-01-05 22:12:35
674331   2025-01-05 21:10:16
674332   2025-01-05 18:13:17
674333   2025-01-04 20:31:57
Name: timestamp_created, Length: 674334, dtype: datetime64[ns]

In [5]:
df_en = df_en[df_en['timestamp_created'].astype('datetime64[ns]') > pd.to_datetime('2025-01-01')]
df_en.shape

(385423, 25)

In [6]:
df_en.rename(columns={'recommendationid': 'id',
                      'voted_up': 'score',
                      'review': 'content',
                      'author_steamid': 'author_id',
                      'game_id': 'app_id'}, inplace=True)
df_en['score'] = df_en['score'].astype(int)
df_en

Unnamed: 0,id,language,content,timestamp_created,timestamp_updated,score,votes_up,votes_funny,weighted_vote_score,comment_count,...,author_num_games_owned,author_num_reviews,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,author_deck_playtime_at_review,app_id,timestamp_dev_responded,developer_response
1,186820701,english,ts crazy af so worth it,2025-01-31 13:12:02,2025-01-31 13:12:02,1,0,0,0.000000,0,...,47,17,112,112,112.0,2025-01-31 13:03:28,,10,,
2,186820524,english,best game ever,2025-01-31 13:09:11,2025-01-31 13:09:11,1,0,0,0.000000,0,...,0,1,2730,985,2707.0,2025-01-31 14:09:25,,10,,
4,186818009,english,A Classic LAN-game. Childhood memories.\r\nNow...,2025-01-31 12:26:07,2025-01-31 12:26:07,1,0,0,0.000000,0,...,0,16,6683,0,6683.0,2024-10-18 21:37:25,,10,,
5,186816922,english,Classic,2025-01-31 12:06:19,2025-01-31 12:06:19,1,0,0,0.000000,0,...,0,4,3206,98,3206.0,2025-01-31 12:06:06,,10,,
11,186799829,english,goat,2025-01-31 05:52:42,2025-01-31 05:52:42,1,0,0,0.000000,0,...,0,4,2999,22,2976.0,2025-01-31 06:18:01,,10,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674329,184991220,english,Gone but not forgotten...,2025-01-07 04:48:54,2025-01-07 04:48:54,1,0,0,0.000000,0,...,0,20,1646,0,1646.0,2025-01-07 04:48:13,,99900,,
674330,184877197,english,"pretty good game tbh, even tho i only played f...",2025-01-05 22:12:35,2025-01-05 22:12:35,1,0,0,0.000000,0,...,0,2,67,0,67.0,2025-01-05 22:09:57,,99900,,
674331,184872577,english,"piece of shit game, can't even be bothered to ...",2025-01-05 21:10:16,2025-01-05 21:10:16,0,4,1,0.445714,0,...,307,6,75,0,75.0,2025-01-05 21:09:34,,99900,,
674332,184857864,english,controls are weird,2025-01-05 18:13:17,2025-01-05 18:13:17,0,1,0,0.000000,0,...,0,1,9,0,9.0,2025-01-05 18:12:41,,99900,,


In [7]:
validation_set = df_en[['id','app_id', 'author_id', 'content', 'score']]
validation_set['content'].astype(str)
validation_set['content'].dropna()
validation_set = validation_set[validation_set['content'].str.len() >= 10]
validation_set

Unnamed: 0,id,app_id,author_id,content,score
1,186820701,10,76561198861397461,ts crazy af so worth it,1
2,186820524,10,76561197991630431,best game ever,1
4,186818009,10,76561197993740346,A Classic LAN-game. Childhood memories.\r\nNow...,1
13,186789771,10,76561199508598629,The most friendly community i have ever seen. ...,1
15,186782787,10,76561199675346350,The best version of cs to this day. Nothing wi...,1
...,...,...,...,...,...
674329,184991220,99900,76561198355736866,Gone but not forgotten...,1
674330,184877197,99900,76561199057223759,"pretty good game tbh, even tho i only played f...",1
674331,184872577,99900,76561198122368726,"piece of shit game, can't even be bothered to ...",0
674332,184857864,99900,76561199757360530,controls are weird,0


In [8]:
from langdetect import detect, DetectorFactory, LangDetectException

# Ensuring consistent results from langdetect
DetectorFactory.seed = 42

# Define a function to detect English comments
def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False  # Handle empty or unreadable text

# Apply the language detection
validation_set['is_english'] = validation_set['content'].apply(is_english)

KeyboardInterrupt: 

In [10]:
validation_set

Unnamed: 0,id,app_id,author_id,content,score,is_english
1,186820701,10,76561198861397461,ts crazy af so worth it,1,True
2,186820524,10,76561197991630431,best game ever,1,False
4,186818009,10,76561197993740346,A Classic LAN-game. Childhood memories.\r\nNow...,1,True
13,186789771,10,76561199508598629,The most friendly community i have ever seen. ...,1,True
15,186782787,10,76561199675346350,The best version of cs to this day. Nothing wi...,1,True
...,...,...,...,...,...,...
674329,184991220,99900,76561198355736866,Gone but not forgotten...,1,False
674330,184877197,99900,76561199057223759,"pretty good game tbh, even tho i only played f...",1,True
674331,184872577,99900,76561198122368726,"piece of shit game, can't even be bothered to ...",0,True
674332,184857864,99900,76561199757360530,controls are weird,0,True


In [None]:
validation_set = validation_set[validation_set['is_english'] == True].drop(columns=['is_english'])
validation_set.shape

In [10]:
validation_set = pd.read_csv('preprocessed_validationset.csv')

In [11]:
# Text processing libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from langdetect import detect, DetectorFactory, LangDetectException
nltk.download('wordnet')   #used in first run to download packages
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def PreProcessing_tokens(text):
    # Lowercasing text
    text = text.lower()
    # Removing numbers and symbols 
    text = re.sub(r'[^a-z\s]', '', text)
    # Using the contractions library to convert contracted text forms to their expanded version
    text = contractions.fix(text)
    # Tokenizing the text by words
    tokens = word_tokenize(text)
    # Applying lemmatization to the words not present in the stopwords set 
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Returning the joined preprocessed tokens
    return tokens

def PreProcessing_doc(text):
    tokens = PreProcessing_tokens(text)
    return ' '.join(tokens)

validation_set['cleaned_content'] = validation_set['content'].apply(PreProcessing_doc)
validation_set['tokens'] = validation_set['content'].apply(PreProcessing_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vnvtr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
validation_set = validation_set[validation_set['cleaned_content'] != '']
validation_set.to_csv('preprocessed_validationset.csv', index=False)

In [3]:
import kagglehub
import pandas as pd 
# Reading the dataset from the original source:
path = kagglehub.dataset_download("filipkin/steam-reviews")
# Printing dataset file path
print("Path to dataset files:", path)

games = pd.read_csv(f'{path}/output_steamspy.csv')
games

Path to dataset files: C:\Users\vnvtr\.cache\kagglehub\datasets\filipkin\steam-reviews\versions\6


Unnamed: 0,appid,name,owners
0,10,Counter-Strike,"10,000,000 .. 20,000,000"
1,20,Team Fortress Classic,"5,000,000 .. 10,000,000"
2,40,Deathmatch Classic,"5,000,000 .. 10,000,000"
3,50,Half-Life: Opposing Force,"2,000,000 .. 5,000,000"
4,60,Ricochet,"5,000,000 .. 10,000,000"
...,...,...,...
995,2835570,Buckshot Roulette,"1,000,000 .. 2,000,000"
996,2881650,Content Warning,"5,000,000 .. 10,000,000"
997,3070070,TCG Card Shop Simulator,"1,000,000 .. 2,000,000"
998,3097560,Liar's Bar,"2,000,000 .. 5,000,000"


In [4]:
# Calculating average game owners from the owners range column
def calculate_average_owners(range_str):
    try:
        # Split the range at ".." and strip commas while converting to int
        low, high = range_str.split('..')
        low = int(low.replace(',', '').strip())
        high = int(high.replace(',', '').strip())
        return (low + high) / 2  # Compute the average
    except Exception as e:
        print(f"Error processing range: {range_str} -> {e}")
        return None  # Return None for invalid entries


# Apply the function to the 'owners' column
games['average_owners'] = games['owners'].apply(calculate_average_owners)
# Check if any na arose
games.isnull().sum()

appid             0
name              0
owners            0
average_owners    0
dtype: int64

In [19]:
merge = pd.merge(validation_set,games, right_on='appid', left_on='app_id', how='inner')

In [20]:
merge = merge.drop(['appid', 'owners'], axis=1)
merge

Unnamed: 0,id,app_id,author_id,content,score,cleaned_content,tokens,name,average_owners
0,186820701,10,76561198861397461,ts crazy af so worth it,1,t crazy af worth,"['t', 'crazy', 'af', 'worth']",Counter-Strike,15000000.0
1,186818009,10,76561197993740346,A Classic LAN-game. Childhood memories.\r\nNow...,1,classic langame childhood memory infected peop...,"['classic', 'langame', 'childhood', 'memory', ...",Counter-Strike,15000000.0
2,186789771,10,76561199508598629,The most friendly community i have ever seen. ...,1,friendly community ever seen playing group guy...,"['friendly', 'community', 'ever', 'seen', 'pla...",Counter-Strike,15000000.0
3,186782787,10,76561199675346350,The best version of cs to this day. Nothing wi...,1,best version c day nothing ever beat,"['best', 'version', 'c', 'day', 'nothing', 'ev...",Counter-Strike,15000000.0
4,186781078,10,76561197960432447,A legendary tactical shooter that shaped the g...,1,legendary tactical shooter shaped genre simple...,"['legendary', 'tactical', 'shooter', 'shaped',...",Counter-Strike,15000000.0
...,...,...,...,...,...,...,...,...,...
253808,185072995,99900,76561198954663339,>downloads game\r\n>finishes tutorial\r\n>join...,1,downloads game finish tutorial join friend cur...,"['downloads', 'game', 'finish', 'tutorial', 'j...",Spiral Knights,3500000.0
253809,184877197,99900,76561199057223759,"pretty good game tbh, even tho i only played f...",1,pretty good game tbh even tho played hat proba...,"['pretty', 'good', 'game', 'tbh', 'even', 'tho...",Spiral Knights,3500000.0
253810,184872577,99900,76561198122368726,"piece of shit game, can't even be bothered to ...",0,piece shit game cant even bothered get hat fuc...,"['piece', 'shit', 'game', 'cant', 'even', 'bot...",Spiral Knights,3500000.0
253811,184857864,99900,76561199757360530,controls are weird,0,control weird,"['control', 'weird']",Spiral Knights,3500000.0


In [21]:
merge.to_csv('preprocessed_validationset.csv', index=False)