In [13]:
import pandas as pd
import os

# Load every available review shard (parts 1-9) and combine them into a single
# DataFrame named `review_data`.
#
# The shards are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies all previously loaded rows on every
# iteration, which grows quadratically with the number of shards.
review_parts = []

for i in range(1, 10):
    file_path = os.path.join('review', f'steam_game_reviews_part_{i}.csv')

    if os.path.exists(file_path):
        print(f'File found: {os.path.abspath(file_path)}')
        part_data = pd.read_csv(file_path)
        review_parts.append(part_data)
    else:
        # A missing shard is reported but does not abort the load.
        print(f'File not found: {os.path.abspath(file_path)}')

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no shard was found; the check below then reports the failure.
if review_parts:
    review_data = pd.concat(review_parts, ignore_index=True)
else:
    review_data = pd.DataFrame()

if review_data.empty:
    print("Error: No game data loaded.")
else:
    print(f"Successfully loaded {len(review_data)} rows.")

File found: D:\data-mining-hcmiu\review\steam_game_reviews_part_1.csv
File found: D:\data-mining-hcmiu\review\steam_game_reviews_part_2.csv
File found: D:\data-mining-hcmiu\review\steam_game_reviews_part_3.csv
File found: D:\data-mining-hcmiu\review\steam_game_reviews_part_4.csv
File found: D:\data-mining-hcmiu\review\steam_game_reviews_part_5.csv
File found: D:\data-mining-hcmiu\review\steam_game_reviews_part_6.csv
File found: D:\data-mining-hcmiu\review\steam_game_reviews_part_7.csv
File found: D:\data-mining-hcmiu\review\steam_game_reviews_part_8.csv
File found: D:\data-mining-hcmiu\review\steam_game_reviews_part_9.csv
Successfully loaded 322003 rows.


In [14]:
# Preview the first five rows of the combined review table.
review_data.head(n=5)

Unnamed: 0,app_id,recommendationid,review,voted_up,timestamp_created,playtime_forever,weighted_vote_score,votes_up,steamid
0,2686990,152851098,This game right here people... THIS LITTLE AM...,True,1701667101,589.0,0.345303,4,76561198053422627
1,2686990,163997328,Recommended.,True,1714578984,93.0,0.5,0,76561198104503662
2,2686980,180252847,If you like Watermelon Game but are into space...,True,1732770815,89.0,0.5,0,76561198066151919
3,2687000,185616026,i peed myself playing this. 10/10. fairly good...,True,1736903148,13.0,0.5,0,76561199411862329
4,2686950,187429815,Super fun platformer! Really funny writing too!,True,1738982393,7.0,0.52381,4,76561198088325844


In [15]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
review_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322003 entries, 0 to 322002
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   app_id               322003 non-null  int64  
 1   recommendationid     322003 non-null  int64  
 2   review               322003 non-null  object 
 3   voted_up             322003 non-null  bool   
 4   timestamp_created    322003 non-null  int64  
 5   playtime_forever     322003 non-null  float64
 6   weighted_vote_score  322003 non-null  float64
 7   votes_up             322003 non-null  int64  
 8   steamid              322003 non-null  int64  
dtypes: bool(1), float64(2), int64(5), object(1)
memory usage: 20.0+ MB


In [16]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
review_data.isna().sum()

app_id                 0
recommendationid       0
review                 0
voted_up               0
timestamp_created      0
playtime_forever       0
weighted_vote_score    0
votes_up               0
steamid                0
dtype: int64

In [17]:
import re
import emoji
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus used by clean_text is available locally.
nltk.download('stopwords')

# Patterns are compiled once instead of being looked up on every call.
_URL_RE = re.compile(r'http\S+')
_NON_LETTER_RE = re.compile(r'[^A-Za-z\s]')

# Filled on first use so the stop-word corpus is read once, not once per review.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalize a review string for text mining.

    Steps, in order: strip URLs, drop every character that is not an ASCII
    letter or whitespace, lowercase, then remove stop words and collapse
    whitespace to single spaces.

    Args:
        text: The raw review. Any non-string value (e.g. NaN) yields "".
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used (loaded once and
            cached).

    Returns:
        The cleaned, lowercase, space-separated text; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = _english_stop_words()

    # Remove URLs
    text = _URL_RE.sub('', text)
    # Remove special characters, punctuation, and numbers. This also removes
    # every emoji (they are neither ASCII letters nor whitespace), so no
    # separate emoji pass is needed afterwards.
    text = _NON_LETTER_RE.sub('', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords; split()/join also trims and collapses extra spaces.
    # NOTE(review): apostrophes were stripped above, so a stop-word entry that
    # contains an apostrophe can never match here - confirm this is intended.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Replace each raw review with its cleaned form, element by element.
review_data['review'] = review_data['review'].map(clean_text)

[nltk_data] Downloading package stopwords to C:\Users\Khoi
[nltk_data]     Dang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Spot-check the first five cleaned reviews.
review_data['review'].head(n=5)

0    game right people little amazing gem exactly r...
1                                          recommended
2                   like watermelon game space amazing
3             peed playing fairly good schizo game tbh
4            super fun platformer really funny writing
Name: review, dtype: object

In [19]:
# Look at the first five values of the voted_up column.
review_data['voted_up'].head(n=5)

0    True
1    True
2    True
3    True
4    True
Name: voted_up, dtype: bool

In [20]:
# Number of reviews per steamid, most frequent first (defaults spelled out).
review_data['steamid'].value_counts(sort=True, ascending=False)

steamid
76561198030784015    515
76561198409134312    499
76561198019816374    352
76561198158993897    325
76561197970193418    281
                    ... 
76561198009430786      1
76561197972585129      1
76561197990809427      1
76561198268398344      1
76561198451685253      1
Name: count, Length: 244264, dtype: int64

In [21]:
# Encode the boolean voted_up flag as an integer: True -> 1, False -> 0.
# The dict is kept as a readable record of the encoding.
mapping_voted_up = {
    True: 1,
    False: 0
}
# astype('int64') gives the same 1/0 values as mapping through the dict, but is
# a vectorized cast and leaves the column unchanged if this cell is run again;
# a dict lookup on an already-encoded column depends on how pandas matches the
# integer values against the boolean keys.
review_data['voted_up'] = review_data['voted_up'].astype('int64')
review_data['voted_up'].head()

0    1
1    1
2    1
3    1
4    1
Name: voted_up, dtype: int64

In [25]:
# Preview the first five rows after cleaning the text and encoding voted_up.
review_data.head(n=5)

Unnamed: 0,app_id,recommendationid,review,voted_up,timestamp_created,playtime_forever,weighted_vote_score,votes_up,steamid
0,2686990,152851098,game right people little amazing gem exactly r...,1,1701667101,589.0,0.345303,4,76561198053422627
1,2686990,163997328,recommended,1,1714578984,93.0,0.5,0,76561198104503662
2,2686980,180252847,like watermelon game space amazing,1,1732770815,89.0,0.5,0,76561198066151919
3,2687000,185616026,peed playing fairly good schizo game tbh,1,1736903148,13.0,0.5,0,76561199411862329
4,2686950,187429815,super fun platformer really funny writing,1,1738982393,7.0,0.52381,4,76561198088325844


In [27]:
# Remove the columns that are not carried into the exported dataset.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell re-runnable: a second run finds the columns already gone and
# is a no-op instead of raising KeyError.
review_data = review_data.drop(
    columns=['timestamp_created', 'weighted_vote_score', 'votes_up'],
    errors='ignore',
)
review_data.head()

Unnamed: 0,app_id,recommendationid,review,voted_up,playtime_forever,steamid
0,2686990,152851098,game right people little amazing gem exactly r...,1,589.0,76561198053422627
1,2686990,163997328,recommended,1,93.0,76561198104503662
2,2686980,180252847,like watermelon game space amazing,1,89.0,76561198066151919
3,2687000,185616026,peed playing fairly good schizo game tbh,1,13.0,76561199411862329
4,2686950,187429815,super fun platformer really funny writing,1,7.0,76561198088325844


In [29]:
import numpy as np

# Split the DataFrame into 5 parts.
# np.array_split is given the row positions rather than the DataFrame itself:
# passing the DataFrame makes NumPy go through the deprecated
# DataFrame.swapaxes and emits a FutureWarning. Splitting the positions yields
# the same chunk sizes (the first len % 5 parts get one extra row).
N_PARTS = 5
OUTPUT_DIR = os.path.join('review', 'cleaned_review')

row_chunks = np.array_split(np.arange(len(review_data)), N_PARTS)
split_data = [review_data.iloc[rows] for rows in row_chunks]

# Save each part to a separate CSV file, creating the target folder first so
# to_csv does not fail when it does not exist yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for i, part in enumerate(split_data, start=1):
    part.to_csv(
        os.path.join(OUTPUT_DIR, f'steam_game_review_encoded_part_{i}.csv'),
        index=False,
    )

  return bound(*args, **kwds)
