# Data Preprocessing for Recommender Systems

In [7]:
import pandas as pd
import numpy as np

### Movies

#### Cleaning of rows with NA

In [8]:
# Load the movie catalogue.
# how='all' drops a row only when BOTH title and genres are missing, so a row
# with just one of the two missing is kept.
movies_df = (
    pd.read_csv('datasets/movies/movies.csv')
    .dropna(subset=['title', 'genres'], how='all')
    # Turn the pipe-separated genres string into a list of genre names.
    # The vectorised .str.split leaves a missing genres value as NaN, whereas
    # calling x.split('|') on it through .apply would raise AttributeError.
    # Using .assign also avoids writing a column into the result of dropna.
    .assign(genres=lambda df: df['genres'].str.split('|'))
)

# Load the ratings; a row is dropped only when both movieId and rating are missing.
ratings_df = (
    pd.read_csv('datasets/movies/ratings.csv')
    .dropna(subset=['movieId', 'rating'], how='all')
)

print('Movies Dataset:')
display(movies_df.head())

print('Ratings Dataset:')
display(ratings_df.head())

Movies Dataset:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


Ratings Dataset:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


#### Setting the minimum rating that counts as a recommendation

In [9]:
# Ratings strictly below this value are discarded.
ratingThreshold = 4.5

# Capture the size before filtering so both counts can be reported.
# NOTE: ratings_df is rebound below, so re-running only this cell reports the
# already-filtered size as the "original" number of reviews.
ratings_count_before = ratings_df.shape[0]
ratings_df = ratings_df.loc[ratings_df['rating'].ge(ratingThreshold)]

print(
    f'Original number of reviews: {ratings_count_before}',
    f'Number of reviews after filtering: {len(ratings_df)}',
    'Filtered Ratings Dataset:',
    sep='\n',
)
display(ratings_df.head())

Original number of reviews: 25000095
Number of reviews after filtering: 5813013
Filtered Ratings Dataset:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
8,1,1237,5.0,1147868839
16,1,2351,4.5,1147877957


#### Keeping only users whose review count is within the provided range

In [10]:
# Inclusive limits on the number of ratings a user must have to be kept.
lower_bound, upper_bound = 15, 100

# Number of rows in ratings_df per userId.
user_review_counts = ratings_df['userId'].value_counts()

# Ids of the users whose count lies in [lower_bound, upper_bound].
filtered_users = user_review_counts.loc[
    user_review_counts.between(lower_bound, upper_bound)
].index

# Keep only the ratings written by those users.
ratings_df_filtered = ratings_df.loc[ratings_df['userId'].isin(filtered_users)]

print(
    f'Original number of users: {len(user_review_counts)}',
    f'Number of users after filtering: {len(filtered_users)}',
    'Filtered Ratings Dataset:',
    sep='\n',
)
display(ratings_df_filtered.head())

Original number of users: 160477
Number of users after filtering: 85154
Filtered Ratings Dataset:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
8,1,1237,5.0,1147868839
16,1,2351,4.5,1147877957


### Steam

#### Cleaning of rows with NA

In [11]:
# Load the user reviews. Rows without an app_id are dropped before the integer
# cast, because astype(int) raises on missing values; the items frame below is
# treated the same way.
reviews_df_steam = (
    pd.read_json('datasets/steam/formatted_user_reviews.json')
    .dropna(subset=['app_id'])
    .assign(app_id=lambda df: df['app_id'].astype(int))
)

# Load the game catalogue with the same app_id handling, so the ids in both
# frames are plain integers and can be matched with isin below.
items_df_steam = (
    pd.read_json('datasets/steam/formatted_steam_games.json')
    .dropna(subset=['app_id'])
    .assign(app_id=lambda df: df['app_id'].astype(int))
)
print("Steam datasets loaded successfully.")

# Remove games with no title and genres
# (how='all': a game is dropped only when both fields are missing).
items_df_steam_cleaned = items_df_steam.dropna(subset=['title', 'genres'], how='all')

# Remove reviews for the removed games; isin accepts the Series directly,
# so no intermediate Python list is needed.
reviews_df_steam_cleaned = reviews_df_steam[
    reviews_df_steam['app_id'].isin(items_df_steam_cleaned['app_id'])
]

print("Cleaned Reviews Dataset:")
display(reviews_df_steam_cleaned.head())
print("Cleaned Items Dataset:")
display(items_df_steam_cleaned.head())

Steam datasets loaded successfully.
Cleaned Reviews Dataset:


Unnamed: 0,user_id,app_id
0,76561197970982479,1250
1,76561197970982479,22200
4,js41637,227300
5,js41637,239030
6,evcentric,248820


Cleaned Items Dataset:


Unnamed: 0,app_id,title,genres
0,761140,Lost Summoner Kitty,"[Action, Casual, Indie, Simulation, Strategy]"
1,643980,Ironbound,"[Free to Play, Indie, RPG, Strategy]"
2,670290,Real Pool 3D - Poolians,"[Casual, Free to Play, Indie, Simulation, Sports]"
3,767400,弹炸人2222,"[Action, Adventure, Casual]"
5,772540,Battle Royale Trainer,"[Action, Adventure, Simulation]"


#### Keeping only users whose review count is within the provided range

In [12]:
# Inclusive limits on the number of reviews a Steam user must have to be kept.
lower_bound, upper_bound = 5, 100

# Number of rows in the cleaned reviews per user_id.
user_review_counts = reviews_df_steam_cleaned['user_id'].value_counts()

# Ids of the users whose count lies in [lower_bound, upper_bound].
filtered_users = user_review_counts.loc[
    user_review_counts.between(lower_bound, upper_bound)
].index

# Keep only the reviews written by those users.
reviews_df_steam_filtered = reviews_df_steam_cleaned.loc[
    reviews_df_steam_cleaned['user_id'].isin(filtered_users)
]

print(
    f'Original number of users: {len(user_review_counts)}',
    f'Number of users after filtering: {len(filtered_users)}',
    'Filtered Steam Reviews Dataset:',
    sep='\n',
)
display(reviews_df_steam_filtered.head())

Original number of users: 22077
Number of users after filtering: 2067
Filtered Steam Reviews Dataset:


Unnamed: 0,user_id,app_id
6,evcentric,248820
7,evcentric,370360
8,evcentric,237930
9,evcentric,263360
10,evcentric,107200
