In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder


In [19]:
df = pd.read_csv('../data/Project_merge.csv', low_memory=False)

In [20]:
non_numeric_cols = df.select_dtypes(exclude=['number']).columns
print(non_numeric_cols)

Index(['name', 'release_date', 'developers', 'publishers', 'categories',
       'genres', 'publisherClass'],
      dtype='object')


In [21]:
cols_to_encode = ['developers', 'publishers', 'publisherClass']
# Dictionary to store LabelEncoders
encoders = {}

for col in cols_to_encode:
    # Initialize encoder
    le = LabelEncoder()
    df[col + '_encoded'] = le.fit_transform(df[col])
    encoders[col] = le
    

In [23]:
numeric_df = df.select_dtypes(include=['number']).copy()
print(numeric_df.head(10))

   appid  price   positive   negative   peak_ccu  pct_pos_total  \
0     20   4.99     7500.0     1121.0       46.0           86.0   
1    240   9.99   172801.0     6697.0    14426.0           96.0   
2    300   9.99    20604.0     1878.0      285.0           90.0   
3    360   9.99     3492.0     1195.0       55.0           73.0   
4    440   0.00  1025633.0   120619.0    50817.0           89.0   
5    570   0.00  1998462.0   451338.0   555977.0           81.0   
6    730   0.00  7480813.0  1135108.0  1212356.0           86.0   
7   1500   9.99      855.0      214.0        4.0           78.0   
8   1610   1.99      225.0       39.0       19.0           86.0   
9   2280   9.99    22813.0      870.0      234.0           96.0   

   num_reviews_total   copiesSold  reviewScore  developers_encoded  \
0             6482.0     378635.0         87.0               46087   
1           124438.0   15468468.0         96.0               46087   
2            15155.0    1172320.0         92.0      

In [28]:
genre = df['genres'].str.get_dummies(sep=',')
categories = df['categories'].str.get_dummies(sep=',')

numeric_df_final = pd.concat([numeric_df, genre, categories], axis=1)
print(numeric_df_final.shape)

(84536, 197)


In [30]:
X = numeric_df_final.drop(columns=['appid', 'copiesSold']).copy()
y = numeric_df_final['copiesSold'].copy()

In [31]:
correlations = X.corrwith(y).abs().sort_values(ascending=False)
X = X[correlations.index]
X.head()

Unnamed: 0,num_reviews_total,positive,negative,peak_ccu,'Valve Anti-Cheat enabled','Remote Play on Tablet'],'Steam Timeline'],'Remote Play on Phone','Remote Play on Tablet','In-App Purchases',...,['VR Only'],['Co-op',['MMO',['Partial Controller Support',['Early Access','VR Support'],'Web Publishing',['Free To Play',['Partial Controller Support'],'Commentary available']
0,6482.0,7500.0,1121.0,46.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,124438.0,172801.0,6697.0,14426.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,15155.0,20604.0,1878.0,285.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3822.0,3492.0,1195.0,55.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1146642.0,1025633.0,120619.0,50817.0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)