# VIDEO GAMES SALES PREDICTIONS (2024)

### Library

In [2]:
# Data Table 
import numpy as np                       # matrices & arrays
import pandas as pd                      # Data Table & dataframe 
from skimpy import skim                  # skim data
from prettytable import PrettyTable      # Create Tables

# Visualization 
import seaborn as sns
import matplotlib.pyplot as plt

# Hypothesis Testing 
import scipy.stats as sps                 # statistical tests
from scipy.stats.mstats import winsorize  # Winsorizing
import statsmodels.api as sm              # regression
from statsmodels.formula.api import ols   # regression model
from scipy.stats import boxcox            # ideal way to transform skewed to normal

# Machine Learning 
import sklearn as sklearn                 # scikit learn package
from sklearn.preprocessing import PolynomialFeatures
                                          # Polynomial Features
from sklearn.decomposition import PCA     # PCA
from sklearn.feature_extraction import DictVectorizer
                                          # Categorical encoding
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, LabelBinarizer, OneHotEncoder
                                          # Categorical Encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
                                          # Continuous Scalers 
from sklearn.model_selection import train_test_split
                                          # split train & test set
from sklearn.model_selection import KFold, cross_val_predict
                                          # Cross-validation 
from sklearn.pipeline import Pipeline
                                          # Pipeline the cross validation

from sklearn.model_selection import GridSearchCV
                                          # Grid Search CV 
from sklearn.metrics import r2_score, mean_squared_error
                                          # Model Evaluation Metrics


# Options
import warnings
warnings.filterwarnings('ignore')         # suppress all warnings; switch 'ignore' to 'default' to re-enable them

# pandas display settings, kept in one table so they can be tuned in one place
display_options = {
    'display.max_rows': 500,              # display max rows
    'display.max_columns': 500,           #         max cols
    'display.width': 1000,                #         max width
    'display.precision': 2,               #         round 2 places after decimal
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

### About

### The Dataset

In [4]:
# Read both CSV files (paths are relative to the working directory)
# into the `sales` and `scores` DataFrames, in that order.
sales, scores = (pd.read_csv(csv_name) for csv_name in ('sales.csv', 'scores.csv'))

In [5]:
# Summarise the sales frame: column names, non-null counts and dtypes.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16717 non-null  object 
 1   Platform         16719 non-null  object 
 2   Year_of_Release  16450 non-null  float64
 3   Genre            16717 non-null  object 
 4   Publisher        16665 non-null  object 
 5   NA_Sales         16719 non-null  float64
 6   EU_Sales         16719 non-null  float64
 7   JP_Sales         16719 non-null  float64
 8   Other_Sales      16719 non-null  float64
 9   Global_Sales     16719 non-null  float64
 10  Critic_Score     8137 non-null   float64
 11  Critic_Count     8137 non-null   float64
 12  User_Score       7590 non-null   float64
 13  User_Count       7590 non-null   float64
 14  Developer        10096 non-null  object 
 15  Rating           9950 non-null   object 
dtypes: float64(10), object(6)
memory usage: 2.0+ MB


In [6]:
# Summarise the scores frame: column names, non-null counts and dtypes.
scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17944 entries, 0 to 17943
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        17944 non-null  object
 1   platform    17944 non-null  object
 2   r-date      17944 non-null  object
 3   score       17944 non-null  int64 
 4   user score  17944 non-null  object
 5   developer   17944 non-null  object
 6   genre       17944 non-null  object
 7   players     17922 non-null  object
 8   critics     17944 non-null  int64 
 9   users       17944 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 1.4+ MB


In [7]:
# scores has more rows than sales (17,944 vs 16,719), so bring the sales data into scores

In [None]:
# step 1. renaming 
scores.columns = ['name','platform','released_year',