# VIDEO GAMES SALES PREDICTIONS (2024)

### Library

In [1]:
# Data Table 
import numpy as np                       # matrices & arrays
import pandas as pd                      # Data Table & dataframe 
from skimpy import skim                  # skim data
from prettytable import PrettyTable      # Create Tables

# Visualization 
import seaborn as sns
import matplotlib.pyplot as plt

# Hypothesis Testing 
import scipy.stats as sps                 # statistical tests
from scipy.stats.mstats import winsorize  # Winsorizing
import statsmodels.api as sm              # regression
from statsmodels.formula.api import ols   # regression model
from scipy.stats import boxcox            # ideal way to transform skewed to normal

# Machine Learning 
import sklearn as sklearn                 # scikit learn package
from sklearn.preprocessing import PolynomialFeatures
                                          # Polynomial Features
from sklearn.decomposition import PCA     # PCA
from sklearn.feature_extraction import DictVectorizer
                                          # Categorical encoding
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, LabelBinarizer, OneHotEncoder
                                          # Categorical Encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
                                          # Continuous Scalers 
from sklearn.model_selection import train_test_split
                                          # split train & test set
from sklearn.model_selection import KFold, cross_val_predict
                                          # Cross-validation 
from sklearn.pipeline import Pipeline
                                          # Pipeline the cross validation

from sklearn.model_selection import GridSearchCV
                                          # Grid Search CV 
from sklearn.metrics import r2_score, mean_squared_error
                                          # Model Evaluation Metrics


# Options
import warnings
warnings.filterwarnings('ignore')         # suppress all warnings; switch 'ignore' to 'default' to to re-enable it again
pd.set_option('display.max_rows', 500)    # display max rows 
pd.set_option('display.max_columns', 500) #         max cols
pd.set_option('display.width', 1000)      #         max width
pd.set_option('display.precision', 2)     #         round 2 places after decimal 

### About

### The Dataset

In [2]:
# importing
sales = pd.read_csv('sales.csv')

# rename 
sales.columns = ['name','platform','release_year',
                 'genre','publisher','na_sales','eu_sales',
                 'jp_sales','other_sales','global_sales',
                 'critic_scores','critic_count','user_scores',
                 'user_count','developer','rating']

# Information
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           16717 non-null  object 
 1   platform       16719 non-null  object 
 2   release_year   16450 non-null  float64
 3   genre          16717 non-null  object 
 4   publisher      16665 non-null  object 
 5   na_sales       16719 non-null  float64
 6   eu_sales       16719 non-null  float64
 7   jp_sales       16719 non-null  float64
 8   other_sales    16719 non-null  float64
 9   global_sales   16719 non-null  float64
 10  critic_scores  8137 non-null   float64
 11  critic_count   8137 non-null   float64
 12  user_scores    7590 non-null   float64
 13  user_count     7590 non-null   float64
 14  developer      10096 non-null  object 
 15  rating         9950 non-null   object 
dtypes: float64(10), object(6)
memory usage: 2.0+ MB


### FUNCTIONS

##### nadup()

In [3]:
# Functions for calculation
def nadup(df):
    arr, arr2, arr3, arr4 = [], [], [], []
    for col in df.columns:
      temp = [str(x) for x in df[col].unique()]    # convert cols to str
      temp2 = df[col].isna().sum()                 # calculate sum of NaN    
      temp3 = (df[col].isna().sum())/len(df)*100   # calculate % of NaN
      arr.append(', '.join(temp))                   
      arr2.append(len(temp))
      arr3.append(temp2)
      arr4.append(round(temp3,1))
    print('The dataframe has a total of %i rows & %i columns. A total of %i NA values were detected.\n' %(df.shape[0],df.shape[1],df.isnull().any(axis=1).sum()),
          ' This dataframe has',df.duplicated().sum(),'duplicated rows')
    summary = pd.DataFrame({
        'Variables': df.columns,
        'Type':df.dtypes.to_list(),
        'Unique Values':arr,
        'Sum of Unique Values': arr2,
        'Sum of NaN Values': arr3,
        '% of NaN': arr4}).sort_values('% of NaN', ascending = False)
    display(summary)

# DATA CLEANING 

### Duplicates

In [4]:
nadup(sales)

The dataframe has a total of 16719 rows & 16 columns. A total of 9894 NA values were detected.
  This dataframe has 0 duplicated rows


Unnamed: 0,Variables,Type,Unique Values,Sum of Unique Values,Sum of NaN Values,% of NaN
12,user_scores,float64,"8.0, nan, 8.3, 8.5, 6.6, 8.4, 8.6, 7.7, 6.3, 7...",96,9129,54.6
13,user_count,float64,"322.0, nan, 709.0, 192.0, 431.0, 129.0, 594.0,...",889,9129,54.6
10,critic_scores,float64,"76.0, nan, 82.0, 80.0, 89.0, 58.0, 87.0, 91.0,...",83,8582,51.3
11,critic_count,float64,"51.0, nan, 73.0, 65.0, 41.0, 80.0, 64.0, 63.0,...",107,8582,51.3
15,rating,object,"E, nan, M, T, E10+, K-A, AO, EC, RP",9,6769,40.5
14,developer,object,"Nintendo, nan, Good Science Studio, Rockstar N...",1697,6623,39.6
2,release_year,float64,"2006.0, 1985.0, 2008.0, 2009.0, 1996.0, 1989.0...",40,269,1.6
4,publisher,object,"Nintendo, Microsoft Game Studios, Take-Two Int...",583,54,0.3
0,name,object,"Wii Sports, Super Mario Bros., Mario Kart Wii,...",11563,2,0.0
1,platform,object,"Wii, NES, GB, DS, X360, PS3, PS2, SNES, GBA, P...",31,0,0.0


### NA values

# Exploratory Data Analysis (EDA)

### Pairplot 

# Pre-processing 

### Standardization 

### Outliers

### Features Engineering 

# Data Splits

### Train-Test Split

### Cross-validation 

# Supervised ML Model - Regression

### Model 1

### Model 2

### Model 3

### Evaluation Metric

##### Comparison Table

##### Plot of all Prediction

# CONCLUSION

### Minh K. Chau