# VIDEO GAMES SALES PREDICTIONS (2024)

### Library

In [None]:
# Data Table 
import numpy as np                       # matrices & arrays
import pandas as pd                      # Data Table & dataframe 
from skimpy import skim                  # skim data
from prettytable import PrettyTable      # Create Tables

# Visualization 
import seaborn as sns
import matplotlib.pyplot as plt

# Hypothesis Testing 
import scipy.stats as sps                 # statistical tests
from scipy.stats.mstats import winsorize  # Winsorizing
import statsmodels.api as sm              # regression
from statsmodels.formula.api import ols   # regression model
from scipy.stats import boxcox            # ideal way to transform skewed to normal

# Machine Learning 
import sklearn as sklearn                 # scikit learn package
from sklearn.preprocessing import PolynomialFeatures
                                          # Polynomial Features
from sklearn.decomposition import PCA     # PCA
from sklearn.feature_extraction import DictVectorizer
                                          # Categorical encoding
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, LabelBinarizer, OneHotEncoder
                                          # Categorical Encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
                                          # Continuous Scalers 
from sklearn.model_selection import train_test_split
                                          # split train & test set
from sklearn.model_selection import KFold, cross_val_predict
                                          # Cross-validation 
from sklearn.pipeline import Pipeline
                                          # Pipeline the cross validation

from sklearn.model_selection import GridSearchCV
                                          # Grid Search CV 
from sklearn.metrics import r2_score, mean_squared_error
                                          # Model Evaluation Metrics


# Options: notebook-wide warning filter and pandas display settings
import warnings
warnings.filterwarnings('ignore')         # suppress all warnings; switch 'ignore' to 'default' to re-enable them
pd.set_option('display.max_rows', 500)    # show up to 500 rows when displaying a dataframe
pd.set_option('display.max_columns', 500) # show up to 500 columns
pd.set_option('display.width', 1000)      # allow output up to 1000 characters wide
pd.set_option('display.precision', 2)     # display floats with 2 places after the decimal

### Functions

##### nadup()

In [None]:
# Functions for calculation
def nadup(df):
    """Print an overview of ``df`` and show a per-column summary table.

    The overview reports the number of rows and columns, the total number
    of missing (NaN) cells, and the number of fully duplicated rows.
    The summary table has one row per column of ``df``: its dtype, its
    unique values joined into one string, how many unique values it has
    (NaN counts as a value here, because ``Series.unique`` keeps it), and
    the count and percentage of NaN. It is sorted by '% of NaN', highest
    first.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to inspect.

    Returns
    -------
    None
        Everything is printed / displayed; nothing is returned.
    """
    n_rows = len(df)
    unique_text, unique_counts, na_counts, na_percents = [], [], [], []
    for col in df.columns:
        values = [str(x) for x in df[col].unique()]   # unique values as text
        n_missing = df[col].isna().sum()              # NaN count, computed once
        unique_text.append(', '.join(values))
        unique_counts.append(len(values))
        na_counts.append(n_missing)
        na_percents.append(round(n_missing / n_rows * 100, 1))

    # Count every missing cell so the number matches the "NA values" wording
    # (counting rows that contain an NA would under-report rows with several).
    total_na = df.isna().sum().sum()
    print('The dataframe has a total of %i rows & %i columns. A total of %i NA values were detected.\n' %(df.shape[0],df.shape[1],total_na),
          ' This dataframe has',df.duplicated().sum(),'duplicated rows')

    summary = pd.DataFrame({
        'Variables': df.columns,
        'Type':df.dtypes.to_list(),
        'Unique Values':unique_text,
        'Sum of Unique Values': unique_counts,
        'Sum of NaN Values': na_counts,
        '% of NaN': na_percents}).sort_values('% of NaN', ascending = False)

    # `display` is not imported in this file; it is normally injected by
    # IPython/Jupyter. Fall back to print so the function also works in a
    # plain Python session.
    try:
        show = display
    except NameError:
        show = print
    show(summary)

### About the dataset
1. Where is it from? (i.e., who was the author, where did I get it)
2. What is it about? 

In [None]:
# Load the sales table from the CSV file
sales = pd.read_csv('sales.csv')

# Replace the header with lowercase snake_case names.
# The names are assigned by position, so the list must match the
# CSV's column order and count.
sales.columns = [
    'name',
    'platform',
    'release_year',
    'genre',
    'publisher',
    'na_sales',
    'eu_sales',
    'jp_sales',
    'other_sales',
    'global_sales',
    'critic_scores',
    'critic_count',
    'user_scores',
    'user_count',
    'developer',
    'rating',
]

# Print the dataframe's structural summary
sales.info()

### Goals
What are the goals I would like to accomplish with this dataset?
- Descriptive. What have we done in the past? 
- Predictive. Where are we going and when?
- Prescriptive. How should we take action? 
- Diagnostic. Why have we seen past results? 

# Methods

### Variables of interest
|Variables|Class|Type|Definition|Interpretation|
|---------|-----|----|----------|--------------|
|GDP|continuous|outcome|The amount of money a country makes in a year, in US dollars|Higher GDP = Richer country|
|Age|continuous|predictor|A person's age, in years|Higher age = older|
|Sex|categorical|predictor|a person's biological sex|M = male, F = female|

### Analysis Plan
1. How will the data be processed and cleaned (i.e., duplicates, NA values, outliers)?
2. Exploratory Data Analysis
    - What are some relationships between predictors and outcomes?
    - Do we need a pairplot? 
    - What are the correlations between all predictors and outcomes?
    - If any, which hypothesis tests should we use? 
3. Pre-processing steps
    - Do we need to perform feature engineering? If so, why?
    - Will this be a classification or regression model? 
4. Data split 
    - What percentage of the data will be held out as test data? 
    - Do we perform cross-validation? 
5. Building the model
    - What models will we use? (linear regression, ridge, lasso, etc.)
    - What evaluation metric will we use? (R2, confusion matrix, etc.)
    - Comparison table
    - Plot of all predictions 
6. Conclusion.

# DATA CLEANING 

### Duplicates

In [None]:
# Print the row/column, NA and duplicate counts for `sales`, then show the per-column summary table
nadup(sales)

### NA values

# Exploratory Data Analysis (EDA)

### Pairplot 

# Pre-processing 

### Standardization 

### Outliers

### Feature Engineering 

# Data Splits

### Train-Test Split

### Cross-validation 

# Supervised ML Model - Regression

### Model 1

### Model 2

### Model 3

### Evaluation Metric

##### Comparison Table

##### Plot of all Predictions

# CONCLUSION

### Minh K. Chau