In [1]:
import pandas as pd
import numpy as np
import joblib

import random

from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC,SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor


from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the merged sales / review dataset
df = pd.read_csv('vg_mc_jv.csv')

# Target variable: quartile of global sales, labelled 1 (lowest) to 4 (highest)
quartile_edges = [0, 0.25, 0.5, 0.75, 1]
df['y'] = pd.qcut(df['Global_Sales'], q=quartile_edges, labels=[1, 2, 3, 4])

# Put every rating on the same scale (everything is brought back to /10)
for rating_col, divisor in (('Test_MC', 10), ('Test_JV', 2), ('Players_JV', 2)):
    df[rating_col] = df[rating_col] / divisor

# Drop the columns that are not used as features
unused_cols = ['Name', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Year']
df = df.drop(columns=unused_cols)

# Features & target
X = df.drop(columns='y')
Y = df['y']

# Split the feature names into categorical (object dtype) and everything else
cat_var = X.select_dtypes(include='O').columns
num_var = X.select_dtypes(exclude='O').columns

# Fonction Pipeline
def build_pipeline(algo, scaler, encoder, num_cols=None, cat_cols=None):
    """Build a preprocessing + estimator pipeline.

    Parameters
    ----------
    algo : estimator
        Final step of the pipeline, stored under the step name 'modelisation'.
    scaler : transformer
        Transformer applied to the numeric columns.
    encoder : transformer
        Transformer applied to the categorical columns.
    num_cols, cat_cols : sequence of column labels, optional
        Columns routed to ``scaler`` and ``encoder`` respectively. When left
        to ``None`` the notebook-level ``num_var`` / ``cat_var`` are used
        (looked up at call time), which is the historical behaviour.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps 'preprocessing' and 'modelisation'. It is not
        fitted here.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working
    if num_cols is None:
        num_cols = num_var
    if cat_cols is None:
        cat_cols = cat_var

    # Numeric branch
    numeric_transformer = make_pipeline(scaler)

    # Categorical branch
    categorical_transformer = make_pipeline(encoder)

    # Route each group of columns to its own transformer
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, num_cols),
                      ('cat', categorical_transformer, cat_cols)])

    # Final pipeline: preprocessing followed by the estimator
    model = Pipeline(steps=[('preprocessing', preprocessor),
                            ('modelisation', algo)])

    return model

# Build one pipeline with sample components; as the last expression of the
# cell, its representation is displayed
build_pipeline(
    algo=DecisionTreeClassifier(),
    scaler=MinMaxScaler(),
    encoder=OneHotEncoder(),
)

In [3]:
# Train / test split (20 % held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Candidate classifiers, seeded wherever a random_state is passed
models = [
    DecisionTreeClassifier(random_state=0),
    KNeighborsClassifier(),
    LogisticRegression(random_state=0, max_iter=1000),
    SVC(random_state=0),
    RandomForestClassifier(random_state=0),
]

scaler = MinMaxScaler()
encoder = OneHotEncoder(handle_unknown='ignore')

# Fit one pipeline per classifier and record its score on both splits
train, test = [], []
for element in models:
    model_pipeline = build_pipeline(element, scaler, encoder)
    model_pipeline.fit(X_train, y_train)
    train.append(model_pipeline.score(X_train, y_train))
    test.append(model_pipeline.score(X_test, y_test))

# Row labels are derived from the estimators so they cannot drift out of sync
# with `models` when the list is edited
index = [type(element).__name__ for element in models]
results = pd.DataFrame({'train': train, 'test': test}, index=index)

results

Unnamed: 0,train,test
DecisionTreeClassifier,0.995296,0.435737
KNeighborsClassifier,0.629165,0.459248
LogisticRegression,0.519012,0.489028
SVC,0.591925,0.507837
RandomForestClassifier,0.995296,0.503135


In [4]:
# Reproduce the previous results without going through a Pipeline, as a cross-check

df = pd.read_csv('vg_mc_jv.csv')

# Target variable: quartile of global sales, labelled 1 to 4
df['y'] = pd.qcut(df['Global_Sales'], q=[0, 0.25, 0.5, 0.75, 1], labels=[1, 2, 3, 4])

# Put every rating on the same scale (everything is brought back to /10)
df['Test_MC'] = df['Test_MC'] / 10
df['Test_JV'] = df['Test_JV'] / 2
df['Players_JV'] = df['Players_JV'] / 2

# NOTE: an earlier experiment added a random 'unique' column (to the whole
# frame, then to each split) so that no duplicate rows would be generated.
# It was disabled (kept only as bare string literals) and is removed here.

# Drop the columns that are not used as features
df = df.drop(['Name', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Year'], axis=1)

# Features & target
X = df.drop('y', axis=1)
Y = df['y']

# Categorical vs numeric feature names
cat_var = X.select_dtypes('O').columns
num_var = X.drop(cat_var, axis=1).columns

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Numeric columns: the scaler is fitted on the training split only and reused on test
scaler = MinMaxScaler()
X_train_num_sc = pd.DataFrame(scaler.fit_transform(X_train[num_var]), columns=num_var)  # 2551x6 in the recorded run
X_test_num_sc = pd.DataFrame(scaler.transform(X_test[num_var]), columns=num_var)  # 638x6 in the recorded run

# Categorical columns: same fit-on-train / transform-on-test discipline
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat = pd.DataFrame(encoder.fit_transform(X_train[cat_var]).toarray(),
                           columns=encoder.get_feature_names_out())  # 2551x160 in the recorded run
X_test_cat = pd.DataFrame(encoder.transform(X_test[cat_var]).toarray(),
                          columns=encoder.get_feature_names_out())  # 638x160 in the recorded run

# Both halves carry a fresh 0..n-1 row index, so the column-wise concat aligns
# row by row. Naming the columns avoids the duplicate integer labels that the
# unnamed frames would otherwise produce.
Xtrain = pd.concat([X_train_num_sc, X_train_cat], axis=1)
Xtest = pd.concat([X_test_num_sc, X_test_cat], axis=1)

# Same five classifiers, in the same order, as in the pipeline-based cell
clf = DecisionTreeClassifier(random_state=0)
knn = KNeighborsClassifier()
logreg = LogisticRegression(random_state=0, max_iter=1000)
SVM = SVC(random_state=0)
rf = RandomForestClassifier(random_state=0)
estimators = [clf, knn, logreg, SVM, rf]

train, test = [], []
for estimator in estimators:
    estimator.fit(Xtrain, y_train)
    train.append(estimator.score(Xtrain, y_train))
    test.append(estimator.score(Xtest, y_test))

index = [type(estimator).__name__ for estimator in estimators]
results = pd.DataFrame({'train': train, 'test': test}, index=index)

results


Unnamed: 0,train,test
DecisionTreeClassifier,0.995296,0.435737
KNeighborsClassifier,0.629165,0.459248
LogisticRegression,0.519404,0.489028
SVC,0.591925,0.507837
RandomForestClassifier,0.995296,0.503135


In [5]:
# AdaBoost on top of a decision tree

ada_estim = DecisionTreeClassifier(random_state=0)
ada_estim.fit(Xtrain, y_train)

# First two rows: the base estimator alone, on the train split then on the test split
score = [ada_estim.score(Xtrain, y_train), ada_estim.score(Xtest, y_test)]
# The first label used to read 'clf test (estimateur)' although the value is the train score
index = ['clf train (estimateur)', 'clf test (estimateur)']
for i in range(1, 10):
    # random_state is fixed so that re-running the cell gives the same scores
    ada = AdaBoostClassifier(estimator=ada_estim, n_estimators=i, random_state=0)
    ada.fit(Xtrain, y_train)
    score.append(ada.score(Xtest, y_test))
    index.append('ADA '+str(i)+' estimateur(s)')

results_adaclf = pd.DataFrame({'Score': score}, index=index)
display(results_adaclf)

# AdaBoost on top of a random forest

# Seeded like the other estimators of the notebook; previously unseeded, so
# the scores changed from one run to the next
ada_estim = RandomForestClassifier(max_depth=10, random_state=0)
ada_estim.fit(Xtrain, y_train)
a = ada_estim.score(Xtrain, y_train)
b = ada_estim.score(Xtest, y_test)

score = [a, b]
index = ['RF train (estimateur)', 'RF test (estimateur)']
for i in range(1, 10):
    ada = AdaBoostClassifier(estimator=ada_estim, n_estimators=i, random_state=0)
    ada.fit(Xtrain, y_train)
    score.append(ada.score(Xtest, y_test))
    index.append('ADA '+str(i)+' estimateur(s)')

results_adarf = pd.DataFrame({'Score': score}, index=index)
display(results_adarf)

Unnamed: 0,Score
clf test (estimateur),0.995296
clf test (estimateur),0.435737
ADA 1 estimateur(s),0.437304
ADA 2 estimateur(s),0.432602
ADA 3 estimateur(s),0.438871
ADA 4 estimateur(s),0.442006
ADA 5 estimateur(s),0.443574
ADA 6 estimateur(s),0.443574
ADA 7 estimateur(s),0.446708
ADA 8 estimateur(s),0.434169


Unnamed: 0,Score
RF train (estimateur),0.716582
RF test (estimateur),0.479624
ADA 1 estimateur(s),0.487461
ADA 2 estimateur(s),0.489028
ADA 3 estimateur(s),0.49373
ADA 4 estimateur(s),0.50627
ADA 5 estimateur(s),0.507837
ADA 6 estimateur(s),0.507837
ADA 7 estimateur(s),0.517241
ADA 8 estimateur(s),0.510972


In [5]:
##### REGRESSION STUDY #####

df = pd.read_csv('vg_mc_jv.csv')

# Put every rating on the same scale (everything is brought back to /10)
df['Test_MC'] = df['Test_MC'] / 10
df['Test_JV'] = df['Test_JV'] / 2
df['Players_JV'] = df['Players_JV'] / 2

# NOTE: a disabled experiment (random 'unique' column meant to avoid
# duplicates), kept only as a bare string literal, has been removed here.

# Drop the columns that are not used as features, then exact duplicate rows
df = df.drop(['Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Year'], axis=1)
df = df.drop_duplicates()

# Features & target (the raw sales figure this time, not its quartile)
X = df.drop('Global_Sales', axis=1)
Y = df['Global_Sales']

# Categorical vs numeric feature names (read by build_pipeline)
cat_var = X.select_dtypes('O').columns
num_var = X.drop(cat_var, axis=1).columns

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

models = [DecisionTreeRegressor(random_state=0), KNeighborsRegressor(), SVR(), RandomForestRegressor(random_state=0)]
mae_train, mae_test = [], []
mse_train, mse_test = [], []
rmse_train, rmse_test = [], []

for element in models:
    reg_pipeline = build_pipeline(element, MinMaxScaler(), OneHotEncoder(handle_unknown='ignore'))
    reg_pipeline.fit(X_train, y_train)
    y_pred_train = reg_pipeline.predict(X_train)
    y_pred_test = reg_pipeline.predict(X_test)

    # MSE is computed once per split and RMSE is taken as its square root.
    # The `squared=` keyword of mean_squared_error is no longer accepted by
    # recent scikit-learn releases, so it is avoided here.
    split_mse_train = mean_squared_error(y_train, y_pred_train)
    split_mse_test = mean_squared_error(y_test, y_pred_test)

    # training split
    mae_train.append(mean_absolute_error(y_train, y_pred_train))
    mse_train.append(split_mse_train)
    rmse_train.append(split_mse_train ** 0.5)

    # test split
    mae_test.append(mean_absolute_error(y_test, y_pred_test))
    mse_test.append(split_mse_test)
    rmse_test.append(split_mse_test ** 0.5)

# Row labels derived from the estimators so they stay in sync with `models`
index = [type(element).__name__ for element in models]

results = pd.DataFrame({'MAE train': mae_train, 'MAE test': mae_test,
                        'MSE train': mse_train, 'MSE test': mse_test,
                        'RMSE train': rmse_train, 'RMSE test': rmse_test},
                       index=index)

results

Unnamed: 0,MAE train,MAE test,MSE train,MSE test,RMSE train,RMSE test
DecisionTreeRegressor,0.009467,0.996183,0.025098,6.269753,0.158424,2.503948
KNeighborsRegressor,0.82385,0.914461,4.82261,5.397976,2.196044,2.323354
SVR,0.756606,0.782924,6.889564,5.183988,2.624798,2.276837
RandomForestRegressor,0.361199,0.83531,1.110806,3.901617,1.053948,1.975251


In [7]:
##### BACK TO THE INITIAL DATASET #####

df = pd.read_csv('vg_sales_clean.csv').drop('Unnamed: 0', axis=1)

# Target variable: quartile of global sales, labelled 1 to 4
df['y'] = pd.qcut(df['Global_Sales'], q=[0, 0.25, 0.5, 0.75, 1], labels=[1, 2, 3, 4])

# Drop the columns that are not used as features, then exact duplicate rows
# (the target column 'y' is part of the comparison at this point)
df = df.drop(['Name', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Year'], axis=1)
df = df.drop_duplicates()

# Features & target
X = df.drop('y', axis=1)
Y = df['y']

# Categorical vs numeric feature names (read by build_pipeline)
cat_var = X.select_dtypes('O').columns
num_var = X.drop(cat_var, axis=1).columns

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

models = [
    DecisionTreeClassifier(random_state=0),
    KNeighborsClassifier(),
    LogisticRegression(random_state=0, max_iter=1000),
    SVC(random_state=0),
    RandomForestClassifier(random_state=0),
]

scaler = MinMaxScaler()
encoder = OneHotEncoder(handle_unknown='ignore')

# Fit one pipeline per classifier and record its score on both splits
train, test = [], []
for element in models:
    model_pipeline = build_pipeline(element, scaler, encoder)
    model_pipeline.fit(X_train, y_train)
    train.append(model_pipeline.score(X_train, y_train))
    test.append(model_pipeline.score(X_test, y_test))

# Row labels derived from the estimators so they stay in sync with `models`
index = [type(element).__name__ for element in models]
results = pd.DataFrame({'train': train, 'test': test}, index=index)

results

Unnamed: 0,train,test
DecisionTreeClassifier,0.643816,0.203417
KNeighborsClassifier,0.4345,0.262002
LogisticRegression,0.447111,0.349878
SVC,0.49105,0.288039
RandomForestClassifier,0.643816,0.202604


In [8]:
# Regression on the initial dataset
df = pd.read_csv('vg_sales_clean.csv').drop('Unnamed: 0', axis=1)

# Drop the columns that are not used as features, then exact duplicate rows
df = df.drop(['Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Year'], axis=1)
df = df.drop_duplicates()

# Features & target
X = df.drop('Global_Sales', axis=1)
Y = df['Global_Sales']

# Categorical vs numeric feature names (read by build_pipeline)
cat_var = X.select_dtypes('O').columns
num_var = X.drop(cat_var, axis=1).columns

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

models = [DecisionTreeRegressor(random_state=0), KNeighborsRegressor(), SVR(), RandomForestRegressor(random_state=0)]
mae_train, mae_test = [], []
mse_train, mse_test = [], []
rmse_train, rmse_test = [], []

for element in models:
    reg_pipeline = build_pipeline(element, MinMaxScaler(), OneHotEncoder(handle_unknown='ignore'))
    reg_pipeline.fit(X_train, y_train)
    y_pred_train = reg_pipeline.predict(X_train)
    y_pred_test = reg_pipeline.predict(X_test)

    # MSE is computed once per split and RMSE is taken as its square root.
    # The `squared=` keyword of mean_squared_error is no longer accepted by
    # recent scikit-learn releases, so it is avoided here.
    split_mse_train = mean_squared_error(y_train, y_pred_train)
    split_mse_test = mean_squared_error(y_test, y_pred_test)

    # training split
    mae_train.append(mean_absolute_error(y_train, y_pred_train))
    mse_train.append(split_mse_train)
    rmse_train.append(split_mse_train ** 0.5)

    # test split
    mae_test.append(mean_absolute_error(y_test, y_pred_test))
    mse_test.append(split_mse_test)
    rmse_test.append(split_mse_test ** 0.5)

# Row labels derived from the estimators so they stay in sync with `models`
index = [type(element).__name__ for element in models]

results = pd.DataFrame({'MAE train': mae_train, 'MAE test': mae_test,
                        'MSE train': mse_train, 'MSE test': mse_test,
                        'RMSE train': rmse_train, 'RMSE test': rmse_test},
                       index=index)
results

Unnamed: 0,MAE train,MAE test,MSE train,MSE test,RMSE train,RMSE test
DecisionTreeRegressor,0.512273,0.55859,2.527467,1.616714,1.589801,1.271501
KNeighborsRegressor,0.634306,0.690354,3.056639,2.220442,1.748325,1.490115
SVR,0.471662,0.473901,3.084231,1.708993,1.756198,1.307285
RandomForestRegressor,0.520622,0.551986,2.535125,1.587955,1.592208,1.260141
