In [8]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error

##### CLASSIFICATION #####

# Load the raw data set.
# NOTE(review): absolute, machine-specific path; the script fails wherever
# this file does not exist.
df = pd.read_csv('/home/linux/Documents/Test_App/csv_file.csv')

# One-hot encode the categorical variables and discretise the target variable
publisher = pd.get_dummies(df['Publisher'], prefix = 'publisher')
platform = pd.get_dummies(df['Platform'], prefix = 'platform')
genre = pd.get_dummies(df['Genre'], prefix = 'genre')

# Classification target: Global_Sales cut at its quartiles, labelled 1 to 4
df['y'] = pd.qcut(df['Global_Sales'], q = [0, 0.25, 0.5, 0.75, 1], labels = [1,2,3,4])

# Append the one-hot columns to the DataFrame
df = df.join(publisher)
df = df.join(platform)
df = df.join(genre)

# Put all ratings on the same scale (everything is brought to a mark out of 10)
df['Test_MC'] = df['Test_MC'] / 10
df['Test_JV'] = df['Test_JV'] / 2
df['Players_JV'] = df['Players_JV'] / 2

# Drop the columns that are not needed, then remove duplicated rows
df = df.drop(['Name','Publisher','Platform','Genre', 'NA_Sales','EU_Sales','JP_Sales','Other_Sales','Year'], axis = 1)
df = df.drop_duplicates()

# Features & targets (Y_clf for classification, Y_reg for regression)
X = df.drop(['y','Global_Sales'], axis = 1)
Y_clf = df['y']
Y_reg = df['Global_Sales']

# Scale every feature with MinMaxScaler.
# drop_duplicates() can leave gaps in the row index, and fit_transform()
# returns a bare array, so the scaled frame explicitly reuses X.index; with a
# fresh default index X_sc would no longer line up label-wise with
# Y_clf / Y_reg.
# NOTE(review): `sc` is fitted here but nothing visible in this script saves
# it; presumably whatever reloads the pickled models must scale its inputs
# the same way -- confirm.
sc = MinMaxScaler()
X_sc = pd.DataFrame(sc.fit_transform(X), columns = X.columns, index = X.index)

# DECISION TREE and KNN CLASSIFICATION
# Both classifiers go through the same steps, in this order: fit on the
# scaled features, print the score, pickle the fitted model to its own file.
dtc = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()

for classifier, pickle_path in ((dtc, 'dtc.pkl'), (knn_clf, 'knn_clf.pkl')):
    classifier.fit(X_sc, Y_clf)
    # Score is measured on the very data the model was fitted on
    print(classifier.score(X_sc, Y_clf))

    # Save the model as a pickle in a file
    with open(pickle_path, 'wb') as model_file:
        pickle.dump(classifier, model_file)

##### REGRESSION #####

# DECISION TREE REGRESSION
# Fit on the scaled features, then report MAE and R2 measured on that same
# training data.
dtr = DecisionTreeRegressor().fit(X_sc, Y_reg)
y_pred_dtr_train = dtr.predict(X_sc)
mae_dtr = mean_absolute_error(y_true=Y_reg, y_pred=y_pred_dtr_train)
print("mae dtr", mae_dtr)
r2_dtr = dtr.score(X_sc, Y_reg)
print("R2 dtr", r2_dtr)

# Persist the fitted tree as a pickle file
with open('dtr.pkl', mode='wb') as dtr_file:
    pickle.dump(dtr, dtr_file)

# KNN REGRESSION
# Same procedure as above with a k-nearest-neighbours regressor.
knn_reg = KNeighborsRegressor().fit(X_sc, Y_reg)
y_pred_knn_train = knn_reg.predict(X_sc)
mae_knn = mean_absolute_error(y_true=Y_reg, y_pred=y_pred_knn_train)
print("mae knn", mae_knn)
r2_knn = knn_reg.score(X_sc, Y_reg)
print("R2 knn", r2_knn)

# Persist the fitted KNN regressor as a pickle file
with open('knn_reg.pkl', mode='wb') as knn_reg_file:
    pickle.dump(knn_reg, knn_reg_file)

# Write the scaled features and both targets to CSV, without the row index
for frame, csv_path in ((X_sc, 'X.csv'), (Y_clf, 'clf_Y.csv'), (Y_reg, 'reg_Y.csv')):
    frame.to_csv(csv_path, index=False)

0.9946691752900596
0.6334274067105676
mae dtr 0.008949513954217623
R2 dtr 0.997229203778738
mae knn 0.7813370962684227
R2 knn 0.46110769417773645
