In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [72]:
df = pd.read_csv('C:/Datasets/Artem/Videogame_Sales.csv', encoding = 'cp1251')

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17416 entries, 0 to 17415
Data columns (total 15 columns):
Name               17416 non-null object
Platform           17416 non-null object
Year_of_Release    17408 non-null float64
Genre              17416 non-null object
Publisher          17415 non-null object
NA_Sales           17416 non-null float64
EU_Sales           17416 non-null float64
JP_Sales           17416 non-null float64
Other_Sales        17416 non-null float64
Global_Sales       17416 non-null float64
Critic_Score       8336 non-null float64
Critic_Count       8336 non-null float64
User_Score         7798 non-null float64
User_Count         7798 non-null float64
Rating             10252 non-null object
dtypes: float64(10), object(5)
memory usage: 2.0+ MB


In [74]:
# Настройки отображения
pd.set_option('display.max_columns', 50) 

In [75]:
np.sum(df.duplicated()) # Дублей не обнаружено

0

In [76]:
df.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.54,76.0,51.0,8.0,324.0,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.8,3.79,3.29,35.57,82.0,73.0,8.3,712.0,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.95,3.28,2.95,32.78,80.0,73.0,8.0,193.0,E
4,Pokemon Red/Pokemon Blue,G,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,


In [77]:
# Заменим пропуск в предикторе Year_of_Release медианным значением
Year_median = df['Year_of_Release'].median()
df['Year_of_Release'].fillna(Year_median, inplace = True)

In [78]:
#Заменим пропуск в предикторе Publisher медианным значением
df['Publisher'].fillna('Electronic Arts', inplace = True)

In [79]:
df['Rating'].value_counts()

E       4120
T       3045
M       1599
E10+    1473
EC         8
RP         3
K-A        3
AO         1
Name: Rating, dtype: int64

In [80]:
# Поскольку число пропусков велико, создадим для них отдельную категорию. 
# Также уберем редкие категории EC, RP, K-A
df['Rating'].fillna('Other', inplace = True)
df['Rating'].replace(['EC', 'RP', 'K-A', 'AO'], 'Other', inplace = True )

In [81]:
#Также уберем редкие категории в предикторе Platform
for i in df['Platform'].unique():
    if df[df['Platform'] == i]['Platform'].count() < 25:
        df['Platform'].replace(i, 'DS', inplace = True)

In [82]:
# Признак Year_of_Release нужно сделать категориальным, изменив тип данных на str. Также можно объединить редкие категории
df['Year_of_Release'] = round(df['Year_of_Release'], 0)
df['Year_of_Release'] = df['Year_of_Release'].astype('str', copy = False)

for i in df['Year_of_Release'].unique():
    if df[df['Year_of_Release'] == i]['Year_of_Release'].count() < 45:
        df['Year_of_Release'].replace(i, '2009.0', inplace = True)

In [83]:
# Признак Publisher имеет много редких категорий. Объединение редких категорий по порогу привело к его увеличению почти в 3 раза.
# Возможно, нужно было несколько уменьшить порог и сделать mean_target для большего числа категорий.
for i in df['Publisher'].unique():
    if df[df['Publisher'] == i]['Publisher'].count() < 35:
        df['Publisher'].replace(i, 'Electronic Arts', inplace = True)

In [84]:
# Заменяем с помощью frequency encoding категории в предикторе Name
for i in df['Name'].unique():
    df['Name'].replace(i, df[df['Name'] == i]['Name'].count(), inplace = True)

In [85]:
# Применяем mean_target кодирование для признака Publisher
for i in df['Publisher'].unique():
    df['Publisher'].replace(i, df.loc[(df['Publisher'] == i) , 'NA_Sales'].mean(), inplace = True)

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17416 entries, 0 to 17415
Data columns (total 15 columns):
Name               17416 non-null int64
Platform           17416 non-null object
Year_of_Release    17416 non-null object
Genre              17416 non-null object
Publisher          17416 non-null float64
NA_Sales           17416 non-null float64
EU_Sales           17416 non-null float64
JP_Sales           17416 non-null float64
Other_Sales        17416 non-null float64
Global_Sales       17416 non-null float64
Critic_Score       8336 non-null float64
Critic_Count       8336 non-null float64
User_Score         7798 non-null float64
User_Count         7798 non-null float64
Rating             17416 non-null object
dtypes: float64(10), int64(1), object(4)
memory usage: 2.0+ MB


In [87]:
train = df.sample(frac = 0.7, random_state = 123)
test = df.drop(train.index)
from sklearn.utils import shuffle
test = shuffle(test)

In [88]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12191 entries, 11067 to 3179
Data columns (total 15 columns):
Name               12191 non-null int64
Platform           12191 non-null object
Year_of_Release    12191 non-null object
Genre              12191 non-null object
Publisher          12191 non-null float64
NA_Sales           12191 non-null float64
EU_Sales           12191 non-null float64
JP_Sales           12191 non-null float64
Other_Sales        12191 non-null float64
Global_Sales       12191 non-null float64
Critic_Score       5813 non-null float64
Critic_Count       5813 non-null float64
User_Score         5477 non-null float64
User_Count         5477 non-null float64
Rating             12191 non-null object
dtypes: float64(10), int64(1), object(4)
memory usage: 1.5+ MB


In [89]:
#Пропуски в предикторах Critic_Score, Critic_Count, User_Score, User_Count скорее всего сделаны не случайно. 
# Сначала я предположил, что игра еще не вышло, однако дата релиза у всех различается
# Решил заменить средними значениями в разрезе категории Genre
def inputation(train, test):
    for i in train['Genre'].unique():
        train.loc[(train['Genre'] == i) & train['Critic_Score'].isnull(), 'Critic_Score'] = round(train['Critic_Score'].mean(), 0)
        train.loc[(train['Genre'] == i) & train['Critic_Count'].isnull(), 'Critic_Count'] = round(train['Critic_Count'].mean(), 0)
        train.loc[(train['Genre'] == i) & train['User_Score'].isnull(), 'User_Score'] = round(train['User_Score'].mean(), 0)
        train.loc[(train['Genre'] == i) & train['User_Count'].isnull(), 'User_Count'] = round(train['User_Count'].mean(), 0)
        test.loc[(test['Genre'] == i) & test['Critic_Score'].isnull(), 'Critic_Score'] = round(test['Critic_Score'].mean(), 0)
        test.loc[(test['Genre'] == i) & test['Critic_Count'].isnull(), 'Critic_Count'] = round(test['Critic_Count'].mean(), 0)
        test.loc[(test['Genre'] == i) & test['User_Score'].isnull(), 'User_Score'] = round(test['User_Score'].mean(), 0)
        test.loc[(test['Genre'] == i) & test['User_Count'].isnull(), 'User_Count'] = round(test['User_Count'].mean(), 0)

In [90]:
inputation(train, test)

In [91]:
train.nunique()

Name                10
Platform            24
Year_of_Release     25
Genre               12
Publisher           63
NA_Sales           357
EU_Sales           269
JP_Sales           203
Other_Sales        137
Global_Sales       553
Critic_Score        80
Critic_Count       100
User_Score          94
User_Count         737
Rating               5
dtype: int64

In [92]:
y_train = train['NA_Sales'] 
y_test = test['NA_Sales']
X_train = train.drop('NA_Sales', axis = 1)
X_test = test.drop('NA_Sales', axis = 1)

In [94]:
# Выполняем дамми-кодирование
X_train_dum = pd.get_dummies(X_train)
X_test_dum = pd.get_dummies(X_test)

In [96]:
X_test_dum

Unnamed: 0,Name,Publisher,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Platform_2600,Platform_3DS,Platform_DC,Platform_DS,Platform_G,Platform_GBA,Platform_GC,Platform_GEN,Platform_N64,Platform_NES,Platform_PC,Platform_PS,Platform_PS2,Platform_PS3,Platform_PS4,...,Year_of_Release_2009.0,Year_of_Release_2010.0,Year_of_Release_2011.0,Year_of_Release_2012.0,Year_of_Release_2013.0,Year_of_Release_2014.0,Year_of_Release_2015.0,Year_of_Release_2016.0,Genre_Action,Genre_Adventure,Genre_Fighting,Genre_Misc,Genre_Platform,Genre_Puzzle,Genre_Racing,Genre_Role-Playing,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy,Rating_E,Rating_E10+,Rating_M,Rating_Other,Rating_T
3912,2,0.108370,0.00,0.51,0.00,0.51,69.0,26.0,7.0,176.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1978,1,0.382102,0.11,0.00,0.13,1.05,81.0,31.0,9.0,36.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
15843,1,0.191620,0.01,0.00,0.00,0.02,69.0,26.0,7.0,176.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
16041,8,0.435473,0.01,0.00,0.00,0.02,86.0,20.0,8.2,65.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4036,1,0.165273,0.15,0.05,0.04,0.49,55.0,57.0,5.5,31.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
8642,4,0.191620,0.03,0.00,0.02,0.16,61.0,23.0,5.2,5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
7152,2,0.073693,0.00,0.23,0.00,0.23,69.0,26.0,7.0,176.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
13488,1,0.382102,0.00,0.05,0.00,0.05,69.0,26.0,7.0,176.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
6031,2,0.520600,0.11,0.00,0.04,0.29,58.0,5.0,7.3,8.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
9139,1,0.191620,0.00,0.00,0.01,0.14,65.0,12.0,7.0,176.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0


In [115]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators = 500, min_samples_split = 2, min_samples_leaf = 1, random_state=152)
forest.fit(X_train_dum , y_train)
print('Правильность на обучающей выборке: {:.3f}'.format(forest.score(X_train_dum , y_train)))
print('Правильность на контрольной выборке: {:.3f}'.format(forest.score(X_test_dum, y_test)))
# По сути правильность тут не самая полезная характеристика

Правильность на обучающей выборке: 0.976
Правильность на контрольной выборке: 0.907


In [116]:
# Считаем MSE
from sklearn.metrics import mean_squared_error
print('MSE на обучающей выборке:', mean_squared_error(y_train, forest.predict(X_train_dum)))
print('MSE на тестовой выборке:', mean_squared_error(y_test, forest.predict(X_test_dum)))

MSE на обучающей выборке: 0.015371778225657435
MSE на тестовой выборке: 0.05859547186670996


In [117]:
y_test.mean()

0.2513167464114802

In [118]:
#попробуем подобрать параметры алгоритма на кросс-валидации
from sklearn.model_selection import cross_val_score, KFold
Kfold = KFold(n_splits=10)

In [119]:
X = pd.concat([X_train_dum, X_test_dum], axis = 0)
y = pd.concat([y_train, y_test], axis = 0)

In [120]:
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
cross_val = cross_val_score(forest, X,y, scoring = 'neg_mean_squared_error', cv = kfold)
print('Среднее значение MSE на cv:', cross_val.mean())
print('MSE для каждого фолда:', cross_val)

Среднее значение MSE на cv: -0.07302467728853965
MSE для каждого фолда: [-0.0347936  -0.01942258 -0.01032719 -0.04248516 -0.00640577 -0.1043743
 -0.04003829 -0.39233515 -0.06744423 -0.01262052]


In [114]:
grid_params = {'max_features': [14, 28, 56, 76],
              'max_depth': [10, 15, 20]} # Сетка параметров

In [124]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(forest, grid_params, scoring = 'neg_mean_squared_error', cv = Kfold, return_train_score=True)
grid_search.fit(X_train_dum, y_train)

ValueError: could not convert string to float: 'E'

In [131]:
test_score = mean_squared_error(y_test, grid_search.predict(X_test_dum))
print('MSE на тестовой выборке: {:.2f}'.format(test_score))
print('Наилучшие значения параметров: {}'.format(grid_search.best_params_))
print('Наилучшее значение MSE: {:.2f}'.format(grid_search.best_score_))
print('Оптимальная модель:\n{}'.format(grid_search.best_estimator_))

MSE на тестовой выборке: 0.06
Наилучшие значения параметров: {'max_depth': 20, 'max_features': 76}
Наилучшее значение MSE: -0.10
Оптимальная модель:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=76, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=152,
           verbose=0, warm_start=False)


In [133]:
forest.predict(X_test_dum)

array([0.0000e+00, 8.5806e-01, 4.8000e-04, ..., 9.2260e-02, 4.0020e-02,
       7.7800e-03])