In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

%config InlineBackend.figure_format = 'retina'



df=pd.read_csv(r'C:/Users/vpdrn/pp/Video_Games.csv',encoding='cp949')
df.head(10)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count
0,Wii Sports,Wii,2006.0,Sports,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0
1,Super Mario Bros.,NES,1985.0,Platform,29.08,3.58,6.81,0.77,40.24,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0
3,Wii Sports Resort,Wii,2009.0,Sports,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,11.27,8.89,10.22,1.0,31.37,,,,
5,Tetris,GB,1989.0,Puzzle,23.2,2.26,4.22,0.58,30.26,,,,
6,New Super Mario Bros.,DS,2006.0,Platform,11.28,9.14,6.5,2.88,29.8,89.0,65.0,8.5,431.0
7,Wii Play,Wii,2006.0,Misc,13.96,9.18,2.93,2.84,28.92,58.0,41.0,6.6,129.0
8,New Super Mario Bros. Wii,Wii,2009.0,Platform,14.44,6.94,4.7,2.24,28.32,87.0,80.0,8.4,594.0
9,Duck Hunt,NES,1984.0,Shooter,26.93,0.63,0.28,0.47,28.31,,,,


In [2]:
df = df.rename(columns={"Year_of_Release": "Year"})
df.shape

(16715, 13)

In [3]:
df.duplicated().sum()

0

In [4]:
df.isna().sum()

Name               2
Platform           0
Year             269
Genre              2
NA_Sales           0
EU_Sales           0
JP_Sales           0
Other_Sales        0
Global_Sales       0
Critic_Score    8578
Critic_Count    8578
User_Score      6701
User_Count      9125
dtype: int64

In [5]:
df = df.dropna().copy()

In [6]:
df_top_genre = df.groupby(['Genre'], as_index=False)['Global_Sales'].sum()
df_top_genre = df_top_genre.sort_values(['Global_Sales'], ascending=False)

In [7]:
le = LabelEncoder()

df['Name'] = le.fit_transform(df['Name'])
df['Platform'] = le.fit_transform(df['Platform'])
df['Genre'] = le.fit_transform(df['Genre'])

In [8]:
X = df.drop(columns=['Global_Sales' ,'JP_Sales', 'Other_Sales'])
y = df['Global_Sales']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
#선형회귀

lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [11]:
def model_score(lr, model_name='Model Name'):
    print(f'Score of {model_name} Model: {lr.score(X_test, y_test) * 100}%')
    
def mse(lr_pred, model_name='Model Name'):
    print('MSE: {} of {} model'.format(mean_squared_error(y_test, lr_pred), model_name))
    
def mae(lr_pred, model_name='Model Name'):
    print('MAE: {} of {} model'.format(mean_absolute_error(y_test, lr_pred), model_name))

In [12]:
print('선형회귀 평가지표')
model_score(lr, 'Linear Regression')
mse(lr_pred, 'Linear Regression')
mae(lr_pred, 'Linear Regression')

선형회귀 평가지표
Score of Linear Regression Model: 97.54239098549802%
MSE: 0.06767995892002626 of Linear Regression model
MAE: 0.10768666757951004 of Linear Regression model


In [13]:
#결정트리

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_pred = dtr.predict(X_test)

In [14]:
print('결정트리 평가지표')
model_score(dtr, 'Decision Tree Regressor')
mse(dtr_pred, 'Decision Tree Regressor')
mae(dtr_pred, 'Decision Tree Regressor')

결정트리 평가지표
Score of Decision Tree Regressor Model: 89.30331501754158%
MSE: 0.29457541696881806 of Decision Tree Regressor model
MAE: 0.1577012327773749 of Decision Tree Regressor model


In [15]:
#랜덤포레스트

rfg = RandomForestRegressor()
rfg.fit(X_train, y_train)
rfg_pred = rfg.predict(X_test)

In [16]:
print(f'Score of Random Forest Regressor Model: {rfg.score(X_test, y_test) * 100}%')
print(f'Mean Absolute Error: ' + str(mean_absolute_error(rfg_pred, y_test)) + ' - Random Forest Regressor')

Score of Random Forest Regressor Model: 96.32006457017623%
Mean Absolute Error: 0.10013633067440175 - Random Forest Regressor


In [17]:
print('랜덤포레스트 평가지표')
model_score(rfg, 'Random Forest Regressor')
mse(rfg_pred, 'Random Forest Regressor')
mae(rfg_pred, 'Random Forest Regressor')

랜덤포레스트 평가지표
Score of Random Forest Regressor Model: 96.32006457017623%
MSE: 0.10134153856417695 of Random Forest Regressor model
MAE: 0.10013633067440175 of Random Forest Regressor model
