In [14]:
import cx_Oracle
import pandas as pd
import numpy as np

db_username_read = open('db_username.txt')
db_username = db_username_read.read()
db_username_read.close()

db_password_read = open('db_password.txt')
db_password = db_password_read.read()
db_password_read.close()

db_host_read = open('db_host.txt')
db_host = db_host_read.read()
db_host_read.close()

connection = cx_Oracle.connect(user=db_username, password=db_password, dsn=db_host)
cursor = connection.cursor()

# Wyciągnięcie danych z bazy
query = """
SELECT gs.game_name, p.platform_name, pub.publisher_name, d.developer_name, g.genre_name, r.rating_name, gs.na_sales, gs.eu_sales, gs.jp_sales, gs.other_sales, gs.critic_score, gs.critic_count, gs.user_score, gs.user_count, gs.release_date
FROM game_sales gs
JOIN platforms p ON gs.platform_id = p.platform_id
JOIN publishers pub ON gs.publisher_id = pub.publisher_id
JOIN developers d ON gs.developer_id = d.developer_id
JOIN genres g ON gs.genre_id = g.genre_id
JOIN ratings r ON gs.rating_id = r.rating_id
"""
cursor.execute(query)

columns = ['Game_Name', 'Platform', 'Publisher', 'Developer', 'Genre', 'Rating', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Release_Date']
data = cursor.fetchall()
df = pd.DataFrame(data, columns=columns)

df.head()

Unnamed: 0,Game_Name,Platform,Publisher,Developer,Genre,Rating,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Release_Date
0,New Super Luigi U,WiiU,Nintendo,Nintendo,Platform,E,1250000,620000,180000,180000,77,59,7.9,288,2013
1,Tetris DS,DS,Nintendo,Nintendo,Puzzle,E,630000,50000,1350000,80000,84,56,8.7,44,2006
2,Classic NES Series: Super Mario Bros.,GBA,Nintendo,Nintendo,Platform,E,0,0,1390000,30000,84,14,8.6,44,2004
3,Pikmin 2,GC,Nintendo,Nintendo,Strategy,E,480000,130000,560000,30000,90,54,9.1,137,2004
4,Mario vs. Donkey Kong,GBA,Nintendo,Nintendo,Puzzle,E,680000,250000,210000,20000,81,43,8.0,31,2004


In [15]:
df = df[(df['Platform'] == 'PS3') | (df['Platform'] == 'PS4') | (df['Platform'] == 'X360') | (df['Platform'] == 'XOne') | (df['Platform'] == 'Wii') | (df['Platform'] == 'WiiU') | (df['Platform'] == 'PC')]
df = df[df['Release_Date'] > 2003]
df['Total_Sales'] = df['NA_Sales'] + df['EU_Sales'] + df['JP_Sales'] + df['Other_Sales']
df['NA_Sales'] = np.log1p(df['NA_Sales'])
df['EU_Sales'] = np.log1p(df['EU_Sales'])
df['JP_Sales'] = np.log1p(df['JP_Sales'])
df['Other_Sales'] = np.log1p(df['Other_Sales'])
df['Total_Sales'] = np.log1p(df['Total_Sales'])

In [16]:
devs = len(df['Developer'].unique())
pubs = len(df['Publisher'].unique())
rats = len(df['Rating'].unique())
gens = len(df['Genre'].unique())
plats = len(df['Platform'].unique())
print('Developers: ', devs,'\nPublishers: ', pubs,'\nRatings: ', rats,'\nGenres: ', gens,'\nPlatforms: ', plats)

Developers:  762 
Publishers:  165 
Ratings:  5 
Genres:  12 
Platforms:  7


In [17]:
df = pd.get_dummies(data=df, columns=['Platform', 'Genre', 'Rating'])

In [18]:
X = df[['Release_Date', 'Critic_Score', 'Critic_Count',
       'User_Score', 'User_Count', 'Platform_PC', 'Platform_PS3',
       'Platform_PS4', 'Platform_Wii', 'Platform_WiiU', 'Platform_X360',
       'Platform_XOne', 'Genre_Action', 'Genre_Adventure', 'Genre_Fighting',
       'Genre_Misc', 'Genre_Platform', 'Genre_Puzzle', 'Genre_Racing',
       'Genre_Role-Playing', 'Genre_Shooter', 'Genre_Simulation',
       'Genre_Sports', 'Genre_Strategy', 'Rating_E', 'Rating_E10+', 'Rating_M',
       'Rating_RP', 'Rating_T', 'NA_Sales']]

y = df[['Total_Sales']]

print(X.shape)
print(y.shape)

(3174, 30)
(3174, 1)


In [19]:
y = np.ravel(y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

grid_search_lr = GridSearchCV(LinearRegression(), {}, scoring='neg_mean_squared_error',  cv=5)
grid_search_lr.fit(X_train, Y_train)

# Calculate the RMSE for the best cross-validation score
lr_best_cross_val_score = (np.sqrt(-grid_search_lr.best_score_))
print("Best cross-validation score: {:.2f}".format(np.expm1(lr_best_cross_val_score)))
y_pred = grid_search_lr.predict(X_test)

# Convert predictions and actual values back to original scale
y_pred_original_scale = np.expm1(y_pred)
Y_test_original_scale = np.expm1(Y_test)

# Obliczenie RMSE dla zestawu testowego
test_rmse = np.sqrt(mean_squared_error(Y_test_original_scale, y_pred_original_scale))
print("Test RMSE score: {:.2f}".format(test_rmse))

# Obliczenie R^2 dla zestawu testowego
r2_score_value = r2_score(Y_test_original_scale, y_pred_original_scale)
print("Test R^2 score: {:.2f}".format(r2_score_value))

Best cross-validation score: 1.29
Test RMSE score: 1695758.13
Test R^2 score: 0.22


In [21]:
ind = np.random.choice(Y_test_original_scale.shape[0], size=10, replace=False)

predicted_Total_Sales = pd.DataFrame({
    'Actual Total_Sales': Y_test_original_scale[ind].astype('int'),
    'Predicted Total_Sales': y_pred_original_scale[ind].astype('int')
})

predicted_Total_Sales['Difference'] = predicted_Total_Sales['Actual Total_Sales'] - predicted_Total_Sales['Predicted Total_Sales']
predicted_Total_Sales['Difference_in_%'] = np.abs(predicted_Total_Sales['Difference']) / ((predicted_Total_Sales['Actual Total_Sales'] + predicted_Total_Sales['Predicted Total_Sales']) / 2) * 100

print(predicted_Total_Sales)

   Actual Total_Sales  Predicted Total_Sales  Difference  Difference_in_%
0             3009999                 833350     2176649       113.268350
1               80000                 172172      -92172        73.102486
2              100000                 226324     -126324        77.422439
3              139999                 136312        3687         2.668732
4              509999                 623194     -113195        19.978062
5              219999                 172320       47679        24.306241
6              379999                 242220      137779        44.286336
7              339999                 174208      165791        64.484147
8             6320000                5347492      972508        16.670386
9              909999                 633538      276461        35.821752


In [22]:
param_grid_rf = {'n_estimators': [3, 10, 30, 50, 70], 'max_features': [2,4,6,8,10,12], 'max_depth': [2, 3, 5, 7, 9]}

In [23]:
from sklearn.ensemble import RandomForestRegressor

grid_search_rf = GridSearchCV(RandomForestRegressor(), param_grid_rf, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
grid_search_rf.fit(X_train, Y_train)

# Calculate the RMSE for the best cross-validation score
rf_best_cross_val_score = np.sqrt(-grid_search_rf.best_score_)
print("Cross-validated RMSE score: {:.2f}".format(np.expm1(rf_best_cross_val_score)))

y_pred = grid_search_rf.predict(X_test)

# Convert predictions and actual values back to original scale
y_pred_original_scale = np.expm1(y_pred)
Y_test_original_scale = np.expm1(Y_test)

# Calculate the RMSE on the original scale
test_rmse = np.sqrt(mean_squared_error(Y_test_original_scale, y_pred_original_scale))
print("Test RMSE score: {:.2f}".format(test_rmse))

# Calculating R^2 score for the test set
r2_score_value = r2_score(Y_test_original_scale, y_pred_original_scale)
print("Test R^2 score: {:.2f}".format(r2_score_value))

Cross-validated RMSE score: 0.68
Test RMSE score: 941486.40
Test R^2 score: 0.76


In [24]:
predicted_Total_Sales = pd.DataFrame({
    'Actual Total_Sales': Y_test_original_scale[ind].astype('int'),
    'Predicted Total_Sales': y_pred_original_scale[ind].astype('int')
})

predicted_Total_Sales['Difference'] = predicted_Total_Sales['Actual Total_Sales'] - predicted_Total_Sales['Predicted Total_Sales']
predicted_Total_Sales['Difference_in_%'] = np.abs(predicted_Total_Sales['Difference']) / ((predicted_Total_Sales['Actual Total_Sales'] + predicted_Total_Sales['Predicted Total_Sales']) / 2) * 100

print(predicted_Total_Sales)

   Actual Total_Sales  Predicted Total_Sales  Difference  Difference_in_%
0             3009999                1819760     1190239        49.287718
1               80000                  55716       24284        35.786495
2              100000                 133443      -33443        28.651962
3              139999                 132717        7282         5.340354
4              509999                 653935     -143936        24.732674
5              219999                 176428       43571        21.981853
6              379999                 328125       51874        14.651106
7              339999                 377802      -37803        10.533003
8             6320000                5950682      369318         6.019519
9              909999                 974667      -64668         6.862542


In [25]:
param_grid_gbr = {'n_estimators': [200, 225, 250, 275], 'max_features': [6, 8, 10, 12], 'max_depth': [5, 7, 9]}

In [26]:
from sklearn.ensemble import GradientBoostingRegressor

grid_search_gbr = GridSearchCV(GradientBoostingRegressor(), param_grid_gbr, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
grid_search_gbr.fit(X_train, Y_train)

# Calculate the RMSE for the best cross-validation score
gbr_best_cross_val_score = np.sqrt(-grid_search_gbr.best_score_)
print("Cross-validated RMSE score: {:.2f}".format(np.expm1(gbr_best_cross_val_score)))

y_pred = grid_search_gbr.predict(X_test)

# Convert predictions and actual values back to original scale
y_pred_original_scale = np.expm1(y_pred)
Y_test_original_scale = np.expm1(Y_test)

# Calculate the RMSE on the original scale
test_rmse = np.sqrt(mean_squared_error(Y_test_original_scale, y_pred_original_scale))
print("Test RMSE score: {:.2f}".format(test_rmse))

# Calculating R^2 score for the test set
r2_score_value = r2_score(Y_test_original_scale, y_pred_original_scale)
print("Test R^2 score: {:.2f}".format(r2_score_value))

Cross-validated RMSE score: 0.64
Test RMSE score: 847463.50
Test R^2 score: 0.81


In [27]:
predicted_Total_Sales = pd.DataFrame({
    'Actual Total_Sales': Y_test_original_scale[ind].astype('int'),
    'Predicted Total_Sales': y_pred_original_scale[ind].astype('int')
})

predicted_Total_Sales['Difference'] = predicted_Total_Sales['Actual Total_Sales'] - predicted_Total_Sales['Predicted Total_Sales']
predicted_Total_Sales['Difference_in_%'] = np.abs(predicted_Total_Sales['Difference']) / ((predicted_Total_Sales['Actual Total_Sales'] + predicted_Total_Sales['Predicted Total_Sales']) / 2) * 100

print(predicted_Total_Sales)

   Actual Total_Sales  Predicted Total_Sales  Difference  Difference_in_%
0             3009999                2241092      768907        29.285609
1               80000                  61943       18057        25.442607
2              100000                 116428      -16428        15.181030
3              139999                 149533       -9534         6.585801
4              509999                 696077     -186078        30.856762
5              219999                 142881       77118        42.503307
6              379999                 493799     -113800        26.047210
7              339999                 335217        4782         1.416436
8             6320000                6466061     -146061         2.284691
9              909999                1143916     -233917        22.777671


In [28]:
X = df[['Release_Date', 'Critic_Score', 'Critic_Count',
       'User_Score', 'User_Count', 'Platform_PC', 'Platform_PS3',
       'Platform_PS4', 'Platform_Wii', 'Platform_WiiU', 'Platform_X360',
       'Platform_XOne', 'Genre_Action', 'Genre_Adventure', 'Genre_Fighting',
       'Genre_Misc', 'Genre_Platform', 'Genre_Puzzle', 'Genre_Racing',
       'Genre_Role-Playing', 'Genre_Shooter', 'Genre_Simulation',
       'Genre_Sports', 'Genre_Strategy', 'Rating_E', 'Rating_E10+', 'Rating_M',
       'Rating_RP', 'Rating_T']]

y = df[['Total_Sales']]

In [29]:
y = np.ravel(y)

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
grid_search_gbr = GridSearchCV(GradientBoostingRegressor(), param_grid_gbr, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
grid_search_gbr.fit(X_train, Y_train)

# Calculate the RMSE for the best cross-validation score
gbr_best_cross_val_score = np.sqrt(-grid_search_gbr.best_score_)
print("Cross-validated RMSE score: {:.2f}".format(np.expm1(gbr_best_cross_val_score)))

y_pred = grid_search_gbr.predict(X_test)

# Convert predictions and actual values back to original scale
y_pred_original_scale = np.expm1(y_pred)
Y_test_original_scale = np.expm1(Y_test)

# Calculate the RMSE on the original scale
test_rmse = np.sqrt(mean_squared_error(Y_test_original_scale, y_pred_original_scale))
print("Test RMSE score: {:.2f}".format(test_rmse))

# Calculating R^2 score for the test set
r2_score_value = r2_score(Y_test_original_scale, y_pred_original_scale)
print("Test R^2 score: {:.2f}".format(r2_score_value))

Cross-validated RMSE score: 1.32
Test RMSE score: 1395096.62
Test R^2 score: 0.48


In [31]:
predicted_Total_Sales = pd.DataFrame({
    'Actual Total_Sales': Y_test_original_scale[ind].astype('int'),
    'Predicted Total_Sales': y_pred_original_scale[ind].astype('int')
})

predicted_Total_Sales['Difference'] = predicted_Total_Sales['Actual Total_Sales'] - predicted_Total_Sales['Predicted Total_Sales']
predicted_Total_Sales['Difference_in_%'] = np.abs(predicted_Total_Sales['Difference']) / ((predicted_Total_Sales['Actual Total_Sales'] + predicted_Total_Sales['Predicted Total_Sales']) / 2) * 100

print(predicted_Total_Sales)

   Actual Total_Sales  Predicted Total_Sales  Difference  Difference_in_%
0             3009999                2394535      615464        22.775840
1               80000                  65191       14809        20.399336
2              100000                 202867     -102867        67.928827
3              139999                 189977      -49978        30.291900
4              509999                 452034       57965        12.050522
5              219999                 292892      -72893        28.424363
6              379999                 313132       66867        19.294188
7              339999                 116093      223906        98.184577
8             6320000                6906486     -586486         8.868357
9              909999                1189407     -279408        26.617815


In [32]:
X = df[['Release_Date', 'Critic_Score', 'Critic_Count',
       'User_Score', 'User_Count', 'Platform_PC', 'Platform_PS3',
       'Platform_PS4', 'Platform_Wii', 'Platform_WiiU', 'Platform_X360',
       'Platform_XOne', 'Genre_Action', 'Genre_Adventure', 'Genre_Fighting',
       'Genre_Misc', 'Genre_Platform', 'Genre_Puzzle', 'Genre_Racing',
       'Genre_Role-Playing', 'Genre_Shooter', 'Genre_Simulation',
       'Genre_Sports', 'Genre_Strategy', 'Rating_E', 'Rating_E10+', 'Rating_M',
       'Rating_RP', 'Rating_T', 'NA_Sales']]

y = df[['EU_Sales']]

In [33]:
y = np.ravel(y)

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
grid_search_gbr = GridSearchCV(GradientBoostingRegressor(), param_grid_gbr, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
grid_search_gbr.fit(X_train, Y_train)

# Calculate the RMSE for the best cross-validation score
gbr_best_cross_val_score = np.sqrt(-grid_search_gbr.best_score_)
print("Cross-validated RMSE score: {:.2f}".format(np.expm1(gbr_best_cross_val_score)))

y_pred = grid_search_gbr.predict(X_test)

# Convert predictions and actual values back to original scale
y_pred_original_scale = np.expm1(y_pred)
Y_test_original_scale = np.expm1(Y_test)

# Calculate the RMSE on the original scale
test_rmse = np.sqrt(mean_squared_error(Y_test_original_scale, y_pred_original_scale))
print("Test RMSE score: {:.2f}".format(test_rmse))

# Calculating R^2 score for the test set
r2_score_value = r2_score(Y_test_original_scale, y_pred_original_scale)
print("Test R^2 score: {:.2f}".format(r2_score_value))

Cross-validated RMSE score: 21.39
Test RMSE score: 485550.16
Test R^2 score: 0.55


In [35]:
ind = np.random.choice(Y_test_original_scale.shape[0], size=10, replace=False)

predicted_EU_Sales = pd.DataFrame({
    'Actual EU_Sales': Y_test_original_scale[ind].astype('int'),
    'Predicted EU_Sales': y_pred_original_scale[ind].astype('int')
})

predicted_EU_Sales['Difference'] = predicted_EU_Sales['Actual EU_Sales'] - predicted_EU_Sales['Predicted EU_Sales']
predicted_EU_Sales['Difference_in_%'] = np.abs(predicted_EU_Sales['Difference']) / ((predicted_EU_Sales['Actual EU_Sales'] + predicted_EU_Sales['Predicted EU_Sales']) / 2) * 100

print(predicted_EU_Sales)

   Actual EU_Sales  Predicted EU_Sales  Difference  Difference_in_%
0           189999               35979      154020       136.314154
1           100000              128678      -28678        25.081556
2           830000              258172      571828       105.098826
3          1069999              899036      170963        17.365156
4           790000              801689      -11689         1.468754
5            40000               90426      -50426        77.325073
6           150000               18790      131210       155.471296
7          1140000              120402     1019598       161.789334
8            10000               21756      -11756        74.039552
9           230000              102826      127174        76.420712
