In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV

from imblearn.over_sampling import SMOTE

import sklearn
print(sklearn.__version__)


plt.rcParams["figure.figsize"] = (30, 10)
plt.rcParams["font.size"] = 18


1.2.0


In [2]:
steam_df = pd.read_csv("Data/steam/games-features-edit.csv")

print(steam_df['RecommendationCount'])

0        68991
1         2439
2         2319
3          888
4         2934
         ...  
12619        0
12620        0
12621        0
12622        0
12623        0
Name: RecommendationCount, Length: 12624, dtype: int64


In [3]:
# # extract the metacritic score, recommendation count, and name of the game
# steam_df = steam_df[['Metacritic', 'RecommendationCount', 'GenreIsNonGame']]
# # drop rows with missing values
# steam_df = steam_df.dropna()
# # drop rows with 0 recommendation count
# steam_df = steam_df[steam_df['RecommendationCount'] != 0]
# # drop rows with 0 metacritic score
# steam_df = steam_df[steam_df['Metacritic'] != 0]

# steam_df = steam_df[steam_df['GenreIsNonGame'] == False]


# #remove outliers from recommendation count
# steam_df = steam_df[steam_df['RecommendationCount'] < 100000]
# # steam_df = steam_df[steam_df['PriceInitial'] < 80]
# print(len(steam_df))

# plt.scatter(steam_df['Metacritic'], steam_df['RecommendationCount'])
# plt.xlabel('Metacritic Score')
# plt.ylabel('PriceInitial')
# # plt.xticks(fontsize=18)
# # plt.yticks(fontsize=18)
# plt.show()


In [4]:
def select_genres(genres_list, df):
	df = df[(df['Genre'] == 'GenreIs' + genres_list[0]) | (df['Genre'] == 'GenreIs' + genres_list[1])]
	return df

In [5]:
#make a new dataframe with only the ResponseName, ReleaseDate, Metacritic, RecommendationCount, IsFree, and PriceInitial
newDF = steam_df[['ResponseName', 'ReleaseDate', 'Metacritic', 'RecommendationCount', 'IsFree', 'PriceInitial']]

genres = ["GenreIsIndie","GenreIsAction","GenreIsAdventure","GenreIsCasual","GenreIsStrategy","GenreIsRPG","GenreIsSimulation","GenreIsEarlyAccess","GenreIsFreeToPlay","GenreIsSports","GenreIsRacing","GenreIsMassivelyMultiplayer"]
temp = []

rows_genres = []

for index, row in steam_df.iterrows():
	for genre in genres:
		if row[genre] == 1:
			temp.append(genre)
	if len(temp)==1:
		rows_genres.append(temp[0])
	temp = []

def init_df(genres_to_use, rows_genres, df):
	# print((rows_genres))
	genresDF = pd.DataFrame(rows_genres, columns = ['Genre'], dtype=str)
	genresDF = select_genres(genres_to_use, genresDF)

	df = pd.concat([df, genresDF], axis=1)
	df = df.dropna(subset=['Genre'])
	df = df[df['Metacritic'] != 0]
	# print(len(df['Genre']))

	return df



In [6]:

def plot_genres(df):
    #plot the genres and their recommendation counts
    fig, ax = plt.subplots(1,3)
    ax[0].scatter(df['Genre'], df['PriceInitial'])
    ax[0].set_xlabel('Genre')
    ax[0].set_ylabel('PriceInitial')
    ax[1].scatter(df['Genre'], df['RecommendationCount'])
    ax[1].set_xlabel('Genre')
    ax[1].set_ylabel('RecommendationCount')
    ax[2].scatter(df['Genre'], df['Metacritic'])
    ax[2].set_xlabel('Genre')
    ax[2].set_ylabel('Metacritic')
    plt.show()


In [7]:
#split the data into training and testing sets
def train_set(df):
    X = df[['Metacritic', 'RecommendationCount', 'PriceInitial']]
    y = df['Genre']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=42)

    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # create a k-fold cross validation iterator
    kfold = KFold(n_splits=10, random_state=42, shuffle=True)

    # define the grid of hyperparameters to search over
    param_grid = {
        "n_estimators": [100, 200, 300, 400, 500],
        "max_depth": [5, 10, 15, 20, 25],
        "min_samples_split": [2, 5, 10, 15, 20],
        "random_state": [42]
    }

    # create the random forest classifier
    clf = RandomForestClassifier()

    # create a grid search object to find the best hyperparameters
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=kfold, n_jobs=-1)

    # fit the grid search on the data
    grid_search.fit(X_train, y_train)


    # print the best hyperparameters found
    return grid_search.best_estimator_, X_test, y_test, X_train, y_train


In [8]:
def random_forest(genres_to_use, rows_genres, newDF):
    df = init_df(genres_to_use, rows_genres, newDF)
    # plot_genres(df)
    rfc, X_test, y_test, X_train, y_train = train_set(df)
    cf = RandomForestClassifier(n_estimators=rfc.n_estimators, random_state=rfc.random_state, max_depth=rfc.max_depth, min_samples_split=rfc.min_samples_split)
    cf.fit(X_train, y_train)
    y_pred = cf.predict(X_train)
    print(accuracy_score(y_train, y_pred))

    y_pred = cf.predict(X_test)
    print(accuracy_score(y_test, y_pred))


In [9]:
print("Indie vs Action")
random_forest(["Indie","Action"], rows_genres, newDF)

print("Indie vs Simulation")
random_forest(["Indie","Simulation"], rows_genres, newDF)

print("Indie vs Strategy")
random_forest(["Indie","Strategy"], rows_genres, newDF)

print("Action vs Simulation")
random_forest(["Action","Simulation"], rows_genres, newDF)

Indie vs Action
0.9725274725274725
0.5784313725490197
Indie vs Simulation
0.9854014598540146
0.5
Indie vs Strategy
0.9316770186335404
0.5342465753424658
Action vs Simulation
0.9869888475836431
0.6595744680851063
