In [None]:
import json
import re
from datetime import date
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from sqlalchemy import create_engine

from config import db_password

%matplotlib notebook

In [None]:
# Raw inputs: the sales table and the scraped ratings table.
GAMES_CSV = "../Resources/vgsales.csv.zip"
RATINGS_CSV = "../Resources/final_scrape5.csv"

games_df = pd.read_csv(GAMES_CSV)
ratings_df = pd.read_csv(RATINGS_CSV)

In [None]:
# One box per platform showing the spread of Global_Sales.
sns.boxplot(x="Platform", y="Global_Sales", data=games_df)

In [None]:
# Align the ratings key column with games_df, which calls it "Name".
ratings_df = ratings_df.rename(columns={"game": "Name"})

In [None]:
# Drop the leftover "Unnamed: 0" column read from the CSV.
# errors="ignore" keeps the cell re-runnable: the original in-place drop raised
# KeyError on a second execution because the column was already gone.
ratings_df = ratings_df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Display the ratings frame to check its current columns.
ratings_df

In [None]:
# Spot check: every sales row for one well-known title.
games_df[games_df["Name"].eq("Grand Theft Auto V")]

In [None]:
# Print the platform labels used by each frame (most frequent first) so the two
# vocabularies can be compared before merging.
for frame in (games_df, ratings_df):
    print(frame["Platform"].value_counts().index.tolist())

In [None]:
# Ratings platform slugs -> the platform codes used in games_df, so the two
# frames can be merged on ["Name", "Platform"].
platform_conversions = {
    'playstation-2': "PS2",
    'xbox-360': "X360",
    'playstation-3': "PS3",
    'ds': "DS",
    'wii': "Wii",
    'xbox': "XB",
    'psp': "PSP",
    'gamecube': "GC",
    'game-boy-advance': "GBA",
    'playstation-4': "PS4",
    'playstation': "PS",
    'playstation-vita': "PSV",
    'wii-u': "WiiU",
    'nintendo-64': "N64",
    'dreamcast': "DC"
}

# Fall back to the incoming value when a label is not in the table. With the
# original `platform_conversions[x]` lookup, any unexpected slug raised
# KeyError, and so did simply re-running this cell (the column then already
# holds converted codes such as "PS2", which are not keys of the dict).
ratings_df["Platform"] = ratings_df["Platform"].map(
    lambda slug: platform_conversions.get(slug, slug)
)
ratings_df

In [None]:
# Pull the digits that sit on their own line inside the scraped metascore text.
# - Raw string: "\d" in a normal string literal is an invalid escape sequence
#   (a warning today, slated to become an error); r"\n" is still matched as a
#   newline by the regex engine, so the pattern is unchanged.
# - The IGNORECASE flag was dropped because it cannot affect \d or \n; this also
#   removes the need for the duplicate mid-notebook `import re`.
# - expand=False returns a Series, which is what a single-column assignment wants.
# NOTE: not idempotent - on a second run the values no longer contain newlines,
# so every row would become NaN. Re-load ratings_df before re-running.
ratings_df["metascore"] = ratings_df["metascore"].str.extract(r"\n(\d+)\n", expand=False)

In [None]:
# Strip the surrounding whitespace/newlines from the scraped developer text.
# Raw string avoids the invalid "\s" / "\w" escape sequences of the original
# literal; the compiled pattern is identical. IGNORECASE was dropped because it
# has no effect on \s, \w or \n. expand=False yields a Series for the assignment.
# NOTE(review): (\w+) captures only the first word-character run, so a
# multi-word developer name is cut to its first word - confirm this is intended.
ratings_df["developer"] = ratings_df["developer"].str.extract(
    r"\s*\n\s*(\w+)\s*\n*\s*", expand=False
)

In [None]:
# Every distinct number_players label, most frequent first.
list(ratings_df["number_players"].value_counts().index)

In [None]:
# Raw number_players labels grouped by the category they collapse into.
# Trailing/double spaces are part of the scraped labels and must be kept.
_LOCAL_LABELS = ('1-4 ', 'Up to 4 ', "1-2 ", "2 ", "1-3 ")
_ONLINE_LABELS = (
    '2  Online', 'Up to 8 ', "1-8 ", "1-5", '4  Online', 'Up to 16 ', '8  Online',
    'Up to 12 ', 'Up to 6 ', 'Up to 10 ', '16  Online', "1-5 ", '6  Online', '1-6 ',
    '1-16 ', '10  Online', 'Up to 18 ', 'Up to 24 ', 'Up to 22 ', '24  Online',
    '12  Online', 'Up to 20 ', 'Up to 3 ', '32  Online', '14  Online', 'Up to 14 ',
    'Online Multiplayer', '1-12 ', 'Up to 64 ', '1-10 ', 'Up to 32 ', 'Up to 5 ',
    'Up to 40 ', 'Up to 30 ', '44  Online', 'Up to 60 ', '5  Online',
)
_MASSIVE_LABELS = ("Up to more than 64 ", "Massively Multiplayer")


def fix_num_players(x):
    """Collapse a raw number_players label into a coarse category.

    Returns "local multiplayer", "Online Multiplayer" or "Massively Multiplayer"
    when `x` is one of the known labels; any other value (including missing
    values) is returned unchanged.

    NOTE(review): 'Up to 3 ' is bucketed as online while '1-3 ' and 'Up to 4 '
    are local - confirm that is intended.
    """
    buckets = (
        (_LOCAL_LABELS, "local multiplayer"),
        (_ONLINE_LABELS, "Online Multiplayer"),
        (_MASSIVE_LABELS, "Massively Multiplayer"),
    )
    for labels, category in buckets:
        if x in labels:
            return category
    return x

# Collapse the raw labels, then check how many rows land in each category.
ratings_df["number_players"] = ratings_df["number_players"].map(fix_num_players)
ratings_df["number_players"].value_counts()

In [None]:
# Keep only the (Name, Platform) pairs present in both the sales and ratings frames.
combo_df = games_df.merge(ratings_df, how="inner", on=["Name", "Platform"])
combo_df

In [None]:
# Show the columns from position 6 onward, which are otherwise cut off in the display.
combo_df.iloc[:, slice(6, None)]

In [None]:
# Full list of column names in the merged frame.
list(combo_df.columns)

In [None]:
# Gather all games for each platform that there is no rating data for.
# no_rating_df = new_df.loc[new_df["release_date"].isnull()]
# no_rating_df.to_csv("../Resources/games_to_get_ratings.csv")

In [None]:
# Spot check: the merged rows for one well-known title.
combo_df.loc[combo_df["Name"].eq("Grand Theft Auto V")]

In [None]:
# Total sales per title across all platforms, best sellers first.
# numeric_only=True is required on pandas >= 2.0, where a bare .sum() no longer
# drops the string columns; it would concatenate or choke on Platform, Genre
# and Publisher instead. Rank and Year are dropped because their sums are
# meaningless.
all_consoles_games_df = (
    games_df.groupby("Name")
    .sum(numeric_only=True)
    .drop(["Rank", "Year"], axis=1)
    .sort_values("Global_Sales", ascending=False)
)

In [None]:
# Display the per-title sales totals.
all_consoles_games_df

In [None]:
# Total sales per platform, largest first.
# numeric_only=True is required on pandas >= 2.0, where a bare .sum() no longer
# drops the string columns (Name, Genre, Publisher). Rank and Year are dropped
# because their sums are meaningless.
sales_by_platform = (
    games_df.groupby("Platform")
    .sum(numeric_only=True)
    .drop(["Rank", "Year"], axis=1)
    .sort_values("Global_Sales", ascending=False)
)

In [None]:
# TODO (open question): should we include all the platforms, or only the more
# recent ones? We will be trying to predict sales on only the most recent
# consoles, and the market has changed quite a bit since the older ones
# (for example, the growth of the digital sales market).
sales_by_platform

In [None]:
# Platform codes split by form factor; used to build the is_handheld column.
# Changes from the original lists:
#   * "PSV" was listed twice in handheld_platforms.
#   * "SCD" was listed as handheld. In the vgsales data that code is the
#     Sega CD, a home-console add-on, so it now lives in at_home_platforms.
handheld_platforms = ["WS", "PSV", "3DS", "GB", "PSP", "GBA", "DS", "GG"]
at_home_platforms = ["PCFX", "3DO", "TG16", "DC", "SAT", "WiiU", "2600", "XOne", "GC", "SNES", "N64",
                     "NES", "XB", "PC", "PS4", "PS", "Wii", "PS3", "X360", "PS2", "GEN", "NG", "SCD"]

In [None]:
# Create deprecated console column. "No longer in production"
# Suspect, should not be in deprecated_consoles list: ["PSV", "3DS", "Wii",]
# "GG" is for "Sega Game Gear", "NG" is for "Neo Geo" arcades, "SCD" is for
# "Sega CD" and "GEN" is for "Sega Genesis".
# NG is placed in the deprecated consoles list because arcade games are not
# relevant to how we will use the data. Will consider deleting it entirely.
# "SCD" was missing from this list even though every other platform outside the
# current generation is included; it has been added.
deprecated_consoles = ["PCFX", "3DO", "TG16", "WS", "DC", "SAT", "PSV", "2600", "GC", "SNES", "N64", "3DS",
                       "NES", "GB", "XB", "PSP", "GBA", "PS", "DS", "PS3", "Wii", "X360", "PS2", "GEN", "NG", "GG",
                       "SCD"]

# These are the deprecated consoles without "PS3", "X360", and "Wii". Those
# consoles are recent enough that the patterns in their sales can still be used
# to make predictions for the most recent consoles like PS4, Xbox One, and Wii-U.
# Derived from deprecated_consoles (order preserved) so the two lists cannot
# drift apart.
recent_deprecated_consoles = ("PS3", "X360", "Wii")
retro_consoles = [code for code in deprecated_consoles if code not in recent_deprecated_consoles]



In [None]:
# Feature: "yes" when the row's platform is in handheld_platforms, otherwise "no".
combo_df["is_handheld"] = (
    combo_df["Platform"].isin(handheld_platforms).map({True: "yes", False: "no"})
)

In [None]:
# Feature: "yes" when the row's platform is in deprecated_consoles, otherwise "no".
combo_df["is_deprecated"] = (
    combo_df["Platform"].isin(deprecated_consoles).map({True: "yes", False: "no"})
)

In [None]:
# Feature: "yes" when the row's platform is in retro_consoles, otherwise "no".
combo_df["is_retro"] = (
    combo_df["Platform"].isin(retro_consoles).map({True: "yes", False: "no"})
)

In [None]:
# Preview the first rows with the new yes/no flag columns.
combo_df.head(5)

In [None]:
# How many merged rows fall on retro vs. non-retro platforms.
combo_df.loc[:, "is_retro"].value_counts()

In [None]:
# Feature engineering on the date column: parse the release_date text into datetimes.
parsed_release_dates = pd.to_datetime(combo_df["release_date"])
combo_df["release_date"] = parsed_release_dates

In [None]:
# Check column dtypes; release_date should now be a datetime column.
combo_df.dtypes

In [None]:
# Release year taken from the parsed release_date. The vectorised .dt accessor
# replaces the per-row lambda; missing dates still come out as NaN.
combo_df["year"] = combo_df["release_date"].dt.year

In [None]:
# Release month (1-12) taken from the parsed release_date. The vectorised .dt
# accessor replaces a per-row lambda whose parameter was named `date`, shadowing
# the `date` class imported at the top of the notebook.
combo_df["month"] = combo_df["release_date"].dt.month

In [None]:
# First rows of the columns from position 6 onward, including the new year/month features.
combo_df.head().iloc[:, 6:]

In [None]:
# TODO: is the publisher the same thing as the developer? Sometimes the two
# columns hold the same value and sometimes they differ. Should investigate.
combo_df.head(5)

In [None]:
# Time to deal with null rows: first visualise where the missing values are
# (one heatmap column per DataFrame column).
null_mask = combo_df.isnull()
sns.heatmap(null_mask, cmap="viridis", cbar=False, yticklabels=False)

In [None]:
# The likely reason for the 126 null years is that some of the release dates came in a form
# that couldn't be converted to a datetime. Will inspect the release date column for these
# null rows.
# NOTE(review): pd.to_datetime is called above without errors="coerce", so an unparseable
# string would raise rather than become null; the nulls more likely come from release
# dates that were already missing - confirm.
# The metascore nulls were likely a problem with the scraping. Many of the pages on
# metacritic.com simply didn't have info on the number of players. Several possible fixes
# are explored below.

combo_df.isna().sum()

In [None]:
# Names of the games whose number_players value is missing.
list_of_nonum_players = combo_df.loc[combo_df["number_players"].isna(), "Name"].tolist()

In [None]:
# Peek at the first 25 affected titles.
list_of_nonum_players[:25]

In [None]:
# Platform breakdown of the rows whose title appears in list_of_nonum_players.
# NOTE(review): the filter is by Name, so for a multi-platform title every
# platform's row is counted, even rows whose number_players is present. Confirm
# this is intended, or filter on the null mask directly.
name_has_missing_players = combo_df["Name"].isin(list_of_nonum_players)
combo_df.loc[name_has_missing_players, "Platform"].value_counts()

In [None]:
# About half of the Wii games don't have a value for number of players...
# They go into a separate placeholder category (the next cell fills the nulls
# with "ambiguous"), because this column is worth mining for the games that do
# have it, and dropping these rows would lose all the other good information
# they carry.
combo_df.loc[combo_df["Platform"] == "Wii"]

In [None]:
# Missing player counts get their own "ambiguous" category instead of being dropped.
combo_df["number_players"] = combo_df["number_players"].fillna("ambiguous")

In [None]:
# I haven't bothered to fix many of the dtypes yet, because that information is lost when
# the frame is written to a CSV. The dtypes will be set in the deep learning notebook.

In [None]:
# First 25 titles whose metascore is missing.
combo_df.loc[combo_df["metascore"].isna(), "Name"].head(25).tolist()

In [None]:
# For now I will drop the nulls so I can export the dataframe and import it into my
# deep_learning notebook for scaling.
# NOTE: the cells below do not actually drop any rows; they only remove the `Year` column.

In [None]:
# Drop the "Year" column from the sales data; a `year` column was derived from
# release_date earlier in the notebook.
# errors="ignore" keeps the cell re-runnable: the original in-place drop raised
# KeyError on a second execution because the column was already gone.
combo_df = combo_df.drop(columns="Year", errors="ignore")


In [None]:
# NOTE: only the last expression of a cell is rendered, so the two inspection
# expressions below show nothing unless they are run in a cell of their own.
combo_df["rating"].value_counts()

# Finding the rows with commas in them (thousands separators such as "1,234").
# Raw string: "\d" in a normal literal is an invalid escape sequence.
# na=False: missing values would otherwise put NaN in the boolean mask, which
# pandas refuses to index with.
combo_df[combo_df["positive_users"].str.contains(r"\d,\d{3}", na=False)].iloc[:, 5:]

# Remove the separators. regex=False makes the literal-comma replacement
# explicit regardless of the pandas version's default.
combo_df["positive_users"] = combo_df["positive_users"].str.replace(",", "", regex=False)
combo_df["negative_users"] = combo_df["negative_users"].str.replace(",", "", regex=False)

In [None]:
# Replace purely alphabetic user_score values with NaN (presumably placeholders
# like "tbd" - confirm against the scrape); everything else is kept as is.
# The isinstance guard fixes two crashes in the original `x.isalpha()` call:
# a missing score (float NaN) has no .isalpha(), and re-running the cell hit the
# NaNs that the first run had just written.
combo_df["user_score"] = combo_df["user_score"].map(
    lambda score: np.nan if isinstance(score, str) and score.isalpha() else score
)

In [None]:
# Write the cleaned frame to disk without the index column.
combo_df.to_csv(path_or_buf="../Resources/cleaned_df_for_MI.csv", index=False)

In [None]:
# Write the cleaned frame to the `games` table, replacing any existing table.
# - SQLAlchemy 1.4+ no longer accepts the "postgres://" scheme; the dialect
#   name must be "postgresql".
# - The password is percent-encoded so characters such as "@", "/" or ":" cannot
#   break the URL.
db_host = "final-project-db.celqxz4aecqm.us-east-1.rds.amazonaws.com"
encoded_password = quote(db_password, safe="")
db_string = f"postgresql://postgres:{encoded_password}@{db_host}/games_db"
engine = create_engine(db_string)
# NOTE(review): unlike the CSV export, this also writes the DataFrame index as
# a column (to_sql defaults to index=True) - confirm that is wanted.
combo_df.to_sql(name="games", con=engine, if_exists="replace")