<a href="https://colab.research.google.com/github/mrree1078/Hotel-System/blob/main/VideoGameSales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

%matplotlib inline

# Silence every warning category for the rest of the session so that library
# warnings do not clutter the rendered cell output.
# NOTE(review): this also hides deprecation/runtime warnings that may point at
# real problems - consider narrowing the filter once the notebook is stable.
import warnings
warnings.simplefilter("ignore")

In [None]:
# Read the cleaned sales CSV into the DataFrame used by every later cell.
csv_path = '/content/clean_vgsales.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first 20 rows of the table.
df.iloc[:20]

In [None]:
# Dimensions of the table as (row count, column count).
df.shape

In [None]:
# Descriptive summary statistics for the table's columns.
df.describe()

In [None]:
# Print the column listing with non-null counts and dtypes.
df.info()

In [None]:
# Distinct values of the Year column, in ascending order.
sorted(df['Year'].unique())

In [None]:
# Row count for each genre, most frequent first.
df['Genre'].value_counts()

# Q1: What are the top 5 genres by average global sales?

In [None]:
# Mean Global_Sales for each genre, listed from lowest to highest.
genre_mean_sales = df.groupby('Genre')['Global_Sales'].mean()
genre_mean_sales.sort_values()

In [None]:
# Visualization: bar chart of mean Global_Sales per genre, lowest to highest.
# The per-genre means are computed once, and the bar positions are taken from
# the length of that result, so the chart works for any number of genres
# (a hard-coded count raises a shape mismatch as soon as the data changes).
genre_means = df.groupby('Genre')['Global_Sales'].mean().sort_values()

plt.figure(figsize=(15,5))
plt.bar(range(len(genre_means)), genre_means.values, tick_label=list(genre_means.index))
plt.title('Genres by average sales number')
plt.xlabel('Genres')
plt.ylabel('Average sales number (millions)');
# Keep the 0-1 scale while every bar fits; widen it instead of clipping a taller bar.
plt.ylim(0, max(1, genre_means.max() * 1.05));

# Q2: Which 5 publishers have the highest average global sales?

In [None]:
# Mean Global_Sales for each publisher; show the five largest, in ascending order.
publisher_mean_sales = df.groupby('Publisher')['Global_Sales'].mean()
publisher_mean_sales.sort_values().iloc[-5:]

In [None]:
# Visualization: the five publishers with the highest mean Global_Sales.
# Bar heights and labels are read from df rather than pasted in from an earlier
# cell's output, so the chart cannot drift out of sync with the data.
top_publishers = df.groupby('Publisher')['Global_Sales'].mean().sort_values().tail(5)

plt.figure(figsize=(12,5))
plt.bar(range(1, len(top_publishers) + 1), top_publishers.values,
        tick_label=list(top_publishers.index))
plt.title('The top 5 publisher by average high sales ')
plt.xlabel('Top 5 publishers')
plt.ylabel('Sales number (millions)');
# Publisher names can be long; tilt them slightly so they do not overlap.
plt.xticks(rotation=15);

# Q3: Which 5 platforms have the highest average global sales?

In [None]:
# Mean Global_Sales of the five highest-ranked platforms, as a bare array (ascending).
platform_mean_sales = df.groupby('Platform')['Global_Sales'].mean()
platform_mean_sales.sort_values().iloc[-5:].to_numpy()

In [None]:
# Visualization: the five platforms with the highest mean Global_Sales.
# Bar heights and labels are read from df rather than pasted in from an earlier
# cell's output, so the chart cannot drift out of sync with the data.
top_platforms = df.groupby('Platform')['Global_Sales'].mean().sort_values().tail(5)

plt.figure(figsize=(9,5))
plt.bar(range(1, len(top_platforms) + 1), top_platforms.values,
        tick_label=list(top_platforms.index))
plt.title('The top 5 Platforms by average high sales ')
plt.xlabel('Top 5 Platforms')
plt.ylabel('Sales number (millions)');

# Q4: Which 5-year range between 1980 and 2020 has the highest global sales?

In [None]:
# Visualization: mean Global_Sales per year, with standard-error bars.

plt.figure(figsize=(20,5))
# set bin edges, compute centers
# Bins are left-closed: [edge, edge + bin_size). The edges must extend one full
# bin past the latest year. If the last edge equals the latest year, pd.cut
# with right=False assigns those rows to no bin and they silently drop out of
# the averages.
bin_size = 1
xbin_edges = np.arange(1980, df['Year'].max() + 2*bin_size, bin_size)
xbin_centers = (xbin_edges + bin_size/2)[:-1]

# compute statistics in each bin
data_xbins = pd.cut(df['Year'], xbin_edges, right = False, include_lowest = True)
# observed=False keeps empty bins, so the results stay aligned one-to-one with
# xbin_centers regardless of the pandas default.
sales_by_year = df['Global_Sales'].groupby(data_xbins, observed=False)
y_means = sales_by_year.mean()
y_sems = sales_by_year.sem()  # standard error of the mean (not the std)

# plot the summarized data
plt.errorbar(x = xbin_centers, y = y_means, yerr = y_sems)
plt.xlabel('Year')
plt.ylabel('Average Global Sales(millions)');
plt.xticks(range(1980,2021,2), range(1980,2021,2));
plt.title('Game Average Global Sales(millions) by years');

# Q5: What is the relationship between sales in the 4 main regions from 1980 to 2020?

In [None]:
# Visualization: mean yearly sales for each of the four regional columns,
# drawn as one line per region on a shared axis.

plt.figure(figsize=(20,5))
# set bin edges, compute centers
# Bins are left-closed: [edge, edge + bin_size). The edges must extend one full
# bin past the latest year. If the last edge equals the latest year, pd.cut
# with right=False assigns those rows to no bin and they silently drop out of
# the averages.
bin_size = 1
xbin_edges = np.arange(1980, df['Year'].max() + 2*bin_size, bin_size)
xbin_centers = (xbin_edges + bin_size/2)[:-1]

# compute statistics in each bin
data_xbins = pd.cut(df['Year'], xbin_edges, right = False, include_lowest = True)
# One line per region. The label is attached to each line so the legend cannot
# get out of step with the plotting order. observed=False keeps empty bins so
# every series has one value per bin center.
for region_col in ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']:
    y_means = df[region_col].groupby(data_xbins, observed=False).mean()
    plt.plot(xbin_centers, y_means, label=region_col)

# plot the summarized data
plt.xlabel('Year')
plt.ylabel('Average Sales(millions)');
plt.xticks(range(1980,2021,2), range(1980,2021,2));
# This chart shows regional averages, not the global figure.
plt.title('Game Average Regional Sales(millions) by years');
plt.legend(title="Regions");

# Q6: What is the relationship between the top 5 platforms and years?

In [None]:
# Visualization: distribution of Year for each of the five platforms with the
# highest mean Global_Sales, one horizontal box per platform.
# The platform list is derived from df instead of being typed in by hand, so it
# always matches the current ranking.
top5_platforms = df.groupby('Platform')['Global_Sales'].mean().sort_values().tail(5).index
Top5platform_df = df[df['Platform'].isin(top5_platforms)]

plt.figure(figsize=(20,4))
# A single colour for every box: the boxes are told apart by their row, not by hue.
base_color = sb.color_palette()[0]
sb.boxplot(data = Top5platform_df, x = 'Year', y = 'Platform', color = base_color)
plt.xlabel('Year')
plt.ylabel('Platforms');
plt.xticks(range(1980,2021,2), range(1980,2021,2));
plt.title('Platforms by years');