In [62]:
# Importing dependencies
import pandas as pd
import numpy as np
import os

# Finding Files
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make files[0] refer to the same file on every run.  Hidden entries (names
# starting with '.') are skipped -- assumed not to be data files -- because
# they would otherwise sort ahead of the real data.
files = sorted(name for name in os.listdir('Resources') if not name.startswith('.'))

In [63]:
# The index into `files` selects the data set; edit it to analyse another file
f = files[0]
data_path = os.path.join('Resources', f)
df = pd.read_json(data_path)

# Player Count

In [64]:
# Number of distinct values in the SN column, shown as a one-cell table
player_count = len(df['SN'].unique())
player_count_df = pd.DataFrame({'Total Number of Players': [player_count]})
player_count_df

Unnamed: 0,Total Number of Players
0,573


# Purchase Analysis (Total)

In [65]:
# Distinct item names present in the data
items = df['Item Name'].unique()
num_unique_items = len(items)

# Mean, number of entries and sum of the Price column
prices = df['Price']
avg_price = prices.mean()
total_purchases = len(prices)
total_revenue = prices.sum()

# One-row summary table; the two dollar columns are rendered as currency
pa_df = pd.DataFrame({'Number of Unique Items': [num_unique_items],
                      'Average Purchase Price': [avg_price],
                      'Total Number of Purchases': [total_purchases],
                      'Total Revenue': [total_revenue]})
currency = '$ {:,.2f}'
pa_df = pa_df.style.format({'Average Purchase Price': currency, 'Total Revenue': currency})
pa_df

Unnamed: 0,Number of Unique Items,Average Purchase Price,Total Number of Purchases,Total Revenue
0,179,$ 2.93,780,"$ 2,286.33"


# Gender Demographics

In [66]:
# Count each player once: group the rows by SN and reduce every group to a
# single Gender value (max() yields one label per SN) before tallying.
unique_name_group = df.groupby(by='SN')
gender_counts = unique_name_group['Gender'].max().value_counts()

# A data file may have no players in one of the categories; .get(..., 0)
# reports that as a count of zero instead of raising KeyError.

# Percentage and count of male players
male_count = gender_counts.get('Male', 0)
male_percent = male_count/player_count*100

# Percentage and count of female players
female_count = gender_counts.get('Female', 0)
female_percent = female_count/player_count*100

# Percentage and count of other/non-disclosed
other_count = gender_counts.get('Other / Non-Disclosed', 0)
other_percent = other_count/player_count*100

# Compiling analysis into a single table
gender_df = pd.DataFrame([[male_count, male_percent],
                          [female_count, female_percent],
                          [other_count, other_percent]],
                         index=['Male','Female','Other/Non-Disclosed'],
                         columns=['Count','Percent'])
gender_df = gender_df.style.format({'Percent': '{:,.2f}%'})
gender_df

Unnamed: 0,Count,Percent
Male,465,81.15%
Female,100,17.45%
Other/Non-Disclosed,8,1.40%


# Purchase Analysis (Gender)

In [67]:
def _purchase_summary(purchases, n_players):
    """Return (row count, mean Price, summed Price, summed Price / n_players) for `purchases`."""
    price_col = purchases['Price']
    spent = price_col.sum()
    return len(price_col), price_col.mean(), spent, spent/n_players


# Rows of df split by the value of the Gender column
male_only_df = df.loc[df['Gender']=='Male']
female_only_df = df.loc[df['Gender']=='Female']
other_only_df = df.loc[df['Gender']=='Other / Non-Disclosed']

# Purchase count, average price, total value and total normalized by the
# per-gender player counts (male_count / female_count / other_count)
(male_purchase_count, male_avg_price,
 male_total_purchase, male_avg_price_norm) = _purchase_summary(male_only_df, male_count)
(female_purchase_count, female_avg_price,
 female_total_purchase, female_avg_price_norm) = _purchase_summary(female_only_df, female_count)
(other_purchase_count, other_avg_price,
 other_total_purchase, other_avg_price_norm) = _purchase_summary(other_only_df, other_count)

# Compiling analysis into a single table
gender_rows = [
    [male_purchase_count, male_avg_price, male_total_purchase, male_avg_price_norm],
    [female_purchase_count, female_avg_price, female_total_purchase, female_avg_price_norm],
    [other_purchase_count, other_avg_price, other_total_purchase, other_avg_price_norm],
]
pa_gender_df = pd.DataFrame(gender_rows,
                            index=['Male','Female','Other/Non-Disclosed'],
                            columns=['Purchase Count','Average Purchase Price','Total Revenue','Normalized Price'])
money = '$ {:,.2f}'
pa_gender_df = pa_gender_df.style.format({'Average Purchase Price': money,'Total Revenue': money,'Normalized Price': money})
pa_gender_df

Unnamed: 0,Purchase Count,Average Purchase Price,Total Revenue,Normalized Price
Male,633,$ 2.95,"$ 1,867.68",$ 4.02
Female,136,$ 2.82,$ 382.91,$ 3.83
Other/Non-Disclosed,11,$ 3.25,$ 35.74,$ 4.47


# Age Demographics

In [68]:
# Age brackets are hard-coded rather than derived from the maximum age in the
# data, so every data file produces the same rows and an outlier cannot add
# brackets.  pd.cut uses right-inclusive edges by default:
# (0, 9], (9, 14], ..., (34, 39], (39, 150].
bins = [0] + [9+5*i for i in range(7)] + [150]
labels = ['<10','10-14','15-19','20-24','25-29','30-34','35-39','+40']
bin_series = pd.cut(df['Age'], bins=bins, labels=labels)

# Tag every row with its bracket on a copy, so `df` itself is left unchanged
df_with_bins = df.copy()
df_with_bins['Bins'] = bin_series

# Rows grouped per player (SN); used for the per-player normalisation below
unique_name_group_with_bins = df_with_bins.groupby(by='SN')

# Total Purchases
# The Series is renamed before framing so the column is 'Purchase Count'
# whatever name value_counts() gives its result in the installed pandas.
bin_df = bin_series.value_counts().sort_index().rename('Purchase Count').to_frame()

# Average Purchase Price and Total Purchase Value, one entry per label in order
prices_by_bin = [df_with_bins.loc[df_with_bins['Bins']==label, 'Price'] for label in labels]
avg_price_by_bin = [prices.mean() for prices in prices_by_bin]
revenue_by_bin = [prices.sum() for prices in prices_by_bin]

# Per-bracket scalars kept under their original names for any cell that reads them
(under_ten_avg_price, ten2fourteen_avg_price, fifteen2nineteen_avg_price,
 twenty2twentyfour_avg_price, twentyfive2twentynine_avg_price,
 thirty2thirtyfour_avg_price, thirtyfive2thirtynine_avg_price,
 over_fourty_avg_price) = avg_price_by_bin
(under_ten_total_price, ten2fourteen_total_price, fifteen2nineteen_total_price,
 twenty2twentyfour_total_price, twentyfive2twentynine_total_price,
 thirty2thirtyfour_total_price, thirtyfive2thirtynine_total_price,
 over_fourty_total_price) = revenue_by_bin

# Normalized Prices: bracket revenue divided by the number of players in the
# bracket (each player contributes one bracket value via max() over their rows)
players_by_bin = unique_name_group_with_bins['Bins'].max().value_counts()
player_count_by_age = np.array([players_by_bin[age_range] for age_range in labels])
total_revenue = np.array(revenue_by_bin)
normalized_values = total_revenue/player_count_by_age

# Compiling analysis into a single table
bin_df['Average Purchase Price'] = avg_price_by_bin
bin_df['Total Revenue'] = total_revenue
bin_df['Normalized Price'] = normalized_values
bin_df = bin_df.style.format({'Average Purchase Price': '$ {:,.2f}','Total Revenue': '$ {:,.2f}','Normalized Price': '$ {:,.2f}'})
bin_df

Unnamed: 0,Purchase Count,Average Purchase Price,Total Revenue,Normalized Price
<10,28,$ 2.98,$ 83.46,$ 4.39
10-14,35,$ 2.77,$ 96.95,$ 4.22
15-19,133,$ 2.91,$ 386.42,$ 3.86
20-24,336,$ 2.91,$ 978.77,$ 3.78
25-29,125,$ 2.96,$ 370.33,$ 4.26
30-34,64,$ 3.08,$ 197.25,$ 4.20
35-39,42,$ 2.84,$ 119.40,$ 4.42
+40,17,$ 3.16,$ 53.75,$ 4.89


# Top Spenders

In [69]:
# Find and sort by total revenue
df_group_name = df.groupby(by='SN')
df_by_spending = (
    df_group_name['Price'].sum()
    .sort_values(ascending=False)
    .rename('Total Revenue')
    .reset_index()
)

# Purchase count and average purchase price for every player, computed once
# instead of re-aggregating the whole group-by for each of the top entries
purchase_stats = df_group_name['Price'].agg(['count', 'mean'])

# Compiling analysis into a single table
# head(5) returns however many players exist when there are fewer than five,
# and .copy() keeps the added columns out of df_by_spending.
top_spenders_df = df_by_spending.head(5).copy()
top_spenders_df['Purchase Count'] = top_spenders_df['SN'].map(purchase_stats['count'])
top_spenders_df['Average Purchase Price'] = top_spenders_df['SN'].map(purchase_stats['mean'])
top_spenders_df = top_spenders_df.style.format({'Average Purchase Price': '$ {:,.2f}','Total Revenue': '$ {:,.2f}'})
top_spenders_df

Unnamed: 0,SN,Total Revenue,Purchase Count,Average Purchase Price
0,Undirrala66,$ 17.06,5,$ 3.41
1,Saedue76,$ 13.56,4,$ 3.39
2,Mindimnya67,$ 12.74,4,$ 3.18
3,Haellysu29,$ 12.73,3,$ 4.24
4,Eoda93,$ 11.58,3,$ 3.86


# Most Popular Items

In [70]:
# Rank item names by number of purchases
df_group_itemname = df.groupby(by='Item Name')
df_by_pop_items = (
    df_group_itemname['Price'].count()
    .sort_values(ascending=False)
    .rename('Purchase Count')
    .reset_index()
)

# Find Item IDs and Price
# Both are taken from the first row in df that carries each item name;
# drop_duplicates keeps that first row, replacing a Python scan of every row.
first_purchase = df.drop_duplicates(subset='Item Name').set_index('Item Name')
most_pop_itemNames = df_by_pop_items['Item Name'][0:5]
most_pop_itemIDs = most_pop_itemNames.map(first_purchase['Item ID']).tolist()
most_pop_itemPrice = most_pop_itemNames.map(first_purchase['Price']).tolist()

# Find Total Purchase Value
# Summing the Price column per item name gives the revenue actually recorded;
# count * first-seen price is only correct when every purchase of a name has
# the same price.
revenue_by_item = df_group_itemname['Price'].sum()
most_pop_totalPurchaseValue = most_pop_itemNames.map(revenue_by_item).tolist()

# Compiling analysis into a single table
pop_items_df = df_by_pop_items.head(5).copy()
pop_items_df['Item ID'] = most_pop_itemIDs
pop_items_df['Item Price'] = most_pop_itemPrice
pop_items_df['Total Revenue'] = most_pop_totalPurchaseValue

pop_items_df = pop_items_df.style.format({'Item Price': '$ {:,.2f}','Total Revenue': '$ {:,.2f}'})
pop_items_df

Unnamed: 0,Item Name,Purchase Count,Item ID,Item Price,Total Revenue
0,Final Critic,14,92,$ 1.36,$ 19.04
1,"Betrayal, Whisper of Grieving Widows",11,39,$ 2.35,$ 25.85
2,Arcane Gem,11,84,$ 2.23,$ 24.53
3,Stormcaller,10,30,$ 4.15,$ 41.50
4,Woeful Adamantite Claymore,9,175,$ 1.24,$ 11.16


# Most Profitable Items

In [71]:
# Per-item purchase count, first-seen Item ID / Price, and total revenue for
# every item name, ranked by revenue.
df_group_itemname = df.groupby(by='Item Name')
df_by_pop_items = (
    df_group_itemname['Price'].count()
    .sort_values(ascending=False)
    .rename('Purchase Count')
    .reset_index()
)

# Item ID and Price come from the first row in df that carries each item name
first_purchase = df.drop_duplicates(subset='Item Name').set_index('Item Name')
df_by_pop_items['Item ID'] = df_by_pop_items['Item Name'].map(first_purchase['Item ID'])
df_by_pop_items['Price'] = df_by_pop_items['Item Name'].map(first_purchase['Price'])
item_IDs = df_by_pop_items['Item ID'].tolist()
item_value = df_by_pop_items['Price'].tolist()

# Revenue is the sum of the Price column per item name; count * first-seen
# price would mis-rank any name whose purchases do not all share one price.
df_by_pop_items['Total Revenue'] = df_by_pop_items['Item Name'].map(df_group_itemname['Price'].sum())
df_by_pop_items = (
    df_by_pop_items
    .sort_values(by='Total Revenue', ascending=False)
    .reset_index(drop=True)
)

# Displaying analysis in a single table, after saving to a new df as above
most_profitable_df = df_by_pop_items.head(5).copy()
most_profitable_df = most_profitable_df.style.format({'Price': '$ {:,.2f}','Total Revenue': '$ {:,.2f}'})
most_profitable_df

Unnamed: 0,Item Name,Purchase Count,Item ID,Price,Total Revenue
0,Stormcaller,10,30,$ 4.15,$ 41.50
1,Retribution Axe,9,34,$ 4.14,$ 37.26
2,Spectral Diamond Doomblade,7,115,$ 4.25,$ 29.75
3,Orenmir,6,32,$ 4.95,$ 29.70
4,Singed Scalpel,6,103,$ 4.87,$ 29.22
