**Pandas Homework - Option 1: Heroes of Pymoli** 

Congratulations! After a lot of hard work in the data munging mines, you've landed a job as Lead Analyst for an independent gaming company. You've been assigned the task of analyzing the data for their most recent fantasy game Heroes of Pymoli. 

Like many others in its genre, the game is free-to-play, but players are encouraged to purchase optional items that enhance their playing experience. As a first task, the company would like you to generate a report that breaks down the game's purchasing data into meaningful insights.

In [1]:
import pandas as pd
import numpy as np

In [2]:
#import json file to dataframe
json_file_path = 'purchase_data.json'

df = pd.read_json(json_file_path)

In [3]:
df.head()

Unnamed: 0,Age,Gender,Item ID,Item Name,Price,SN
0,38,Male,165,Bone Crushing Silver Skewer,3.37,Aelalis34
1,21,Male,119,"Stormbringer, Dark Blade of Ending Misery",2.32,Eolo46
2,34,Male,174,Primitive Blade,2.46,Assastnya25
3,21,Male,92,Final Critic,1.36,Pheusrical25
4,23,Male,63,Stormfury Mace,1.27,Aela59


In [4]:
df.shape

(780, 6)

In [5]:
sorted(df.Price) #check out the purchases made; make sure free items of 0.00 aren't included

[1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.03,
 1.06,
 1.06,
 1.06,
 1.06,
 1.06,
 1.11,
 1.11,
 1.11,
 1.11,
 1.1400000000000001,
 1.1400000000000001,
 1.1400000000000001,
 1.1400000000000001,
 1.1400000000000001,
 1.16,
 1.16,
 1.16,
 1.16,
 1.2,
 1.2,
 1.2,
 1.21,
 1.21,
 1.21,
 1.24,
 1.24,
 1.24,
 1.24,
 1.24,
 1.24,
 1.24,
 1.24,
 1.24,
 1.27,
 1.27,
 1.27,
 1.27,
 1.28,
 1.28,
 1.28,
 1.28,
 1.32,
 1.32,
 1.32,
 1.35,
 1.35,
 1.35,
 1.35,
 1.35,
 1.3599999999999999,
 1.3599999999999999,
 1.3599999999999999,
 1.3599999999999999,
 1.3599999999999999,
 1.3599999999999999,
 1.3599999999999999,
 1.3599999999999999,
 1.3599999999999999,
 1.3599999999999999,
 1.3599999999999999,
 1.3599999999999999,
 1.3900000000000001,
 1.3900000000000001,
 1.45,
 1.45,
 1.45,
 1.45,
 1.45,
 1.48,
 1.48,
 1.48,
 1.48,
 1.49,
 1.49,
 1.49,
 1.49,
 1.49,
 1.49,
 1.49,
 1.49,
 1.49,
 1.5,
 1.5,
 1.55,
 1.55,
 1.55,
 1.55,
 1.55,
 1.55,
 1.55,
 1.5

### Player Count

In [6]:
player_count = len(df.SN.unique())  #the json data is all purchases, so make sure have unique SNs
pd.DataFrame({'Total Unique Players': [player_count]})

Unnamed: 0,Total Unique Players
0,573


### Purchasing Analysis (Total)

In [7]:
num_unique_items_purchased = len(df['Item ID'].unique())
average_purchase_price = df.Price.mean()
total_num_purchases = df.Price.count()
total_revenue = df.Price.sum()
purchasing_totals_df = pd.DataFrame([[num_unique_items_purchased,
                                    average_purchase_price,
                                    total_num_purchases,
                                    total_revenue]],
                                    columns = ['Number of Unique Items',
                                                'Average Purchase Price',
                                                'Total Number of Purchases',
                                                'Total Revenue'])

purchasing_totals_df['Average Purchase Price'] = purchasing_totals_df[
                                                'Average Purchase Price'].apply('${:.2f}'.format)
purchasing_totals_df['Total Revenue'] = purchasing_totals_df[
                                                'Total Revenue'].apply('${:.2f}'.format)

purchasing_totals_df

Unnamed: 0,Number of Unique Items,Average Purchase Price,Total Number of Purchases,Total Revenue
0,183,$2.93,780,$2286.33


### Gender Demographics

In [8]:
df.head(2)

Unnamed: 0,Age,Gender,Item ID,Item Name,Price,SN
0,38,Male,165,Bone Crushing Silver Skewer,3.37,Aelalis34
1,21,Male,119,"Stormbringer, Dark Blade of Ending Misery",2.32,Eolo46


In [9]:
df.Gender.unique()

array(['Male', 'Female', 'Other / Non-Disclosed'], dtype=object)

In [10]:
df_unique_users = df.drop_duplicates(subset=['SN'])[['SN','Age','Gender']]
print(df_unique_users.head(5))
print(df_unique_users.shape)
print(df_unique_users.Gender.value_counts())

             SN  Age Gender
0     Aelalis34   38   Male
1        Eolo46   21   Male
2   Assastnya25   34   Male
3  Pheusrical25   21   Male
4        Aela59   23   Male
(573, 3)
Male                     465
Female                   100
Other / Non-Disclosed      8
Name: Gender, dtype: int64


In [11]:
gender_counts = df_unique_users.Gender.value_counts()
df_gender_counts = pd.DataFrame(gender_counts)
df_gender_counts.rename(columns = {'Gender':'Total Count'}, inplace=True)
df_gender_counts['% of Players'] = (df_gender_counts['Total Count'] / df_gender_counts['Total Count'].sum())*100
df_gender_counts['% of Players'] = df_gender_counts['% of Players'].apply('{:.2f}%'.format)
df_gender_counts = df_gender_counts[['% of Players','Total Count']]
df_gender_counts

Unnamed: 0,% of Players,Total Count
Male,81.15%,465
Female,17.45%,100
Other / Non-Disclosed,1.40%,8


In [12]:
#store the gender user counts to a dictionary
gender_counts  = gender_counts.to_dict()
gender_counts

{'Female': 100, 'Male': 465, 'Other / Non-Disclosed': 8}

In [13]:
#could also use grouby to do this
#test = pd.DataFrame(df_unique_users.groupby(['Gender'])['SN'].count())
#test

### Purchasing Analysis (Gender)

In [14]:
df.head(2)

Unnamed: 0,Age,Gender,Item ID,Item Name,Price,SN
0,38,Male,165,Bone Crushing Silver Skewer,3.37,Aelalis34
1,21,Male,119,"Stormbringer, Dark Blade of Ending Misery",2.32,Eolo46


In [15]:
#gender_analysis2 = pd.DataFrame(df.groupby('Gender')['Age'].count())
#gender_analysis2.rename(columns = {'Age':'Purchase Count'}, inplace=True) 
#gender_analysis2

In [16]:
gender_analysis = df.groupby('Gender').agg({'Age': np.count_nonzero, 
                                           'Price': np.mean})
gender_analysis.rename(columns = {'Age':'Purchase Count', 'Price': 'Average Purchase Price'}, inplace=True) 

gender_analysis['Total Purchase Value'] = df.groupby('Gender')['Price'].sum()

gender_analysis

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,136,2.815515,382.91
Male,633,2.950521,1867.68
Other / Non-Disclosed,11,3.249091,35.74


In [17]:
#print(gender_analysis['Total Purchase Value'][1])
#print(gender_counts[gender_analysis.index[1]])



In [18]:
#gender_analysis['Total Purchase Value'][1] / gender_counts[gender_analysis.index[1]]

In [19]:
#len(gender_analysis['Total Purchase Value'])

In [20]:
#normalized totals is looking for average purchase total per user (instead of per purchase)
#create a list of values that can then be passed into dataframe column
num_rows_in_gender_analysis = len(gender_analysis['Total Purchase Value'])
gender_norm_tot = [gender_analysis['Total Purchase Value'][row] / #lookup total purchase value
                 gender_counts[gender_analysis.index[row]] #lookup the user count based on gender index
                 for row in range(0, num_rows_in_gender_analysis)]

gender_analysis['Normalized Totals'] = gender_norm_tot

gender_analysis[['Average Purchase Price', 'Total Purchase Value', 'Normalized Totals']]=gender_analysis[
                ['Average Purchase Price', 'Total Purchase Value', 'Normalized Totals']].applymap(
                                                                                '${:.2f}'.format)
gender_analysis

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Normalized Totals
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,136,$2.82,$382.91,$3.83
Male,633,$2.95,$1867.68,$4.02
Other / Non-Disclosed,11,$3.25,$35.74,$4.47


### Age Demographics

In [21]:
df_unique_users.head()

Unnamed: 0,SN,Age,Gender
0,Aelalis34,38,Male
1,Eolo46,21,Male
2,Assastnya25,34,Male
3,Pheusrical25,21,Male
4,Aela59,23,Male


In [46]:
print(df_unique_users.Age.min())
print(df_unique_users.Age.max())

7
45


In [94]:
min_age = df_unique_users.Age.min()
max_age = df_unique_users.Age.max()
bins = list(range(9,max_age,5))
#print(bins)
#labels = [str(bins[0]+1)+"-"+str(bins[1])]
labels = [str(bins[idx]+1)+"-"+str(bins[idx+1]) for idx in range(0,len(bins)-1)]
#print(labels)
bins.insert(0,0)
labels.insert(0,'<10')
bins.append(max_age)
labels.append(str(bins[-2]+1)+"+")
print(bins)
print(labels)

[0, 9, 14, 19, 24, 29, 34, 39, 44, 45]
['<10', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45+']


In [140]:
df_unique_users['Age Group'] = pd.cut(df_unique_users['Age'], bins=bins, labels=labels)
df_unique_users.head()

Unnamed: 0,SN,Age,Gender,Age Group
0,Aelalis34,38,Male,35-39
1,Eolo46,21,Male,20-24
2,Assastnya25,34,Male,30-34
3,Pheusrical25,21,Male,20-24
4,Aela59,23,Male,20-24


In [141]:
df['Age Group'] = pd.cut(df['Age'], bins=bins, labels=labels)
df.head()

Unnamed: 0,Age,Gender,Item ID,Item Name,Price,SN,Age Group
0,38,Male,165,Bone Crushing Silver Skewer,3.37,Aelalis34,35-39
1,21,Male,119,"Stormbringer, Dark Blade of Ending Misery",2.32,Eolo46,20-24
2,34,Male,174,Primitive Blade,2.46,Assastnya25,30-34
3,21,Male,92,Final Critic,1.36,Pheusrical25,20-24
4,23,Male,63,Stormfury Mace,1.27,Aela59,20-24


In [142]:
age_group_counts = df_unique_users['Age Group'].value_counts(sort=False)
#print(age_group_counts)
df_age_group_counts = pd.DataFrame(age_group_counts)
df_age_group_counts.rename(columns = {'Age Group':'Total Count'}, inplace=True)
df_age_group_counts['% of Players'] = (df_age_group_counts['Total Count'] / df_age_group_counts['Total Count'].sum())*100
df_age_group_counts['% of Players'] = df_age_group_counts['% of Players'].apply('{:.2f}%'.format)
df_age_group_counts = df_age_group_counts[['% of Players','Total Count']]
df_age_group_counts

Unnamed: 0,% of Players,Total Count
<10,3.32%,19
10-14,4.01%,23
15-19,17.45%,100
20-24,45.20%,259
25-29,15.18%,87
30-34,8.20%,47
35-39,4.71%,27
40-44,1.75%,10
45+,0.17%,1


In [143]:
#store the age user counts to a dictionary
age_group_counts  = age_group_counts.to_dict()
age_group_counts

{'10-14': 23,
 '15-19': 100,
 '20-24': 259,
 '25-29': 87,
 '30-34': 47,
 '35-39': 27,
 '40-44': 10,
 '45+': 1,
 '<10': 19}

### Purchasing Analysis (Age)

In [144]:
age_analysis = df.groupby('Age Group').agg({'Age': np.count_nonzero, 
                                           'Price': np.mean})
age_analysis.rename(columns = {'Age':'Purchase Count', 'Price': 'Average Purchase Price'}, inplace=True) 

age_analysis['Total Purchase Value'] = df.groupby('Age Group')['Price'].sum()

#normalized totals is looking for average purchase total per user (instead of per purchase)
#create a list of values that can then be passed into dataframe column
num_rows_in_age_analysis = len(age_analysis['Total Purchase Value'])
age_norm_tot = [age_analysis['Total Purchase Value'][row] / #lookup total purchase value
                 age_group_counts[age_analysis.index[row]] #lookup the user count based on gender index
                 for row in range(0, num_rows_in_age_analysis)]

age_analysis['Normalized Totals'] = age_norm_tot

age_analysis[['Average Purchase Price', 'Total Purchase Value', 'Normalized Totals']]=age_analysis[
                ['Average Purchase Price', 'Total Purchase Value', 'Normalized Totals']].applymap(
                                                                                '${:.2f}'.format)
age_analysis

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Normalized Totals
Age Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<10,28,$2.98,$83.46,$4.39
10-14,35,$2.77,$96.95,$4.22
15-19,133,$2.91,$386.42,$3.86
20-24,336,$2.91,$978.77,$3.78
25-29,125,$2.96,$370.33,$4.26
30-34,64,$3.08,$197.25,$4.20
35-39,42,$2.84,$119.40,$4.42
40-44,16,$3.19,$51.03,$5.10
45+,1,$2.72,$2.72,$2.72


### Top Spenders

In [166]:
#from collections import Counter

In [167]:
df.head(2)

Unnamed: 0,Age,Gender,Item ID,Item Name,Price,SN,Age Group
0,38,Male,165,Bone Crushing Silver Skewer,3.37,Aelalis34,35-39
1,21,Male,119,"Stormbringer, Dark Blade of Ending Misery",2.32,Eolo46,20-24


In [168]:
df_unique_users.head(2)

Unnamed: 0,SN,Age,Gender,Age Group
0,Aelalis34,38,Male,35-39
1,Eolo46,21,Male,20-24


In [182]:
user_spending = df.groupby('SN').agg({'Item Name': np.count_nonzero, 
                                           'Price': np.mean})
user_spending.rename(columns = {'Item Name':'Purchase Count', 'Price': 'Average Purchase Price'}, inplace=True) 

user_spending['Total Purchase Value'] = df.groupby('SN')['Price'].sum()

#get top 5 values by total purchase value before formatting changes to strings
user_spending_top5 = user_spending.nlargest(5, 'Total Purchase Value')
user_spending_top5
user_spending_top5[['Average Purchase Price', 'Total Purchase Value']]=user_spending_top5[
               ['Average Purchase Price', 'Total Purchase Value']].applymap('${:.2f}'.format)

user_spending_top5

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Undirrala66,5,$3.41,$17.06
Saedue76,4,$3.39,$13.56
Mindimnya67,4,$3.18,$12.74
Haellysu29,3,$4.24,$12.73
Eoda93,3,$3.86,$11.58


### Most Popular Items

In [195]:
sorted(df['Item ID'].unique())

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183]

In [204]:
popular_items = df.groupby(['Item ID', 'Item Name']).agg({'SN': np.count_nonzero, 
                                                       'Price': np.mean})
popular_items.rename(columns = {'SN':'Purchase Count', 'Price': 'Item Price'}, inplace=True) 

popular_items['Total Purchase Value'] = df.groupby(['Item ID','Item Name'])['Price'].sum()
#popular_items

#get top 5 purchased items by purchase count before formatting changes to strings
popular_items_top5 = popular_items.nlargest(5, 'Purchase Count')
#popular_items_top5
popular_items_top5[['Item Price', 'Total Purchase Value']]=popular_items_top5[
                    ['Item Price', 'Total Purchase Value']].applymap('${:.2f}'.format)

popular_items_top5

Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
39,"Betrayal, Whisper of Grieving Widows",11,$2.35,$25.85
84,Arcane Gem,11,$2.23,$24.53
13,Serenity,9,$1.49,$13.41
31,Trickster,9,$2.07,$18.63
34,Retribution Axe,9,$4.14,$37.26


In [216]:
#apparently a couple items in the sales database has an item name with two item ids which throws off some data aggregations
#like 'Stormcaller'
popular_item2 = df.groupby(['Item Name', 'Item ID']).count()
popular_item2.tail(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Gender,Price,SN,Age Group
Item Name,Item ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Spada, Etcher of Hatred",157,4,4,4,4,4
Spectral Bone Axe,36,3,3,3,3,3
Spectral Diamond Doomblade,115,7,7,7,7,7
Splinter,0,1,1,1,1,1
"Splitter, Foe Of Subtlety",107,8,8,8,8,8
"Storm-Weaver, Slayer of Inception",68,5,5,5,5,5
"Stormbringer, Dark Blade of Ending Misery",119,4,4,4,4,4
Stormcaller,30,5,5,5,5,5
Stormcaller,180,5,5,5,5,5
Stormfury Lantern,86,4,4,4,4,4


### Most Profitable Items

In [205]:
profitable_items = df.groupby(['Item ID', 'Item Name']).agg({'SN': np.count_nonzero, 
                                                       'Price': np.mean})
profitable_items.rename(columns = {'SN':'Purchase Count', 'Price': 'Item Price'}, inplace=True) 

profitable_items['Total Purchase Value'] = df.groupby(['Item ID','Item Name'])['Price'].sum()
#profitable_items

#get top 5 profitable items by total purchase value before formatting changes to strings
profitable_items_top5 = profitable_items.nlargest(5, 'Total Purchase Value')
#profitable_items_top5
profitable_items_top5[['Item Price', 'Total Purchase Value']]=profitable_items_top5[
                    ['Item Price', 'Total Purchase Value']].applymap('${:.2f}'.format)

profitable_items_top5

Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34,Retribution Axe,9,$4.14,$37.26
115,Spectral Diamond Doomblade,7,$4.25,$29.75
32,Orenmir,6,$4.95,$29.70
103,Singed Scalpel,6,$4.87,$29.22
107,"Splitter, Foe Of Subtlety",8,$3.61,$28.88
