# Heroes Of Pymoli Data Analysis<br>
### Observed Trends:<br>
1) Male players outnumber female by almost 5:1.<br>
2) About 40 percent of players are between the ages of 20 and 24. 75 percent fall between ages 15 and 29.<br> 
3) While 20-24 year olds made the most purchases, those over age 40 spend more per capita than any other group.<br>
4) The "Hatred" item was both a best seller and made the most revenue. 

In [1]:
#Dependencies
import pandas as pd

In [2]:
#read json file of dictionary items
#json_path = "raw_data/purchase_data.json"
#p_data_df = pd.read_json(json_path)

#read CSV file
path = "raw_data/purchase_data_3.csv"
p_data_df = pd.read_csv(path)

p_data_df.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lirtjaskan85,24,Female,69,"Frenzy, Defender of the Harvest",4.82
1,1,Chanjask65,12,Female,75,Brutality Ivory Warmace,4.12
2,2,Aerithllora36,21,Female,114,Yearning Mageblade,2.67
3,3,Aeralria27,24,Male,130,Alpha,4.53
4,4,Haisrisuir60,28,Male,9,"Thorn, Conqueror of the Corrupted",4.6


### Player Count

In [3]:
unique_player_count_s = pd.Series(['Total Players',len(p_data_df['SN'].unique())])

#keep track of number of unique players for later calculations
unique_players = len(p_data_df['SN'].unique())

#output number of unique players
unique_player_count_s


0    Total Players
1              581
dtype: object

### Purchasing Analysis (Total)

In [4]:
#total number of purchases
total_purchases = p_data_df['SN'].count()
stats = p_data_df.describe()

price_mean = stats.iloc[1,3]
price_mean
item_list = p_data_df['Item ID'].value_counts()
unique_item_count = item_list.count()
total_revenue = p_data_df['Price'].sum()


In [5]:
purch_analysis = pd.DataFrame({'Number of Unique Items': [unique_item_count],
                              'Average Purchase Price': [price_mean],
                             'Total Number of Purchases':[total_purchases],
                              'Total Revenue':[total_revenue]})
#put the columns in the right order
purch_analysis = purch_analysis[["Number of Unique Items",
                              "Average Purchase Price",
                                 "Total Number of Purchases",
                                 "Total Revenue"]]
#add the formatting

purch_analysis['Average Purchase Price'] = purch_analysis['Average Purchase Price'].map('${:,.2f}'.format)
purch_analysis['Total Revenue'] = purch_analysis['Total Revenue'].map('${:,.2f}'.format)
purch_analysis



Unnamed: 0,Number of Unique Items,Average Purchase Price,Total Number of Purchases,Total Revenue
0,181,$3.03,780,"$2,365.17"


### Gender Demographics

In [6]:
#need to first get a table of unique players, and then group by gender
unique_players_df = p_data_df[['Gender','SN','Age']]
#remove duplicate SN's
unique_players_df = unique_players_df.drop_duplicates(subset=['SN'])

In [7]:
gender_grouped = unique_players_df.groupby(['Gender'])
gender_demo = pd.DataFrame(gender_grouped['Gender'].count())
gender_demo = gender_demo.rename(columns={'Gender':'Total Count'})
gender_demo = gender_demo.reset_index()

#compute the percentage
gender_demo['Percentage of Players'] =   round((gender_demo["Total Count"]/unique_players) * 100 ,2)
gender_demo.set_index('Gender',inplace=True)
gender_demo



Unnamed: 0_level_0,Total Count,Percentage of Players
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,99,17.04
Male,475,81.76
Other / Non-Disclosed,7,1.2


### Purchasing Analysis (Gender)

In [8]:
#now go back to the original data to get the price info
gender_grouped = p_data_df.groupby(['Gender'])
gender_grouped['Price'].mean()


Gender
Female                   3.044615
Male                     3.030389
Other / Non-Disclosed    2.982500
Name: Price, dtype: float64

In [9]:
#create data frame for purchasing analysis, and put in the count of the purchases (by gender)
purch_gender_df = pd.DataFrame(gender_grouped['Price'].count())
purch_gender_df = purch_gender_df.rename(columns={"Price":"Purchase Count"})

#add the mean and the sum
purch_gender_df['Average Purchase Price'] = gender_grouped['Price'].mean()
purch_gender_df['Total Purchase Value'] = gender_grouped['Price'].sum()
purch_gender_df

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,130,3.044615,395.8
Male,642,3.030389,1945.51
Other / Non-Disclosed,8,2.9825,23.86


In [10]:
#add the Normalized Totals 

#Try this for normalizing the average:
#Total Purchase Value / number of unique people in the category

purch_gender_df['Normalized Averages']= purch_gender_df['Total Purchase Value'] / gender_demo['Total Count']

#add formatting
purch_gender_df['Average Purchase Price'] = purch_gender_df['Average Purchase Price'].map('${:,.2f}'.format)
purch_gender_df['Total Purchase Value'] = purch_gender_df['Total Purchase Value'].map('${:,.2f}'.format)
purch_gender_df['Normalized Averages']= purch_gender_df['Normalized Averages'].map('${:,.2f}'.format)

purch_gender_df

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Normalized Averages
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,130,$3.04,$395.80,$4.00
Male,642,$3.03,"$1,945.51",$4.10
Other / Non-Disclosed,8,$2.98,$23.86,$3.41


### Age Demographics

In [11]:
#Break into bins of 4 years (percentage of players, total count)
# and then in separate output: (purchase count, avg purch price, tot purch value, normalized)
bins = [0,10,14,19,24,29,34,39,100]
group_names = ['<10','10-14','15-19','20-24','25-29','30-34','35-39','40+']
index_names = [1,2,3,4,5,6,7,8]

#add the bin to the list of UNIQUE players
unique_players_df['age group'] = pd.cut(unique_players_df['Age'], bins, labels=index_names)
#group by the bins
age_groups = unique_players_df.groupby("age group")
#create output with total
age_demog = pd.DataFrame(age_groups['Age'].count())


In [12]:
#rename group to Total Count, change the age to the actual group names and add a column for percentage
age_demog = age_demog.rename(columns={"Age":"Total Count"})

age_demog.reset_index(inplace=True)
age_demog['age group'] = age_demog['age group'].replace(index_names, group_names)
age_demog['Percentage of Players'] = round((age_demog['Total Count']/unique_players) * 100,2)

#set the index to be the age group
age_demog.set_index('age group',inplace=True)
age_demog

Unnamed: 0_level_0,Total Count,Percentage of Players
age group,Unnamed: 1_level_1,Unnamed: 2_level_1
<10,28,4.82
10-14,19,3.27
15-19,103,17.73
20-24,230,39.59
25-29,105,18.07
30-34,47,8.09
35-39,41,7.06
40+,8,1.38


### Purchasing Analysis (Age)

In [13]:
#add the bins to the original data read
p_data_df['age group'] = pd.cut(p_data_df['Age'], bins, labels=index_names)
#group by the bins
age_groups = p_data_df.groupby("age group")


In [14]:
#create data frame 
purch_analysis = pd.DataFrame(age_groups['Item ID'].count())

#add column with avg purchase price
purch_analysis['Average Purchase Price'] = age_groups['Price'].mean()

#add total purchase value for each age group
purch_analysis['Total Purchase Value'] = age_groups['Price'].sum()

#rename column
purch_analysis = purch_analysis.rename(columns={"Item ID":"Purchase Count"})

#set the index to be the age group, and put in the group names (so they will be in the correct order)
purch_analysis.reset_index(inplace=True)
purch_analysis['age group'] = purch_analysis['age group'].replace(index_names, group_names)

purch_analysis.set_index('age group',inplace=True)
purch_analysis

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
age group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<10,39,2.922308,113.97
10-14,29,2.979655,86.41
15-19,132,3.080455,406.62
20-24,316,3.008196,950.59
25-29,137,3.045328,417.21
30-34,62,2.925806,181.4
35-39,54,3.175926,171.5
40+,11,3.406364,37.47


In [15]:
#add the normalized averages
#Try this for normalizing the average:
#Total Purchase Value / number of unique people in the category
#    this appears to be average spending per capita
purch_analysis['Normalized Averages']= purch_analysis['Total Purchase Value'] / age_demog['Total Count']

#add formatting
purch_analysis['Average Purchase Price'] = purch_analysis['Average Purchase Price'].map('${:,.2f}'.format)
purch_analysis['Total Purchase Value'] = purch_analysis['Total Purchase Value'].map('${:,.2f}'.format)
purch_analysis['Normalized Averages']= purch_analysis['Normalized Averages'].map('${:,.2f}'.format)

purch_analysis

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Normalized Averages
age group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<10,39,$2.92,$113.97,$4.07
10-14,29,$2.98,$86.41,$4.55
15-19,132,$3.08,$406.62,$3.95
20-24,316,$3.01,$950.59,$4.13
25-29,137,$3.05,$417.21,$3.97
30-34,62,$2.93,$181.40,$3.86
35-39,54,$3.18,$171.50,$4.18
40+,11,$3.41,$37.47,$4.68


### Top Spenders

In [16]:
#group by SN, and get avg price, sum of price and count of price
spenders = p_data_df.groupby(['SN'])
spenders.head()

total_spenders = pd.DataFrame(spenders['Price'].count())
total_spenders["Purchase Count"] = spenders['Price'].count()
total_spenders["Average Purchase Price"] = spenders['Price'].mean()
total_spenders["Total Purchase Value"] = spenders['Price'].sum()

#delete extra column
del total_spenders['Price']

#sort by total, descending
top_spenders = total_spenders.sort_values("Total Purchase Value", ascending=False)

#format
top_spenders['Average Purchase Price'] = top_spenders['Average Purchase Price'].map('${:,.2f}'.format)
top_spenders['Total Purchase Value'] = top_spenders['Total Purchase Value'].map('${:,.2f}'.format)

#display the top 5 spenders
top_spenders.head()

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chamalo71,4,$3.36,$13.45
Strithenu87,4,$3.21,$12.83
Mindosia50,3,$4.00,$12.01
Aeralria27,3,$3.79,$11.38
Eudai71,3,$3.79,$11.37


### Most Popular Items

In [17]:
#group by Item ID&Name, and get count, price and total value (sum)
items = p_data_df.groupby(['Item ID','Item Name','Price'])
items.head()
total_items = pd.DataFrame(items['Item Name'].count())

total_items['Total Purchase Value'] = items['Price'].sum()

total_items = total_items.rename(columns={'Item Name':'Purchase Count'})

total_items.reset_index(inplace=True)


#put the columns in the right order and change the index to Item ID
total_items = total_items[['Item ID','Item Name','Purchase Count','Price','Total Purchase Value']]
total_items.set_index('Item ID',inplace=True)
total_items = total_items.rename(columns={'Price':'Item Price'})

#sort to get most popular
sorted_by_purchase = total_items.sort_values("Purchase Count", ascending=False)

#sort to get most profitable (for next section), before formatting
sorted_by_revenue = total_items.sort_values("Total Purchase Value", ascending=False)

# add the formatting
sorted_by_purchase['Item Price'] = sorted_by_purchase['Item Price'].map('${:,.2f}'.format)
sorted_by_purchase['Total Purchase Value'] = sorted_by_purchase['Total Purchase Value'].map('${:,.2f}'.format)

#display the top 5
sorted_by_purchase.head()

Unnamed: 0_level_0,Item Name,Purchase Count,Item Price,Total Purchase Value
Item ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
52,Hatred,11,$4.59,$50.49
174,Primitive Blade,9,$1.39,$12.51
111,Misery's End,9,$4.90,$44.10
91,Celeste,8,$2.07,$16.56
143,Frenzied Scimitar,8,$3.35,$26.80


### Most Profitable Items

In [19]:
#sort to get "most profitable"
# This is really the items with the highest revenue. Don't really know if they are most profitable 
#     because we don't know the expenses associated with them. 

sorted_by_revenue = total_items.sort_values("Total Purchase Value", ascending=False)

# add the formatting
sorted_by_revenue['Item Price'] = sorted_by_revenue['Item Price'].map('${:,.2f}'.format)
sorted_by_revenue['Total Purchase Value'] = sorted_by_revenue['Total Purchase Value'].map('${:,.2f}'.format)

#display the top 5
sorted_by_revenue.head()

Unnamed: 0_level_0,Item Name,Purchase Count,Item Price,Total Purchase Value
Item ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
52,Hatred,11,$4.59,$50.49
111,Misery's End,9,$4.90,$44.10
120,Agatha,8,$4.93,$39.44
93,Apocalyptic Battlescythe,8,$4.85,$38.80
49,"The Oculus, Token of Lost Worlds",8,$4.61,$36.88
