In [47]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# File to Load (Remember to Change These)
file_to_load = "purchase_data.csv"

# Read Purchasing File and store into Pandas data frame
purchase_data = pd.read_csv(file_to_load)
purchase_data.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


In [48]:
#Get the total number of players
purchase_data = purchase_data.rename(columns = {"SN": "Player"})
purchase_data["Player"].nunique()

576

In [49]:
#get basic calculation values for reference
purchase_data.describe()

Unnamed: 0,Purchase ID,Age,Item ID,Price
count,780.0,780.0,780.0,780.0
mean,389.5,22.714103,92.114103,3.050987
std,225.310896,6.659444,52.775943,1.169549
min,0.0,7.0,0.0,1.0
25%,194.75,20.0,48.0,1.98
50%,389.5,22.0,93.0,3.15
75%,584.25,25.0,139.0,4.08
max,779.0,45.0,183.0,4.99


In [50]:
#Get unique values for all colums for reference
purchase_data.nunique()

Purchase ID    780
Player         576
Age             39
Gender           3
Item ID        183
Item Name      179
Price          145
dtype: int64

In [51]:
#Obtain total revenue
total_revenue = purchase_data["Price"].sum()
total_revenue

2379.77

In [52]:
#Obtain average price
average_price = purchase_data["Price"].mean()
average_price

3.050987179487176

In [53]:
#Obtain number of purchases
purchases_num = purchase_data["Purchase ID"].count()
purchases_num

780

In [54]:
#Obtain number of unique items - based on the unique count obtained before I noticed that the count of item ID and 
#item name does not match, therefore there are multiple IDs assigned to a single item, hence we use item name to get unique number
unique_items = purchase_data["Item Name"].nunique()
unique_items

179

In [102]:
#Create sumary dataframe
sumary_df = pd.DataFrame({"Number of Unique Items": [unique_items], "Average Price": [average_price],
                         "Number of Purchases": [purchases_num], "Total Revenue": [total_revenue]})
#Change the format of te price and revenue fields to currency
sumary_df["Average Price"] = sumary_df["Average Price"].astype(float).map("${:,.2f}".format)
sumary_df["Total Revenue"] = sumary_df["Total Revenue"].astype(float).map("${:,.2f}".format)

sumary_df

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,179,$3.05,780,"$2,379.77"


In [57]:
#Obtain Gender demographics
male_players = purchase_data[purchase_data["Gender"] == "Male"].count()["Gender"]
male_players_per = (male_players / purchase_data["Gender"].count()) * 100

female_players = purchase_data[purchase_data["Gender"] == "Female"].count()["Gender"]
female_players_per = (female_players / purchase_data["Gender"].count()) * 100

other_player = purchase_data["Gender"].count() - female_players - male_players
other_player_per = (other_player / purchase_data["Gender"].count()) * 100

#show results in a new dataframe
data = np.array([["", "Total Count", "Percentage of Players"], ["Male", male_players, male_players_per],
                ["Female", female_players, female_players_per], ["Other / Non-Disclosed", other_player, other_player_per]])
geneder_sum_df = pd.DataFrame(data=data[1:,1:],
                  index=data[1:,0],
                  columns=data[0,1:])
#Formating Percentage of Players
geneder_sum_df["Percentage of Players"] = geneder_sum_df["Percentage of Players"].astype(float).map("{:,.2f}%".format)
geneder_sum_df

Unnamed: 0,Total Count,Percentage of Players
Male,652,83.59%
Female,113,14.49%
Other / Non-Disclosed,15,1.92%


In [58]:
#get sum of purchase price by gender
total_fem_pp = purchase_data[purchase_data["Gender"] == "Female"].sum()["Price"]
total_male_pp = purchase_data[purchase_data["Gender"] == "Male"].sum()["Price"]
total_op_pp = purchase_data["Price"].sum() - total_fem_pp - total_male_pp

# calculate average
fem_pur_avg = purchase_data[purchase_data["Gender"] == "Female"].mean()["Price"]
male_pur_avg =  purchase_data[purchase_data["Gender"] == "Male"].mean()["Price"]
op_pur_avg = purchase_data[purchase_data["Gender"] == "Other / Non-Disclosed"].mean()["Price"]

# calculate average per person 
fem_tot_avg = total_fem_pp / female_players
male_tot_avg = total_male_pp / male_players
op_tot_avg = total_op_pp / other_player

#Create a data frame with the required data
data_by_gender = np.array([["", "Purchase Count", "Average Purchase Price", 
                            "Total Purchase Value", "Avg Total Purchase per Person"], 
                           ["Male", male_players, male_pur_avg, total_male_pp, male_tot_avg],
                           ["Female", female_players, fem_pur_avg, total_fem_pp, fem_tot_avg],
                           ["Other / Non-Disclosed", other_player, op_pur_avg, total_op_pp, op_tot_avg]])
#data_by_gender[0,1:]
geneder_price_df = pd.DataFrame(data=data_by_gender[1:,1:],
                 index=data_by_gender[1:,0],
                 columns=data_by_gender[0,1:])

#Formating to currency
geneder_price_df["Average Purchase Price"] = geneder_price_df["Average Purchase Price"].astype(float).map("${:,.2f}".format)
geneder_price_df["Total Purchase Value"] = geneder_price_df["Total Purchase Value"].astype(float).map("${:,.2f}".format)
geneder_price_df["Avg Total Purchase per Person"] = geneder_price_df["Avg Total Purchase per Person"].astype(float).map("${:,.2f}".format)
geneder_price_df

Unnamed: 0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
Male,652,$3.02,"$1,967.64",$3.02
Female,113,$3.20,$361.94,$3.20
Other / Non-Disclosed,15,$3.35,$50.19,$3.35


In [74]:
# Create the bins in which Data will be held
bins = [0, 9, 14, 19, 24, 29, 34, 39, 40]

# Create the names for the four bins
group_names = ["<10", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40+"]

#grouping by age range
purchase_data["Age_range"] = pd.cut(purchase_data["Age"], bins, labels=group_names)
age_group = purchase_data.groupby("Age_range")

#create new dataframe with purchase sum and purchase average price
age_df = age_group["Price"].agg([np.sum, np.mean])
age_df["Purchase count"] = age_group["Player"].count()

#renaming colum sum and mean
age_df = age_df.rename(columns = {"sum": "Total Purchase Value", "mean": "Average Purchase Price"})

#formating to currency
#age_df["Total Purchase Value"] = geneder_price_df["Total Purchase Value"].astype(float).map("${:,.2f}".format)
#age_df["Average Purchase Price"] = geneder_price_df["Average Purchase Price"].astype(float).map("${:,.2f}".format)
age_df

Unnamed: 0_level_0,Total Purchase Value,Average Purchase Price,Purchase count
Age_range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<10,77.13,3.353478,23
10-14,82.78,2.956429,28
15-19,412.89,3.035956,136
20-24,1114.06,3.052219,365
25-29,293.0,2.90099,101
30-34,214.0,2.931507,73
35-39,147.67,3.601707,41
40+,16.71,2.785,6


In [76]:
#grouping by player to obtain purchace details for each player
player_group = purchase_data.groupby("Player")
#getting total purchase and mean for player
palyer_df = player_group["Price"].agg([np.sum, np.mean])

#adding purchase count by player
palyer_df["Purchase count"] = player_group["Player"].count()
palyer_df = palyer_df.rename(columns = {"sum": "Total Purchase Value", "mean": "Average Purchase Price"})

#Sorting by total purchase to get top 5 spender
palyer_df.sort_values(by="Total Purchase Value", ascending=False).head()


Unnamed: 0_level_0,Total Purchase Value,Average Purchase Price,Purchase count
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lisosia93,18.96,3.792,5
Idastidru52,15.45,3.8625,4
Chamjask73,13.83,4.61,3
Iral74,13.62,3.405,4
Iskadarya95,13.1,4.366667,3


In [100]:
#grouping by item to get most profitable item
item_group = purchase_data.groupby("Item Name")
#getting the total purchase value
item_df = item_group["Price"].agg([np.sum])
item_df = item_df.rename(columns = {"sum": "Total Purchase Value"})

item_df["Purchase count"] = item_group["Item Name"].count()
item_df["Item ID"] = item_group["Item ID"].max()

item_df.sort_values(by="Total Purchase Value", ascending = False).head()

Unnamed: 0_level_0,Total Purchase Value,Purchase count,Item ID
Item Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Final Critic,59.99,13,101
"Oathbreaker, Last Hope of the Breaking Storm",50.76,12,178
Nirvana,44.1,9,82
Fiery Glass Crusader,41.22,9,145
Singed Scalpel,34.8,8,103


In [None]:
#Data conclusions
# Heroes of Pymoli
# 1 - Players age range from 20-24 years old is the group who did more purchases and spent more, 365 purchases worth $1,114.06 USD, which represents 46% of the total sales
# 2 - the top 5 spenders spent between $13 - $18 USD with average purchase price between $3.4 - $4.6
# 3 - 83.59% of the players are male, while 14.49% are female.

