In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# Save path to data source as variable
data_filename = "Resources/purchase_data.csv"

# Read Purchasing File and store into Pandas data frame
purchase_data = pd.read_csv(data_filename)

In [2]:
#Verify successful import by showing head of data set
purchase_data.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


In [3]:
#PURCHASE ANALYSIS
#Number of players (Who have made at least one purchase)
print(f'The number of players who have made at least one purchase is {len(purchase_data.SN.unique())}')

#Number of unique items
print(f'The number of unique items among those purchased is {len(purchase_data["Item ID"].unique())}')

#NAverage Purchase Price
print(f'The average purchase is ${round(np.mean(purchase_data["Price"]),2)}')

#Number of Purchases
print(f'The number of purchases is {len(purchase_data)}')

#Total Revenue
print(f'The total revenue from purchases is {round(sum(purchase_data["Price"]),2)}')

The number of players who have made at least one purchase is 576
The number of unique items among those purchased is 179
The average purchase is $3.05
The number of purchases is 780
The total revenue from purchases is 2379.77


In [4]:
#Checking for range of Gender values
purchase_data["Gender"].value_counts()

Male                     652
Female                   113
Other / Non-Disclosed     15
Name: Gender, dtype: int64

### Gender Demographics

* Percentage and Count of Male Players
* Percentage and Count of Female Players
* Percentage and Count of Other / Non-Disclosed

In [5]:
#GENDER DEMOGRAPHICS
maleplayers_raw = purchase_data[purchase_data['Gender']=="Male"]
femplayers_raw = purchase_data[purchase_data['Gender']=="Female"]
nonbyplayers_raw = purchase_data[purchase_data['Gender']=="Other / Non-Disclosed"]
players = purchase_data.groupby("SN")
player_count = len(players)
players.head()
#Male
maleplayers_count = len(maleplayers_raw["SN"].unique())
print(f'The number of players who are male is {maleplayers_count}, which is {round(100*(maleplayers_count/player_count),2)}% of the playerbase')
#Female
femplayers_count = len(femplayers_raw["SN"].unique())
print(f'The number of players who are female is {femplayers_count}, which is {round(100*(femplayers_count/player_count),2)}% of the playerbase')
nonbyplayers_count = len(nonbyplayers_raw["SN"].unique())
print(f'The number of players who are Nonbinary or have not disclosed their gender is {nonbyplayers_count}, which is {round(100*(nonbyplayers_count/player_count),2)}% of the playerbase')

The number of players who are male is 484, which is 84.03% of the playerbase
The number of players who are female is 81, which is 14.06% of the playerbase
The number of players who are Nonbinary or have not disclosed their gender is 11, which is 1.91% of the playerbase


### Purchasing Analysis (Gender)

* The below, each broken by gender
  * Purchase Count
  * Average Purchase Price
  * Total Purchase Value
  * Average Purchase Total per Person by Gender

In [6]:
genders = ["Male","Female","Other / Non-Disclosed"]
dct = {}
dct["Gender"] = []
dct["Purchase Count"] = []
dct["Average Purchase"] = []
dct["Total Purchases"] = []
dct["Average Total Purchases"] = []
for gender in genders:
    
    dct["Gender"].append(gender)
    dct["Purchase Count"].append(len(purchase_data[purchase_data["Gender"]==gender]))
    dct["Average Purchase"].append(round(np.mean(purchase_data[purchase_data["Gender"]==gender]["Price"]),2))
    dct["Total Purchases"].append(round(np.sum(purchase_data[purchase_data["Gender"]==gender]["Price"]),2))
    grouped_by_SN = purchase_data[purchase_data["Gender"]==gender].groupby("SN").sum()
    dct["Average Total Purchases"].append(round(np.mean(grouped_by_SN["Price"]),2))

pd.DataFrame(dct).set_index("Gender")

Unnamed: 0_level_0,Purchase Count,Average Purchase,Total Purchases,Average Total Purchases
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,652,3.02,1967.64,4.07
Female,113,3.2,361.94,4.47
Other / Non-Disclosed,15,3.35,50.19,4.56


Analysis:
    
    It would appear that this game is significantly more popular with men than it is with women and those who did not disclose their gender or are nonbinary. Paradoxically, the smaller the number of players from a group, the more they spend, slightly, on average, both per purchase and in total.

### Age Demographics

* The below, each broken into bins of 4 years (i.e. &lt;10, 10-14, 15-19, etc.)
  * Purchase Count
  * Average Purchase Price
  * Total Purchase Value
  * Average Purchase Total per Person by Age Group

In [7]:
purchase_data["Age Category"] = (purchase_data["Age"]//4)*4
age_cat = {}
for x in range(1,12):
    age_cat[(x*4)] = f"Ages {x*4} to {(x*4)+3} years"
purchase_data["Age Category"] = purchase_data["Age Category"].map(age_cat)

In [8]:
ages = age_cat.values()
dct = {}
dct["Age Category"] = []
dct["Purchase Count"] = []
dct["Average Purchase"] = []
dct["Total Purchases"] = []
dct["Average Total Purchases"] = []
for agecat in age_cat.values():
    
    dct["Age Category"].append(agecat)
    dct["Purchase Count"].append(len(purchase_data[purchase_data["Age Category"]==agecat]))
    dct["Average Purchase"].append(round(np.mean(purchase_data[purchase_data["Age Category"]==agecat]["Price"]),2))
    dct["Total Purchases"].append(round(np.sum(purchase_data[purchase_data["Age Category"]==agecat]["Price"]),2))
    grouped_by_SN = purchase_data[purchase_data["Age Category"]==agecat].groupby("SN").sum()
    dct["Average Total Purchases"].append(round(np.mean(grouped_by_SN["Price"]),2))

pd.DataFrame(dct).set_index("Age Category")

Unnamed: 0_level_0,Purchase Count,Average Purchase,Total Purchases,Average Total Purchases
Age Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ages 4 to 7 years,9,3.65,32.89,4.7
Ages 8 to 11 years,30,3.16,94.86,4.12
Ages 12 to 15 years,47,2.93,137.81,3.94
Ages 16 to 19 years,101,3.04,307.24,3.79
Ages 20 to 23 years,298,3.03,903.84,4.3
Ages 24 to 27 years,150,3.06,459.54,4.14
Ages 28 to 31 years,60,2.97,178.05,4.05
Ages 32 to 35 years,45,2.93,131.66,4.11
Ages 36 to 39 years,27,3.54,95.64,4.55
Ages 40 to 43 years,10,3.12,31.18,3.46


Analysis:
    
    Overall, there appears to be a general negative trend between age and purchasing. The older a player is, the less likely they are to make purchases and the smaller those purchases are, on average. The correlation is not as strong as with gender, though. Also, I have significant concerns about a company that is willing to allow 7-year-old players to purchase ingame digital assets.

### Top Spenders

* Identify the the top 5 spenders in the game by total purchase value, then list (in a table):
  * SN
  * Purchase Count
  * Average Purchase Price
  * Total Purchase Value

In [9]:
top_5 = list(purchase_data.groupby("SN").sum().sort_values("Price", ascending=False).head().reset_index()["SN"])
dct = {}
dct["SN"] = []
dct["Purchase Count"] = []
dct["Average Purchase"] = []
dct["Total Purchases"] = []
#Haha, I snuck political commentary into my code by referencing "Whales" terminology, which is used by predatory developers
for whale in top_5:
    dct["SN"].append(whale)
    dct["Purchase Count"].append(len(purchase_data[purchase_data["SN"]==whale]))
    dct["Average Purchase"].append(round(np.mean(purchase_data[purchase_data["SN"]==whale]["Price"]),2))
    dct["Total Purchases"].append(round(np.sum(purchase_data[purchase_data["SN"]==whale]["Price"]),2))
pd.DataFrame(dct).set_index("SN")

Unnamed: 0_level_0,Purchase Count,Average Purchase,Total Purchases
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lisosia93,5,3.79,18.96
Idastidru52,4,3.86,15.45
Chamjask73,3,4.61,13.83
Iral74,4,3.4,13.62
Iskadarya95,3,4.37,13.1


### Most Profitable Items

* Identify the 5 most profitable items by total purchase value, then list (in a table):
  * Item ID
  * Item Name
  * Purchase Count
  * Item Price
  * Total Purchase Value

In [10]:
itemsdict = {}
for index, row in purchase_data.iterrows():
    if row["Item ID"] not in itemsdict.keys():
        itemsdict[row["Item ID"]] = [row["Item Name"],row["Price"]]

top_items = list(purchase_data.groupby("Item ID").count().sort_values("Price", ascending=False).head().reset_index()["Item ID"])
dct = {}
dct["Item ID"] = []
dct["Item Name"] = []
dct["Purchase Count"] = []
dct["Item Price"] = []
dct["Total Purchases"] = []
for loot in top_items:
    dct["Item ID"].append(loot)
    dct["Item Name"].append(itemsdict[loot][0])
    dct["Purchase Count"].append(len(purchase_data[purchase_data["Item ID"]==loot]))
    dct["Item Price"].append(itemsdict[loot][1])
    dct["Total Purchases"].append(round(np.sum(purchase_data[purchase_data["Item ID"]==loot]["Price"]),2))
pd.DataFrame(dct).set_index("Item ID")

Unnamed: 0_level_0,Item Name,Purchase Count,Item Price,Total Purchases
Item ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
92,Final Critic,13,4.88,59.99
178,"Oathbreaker, Last Hope of the Breaking Storm",12,4.23,50.76
145,Fiery Glass Crusader,9,4.58,41.22
132,Persuasion,9,3.19,28.99
108,"Extraction, Quickblade Of Trembling Hands",9,3.53,31.77


Analysis:
    
    Among the top 5 items, there appears to be a positive correlation between price and popularity, resulting in a significantly higher profit for the more expensive items. If the data on the top 5 items were taken without any other considerations, then the optimal strategy would be to make the most expensive single items possible. Or maybe see if combo packages would further increase this effect - larger, single transactions.