#### welcome to
# Heroes of Pymoji:
## The Sales Analysis

In [1]:
# dependencies
import pandas as pd
import numpy as np

In [2]:
dataFilepath = "Resources/purchase_data.csv"
dataFilepath

'Resources/purchase_data.csv'

In [3]:
# import the data in the file at dataFilepath and make a data frame of it
purchaseData_df = pd.read_csv(dataFilepath)
# purchaseData_df.head()

In [4]:
# This should be a groupby object of unique SNs:
SNs_df = purchaseData_df.groupby(["SN"])
# SNs_df.head()

In [5]:
# This makes a dataframe from SNs_df that shows each SN's
# total purchases, but which unfortunately leaves out gender
# because it can't be summed:
SNsTotalPurchases_df = pd.DataFrame(SNs_df.sum())
# SNsTotalPurchases_df

In [6]:
# This makes a dataframe from SNs_df that shows how many 
# purchases each SN made:
SNsCountPurchases_df = pd.DataFrame(SNs_df.count())
# SNsCountPurchases_df

In [7]:
# This starts with the same groupby object of unique SNs, but turns
# it into a DataFrame that retains each SN's gender:
SNsTotalByFirst_df = pd.DataFrame(SNs_df.first())
# SNsTotalByFirst_df
# I'll need both SNsTotalPurchases_df and SNsCountPurchases_df later.

In [8]:
# 1. Count unique players
PNcount = len(SNs_df)
# print(f"Total number of players: {PNcount}")

d = {"Total Players": [PNcount]}
totalPlayerCountReport_df = pd.DataFrame(data=d)
totalPlayerCountReport_df

Unnamed: 0,Total Players
0,576


In [9]:
# 2. Purchasing analysis
#    a. no. of unique items
itemCount = len(purchaseData_df["Item ID"].value_counts())

# print(f"Number of unique items: {itemCount}")

In [10]:
#    b. avg purchase price
purchaseAvg = purchaseData_df["Price"].sum() / purchaseData_df["Price"].count()
showPurchaseAvg = "$" + str(round(float(purchaseAvg), 2))

# print(f"Average purchase price: {showPurchaseAvg}")

In [11]:
#    c. total no. of purchases
purchaseCount = purchaseData_df["SN"].count()

# print(f"Total number of purchases: {purchaseCount}")

In [12]:
#    d. total revenue
totalRevenue = purchaseData_df["Price"].sum()
showTotalRevenue = "$" + str(totalRevenue)

# print(f"Total revenue: {showTotalRevenue}")

In [13]:
d = {
    "Number of Unique Items" : [itemCount],
    "Average Price" : [showPurchaseAvg],
    "Number of Purchases" : [purchaseCount],
    "Total revenue" : [showTotalRevenue]
    }
purchasingAnalysisReport_df = pd.DataFrame(data=d)
purchasingAnalysisReport_df

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total revenue
0,179,$3.05,780,$2379.77


In [14]:
# 3. Gender demographics

# This is a groupby object created to get total sales by gender
# (in the Price column).
SalesByGender = purchaseData_df.groupby(["Gender"])
SalesByGender_df = pd.DataFrame(SalesByGender.sum())
# SalesByGender_df
# Ignore the sums of Purchase ID, Age, and Item ID, which are meaningless.

In [15]:
# This is a groupby object of unique PNs by gender, 
# using the dataframe of unique SNs that retains gender...:
genderReport = SNsTotalByFirst_df.groupby(["Gender"])
# ...and making it a DataFrame:
genderReport_df = pd.DataFrame(genderReport.count())
# genderReport_df

In [16]:
#    a. pct and count of male players

In [17]:
# count
maleCount = int(genderReport_df.iloc[1][1])

# print(f"Number of players who identify as male: {maleCount}")

In [18]:
# percent
malePct = maleCount / PNcount
malePctShow = str(round(malePct*100,2))+"%"

# print(f"Percent of players who identify as male: {malePctShow}")

In [19]:
#    b. pct and count of female players

In [20]:
# count
femaleCount = int(genderReport_df.iloc[0][0])

# print(f"Number of players who identify as female: {femaleCount}")

In [21]:
# percent
femalePct = femaleCount / PNcount
femalePctShow = str(round(femalePct*100,2))+"%"

# print(f"Percent of players who identify as female: {femalePctShow}")

In [22]:
#    c. pct and count of other/non-disclosed gender players

In [23]:
# count
nonBinCount = int(genderReport_df.iloc[2][2])
# nonBinCount

# print(f"Number of players who do not identify as male or female\nor who did not disclose their gender: {nonBinCount}")

In [24]:
# percent
nonBinPct = nonBinCount / PNcount
nonBinPctShow = str(round(nonBinPct*100,2))+"%"

# print(f"Pct of players who do not identify as male or female\nor who did not disclose their gender: {nonBinPctShow}")

In [25]:
d = {
    "" : ["Male", "Female", "Other / Non-Disclosed"],
    "Total Count" : [maleCount, femaleCount, nonBinCount],
    "Percentage of Players" : [malePctShow, femalePctShow, nonBinPctShow],
    }
purchasingAnalysisReport_df = pd.DataFrame(data=d)
purchasingAnalysisReport_df

Unnamed: 0,Unnamed: 1,Total Count,Percentage of Players
0,Male,484,84.03%
1,Female,81,14.06%
2,Other / Non-Disclosed,11,1.91%


In [26]:
# 4. Purchasing analysis: gender
SalesCountByGender = purchaseData_df.groupby(["Gender"])
SalesCountByGender_df = pd.DataFrame(SalesCountByGender.count())
# SalesCountByGender_df

In [27]:
#    a. male players

In [28]:
#         i. purchase count
salesCountMale = SalesCountByGender_df.loc['Male','SN']
# print(f"Number of purchases made by players who identify as male: {salesCountMale}")

In [29]:
#        ii. total purchase value
salesMale = SalesByGender_df.loc['Male','Price']
salesMaleShow = "$"+str(round(float(salesMale),2))

# print(f"Total of purchases made by players who identify as male: {salesMaleShow}")

In [30]:
#       iii. avg purchase price
salesAvgMale = salesMale / salesCountMale
salesAvgMaleShow = "$"+str(round(float(salesAvgMale),2))

# print(f"Average purchase made by players who identify as male: {salesAvgMaleShow}")

In [31]:
#        iv. avg purchase total per person by gender
salesAvgPerMale = salesMale / maleCount
salesAvgPerMaleShow = "$"+str(round(float(salesAvgPerMale),2))

# print(f"Average purchase total per player who identifies as male: {salesAvgPerMaleShow}")

In [32]:
#    b. female players

In [33]:
#         i. purchase count
salesCountFemale = SalesCountByGender_df.loc['Female','SN']

# print(f"Number of purchases made by players who identify as female: {salesCountFemale}")

In [34]:
#        ii. total purchase value
salesFemale = SalesByGender_df.loc['Female','Price']
salesFemaleShow = "$"+str(round(float(salesFemale),2))

# print(f"Total of purchases made by players who identify as female: {salesFemaleShow}")

In [35]:
#       iii. avg purchase price
salesAvgFemale = salesFemale / salesCountFemale
salesAvgFemaleShow = "$"+str(round(float(salesAvgFemale),2))

# print(f"Average purchase made by players who identify as female: {salesAvgFemaleShow}")

In [36]:
#        iv. avg purchase total per person by gender
salesAvgPerFemale = salesFemale / femaleCount
salesAvgPerFemaleShow = "$"+str(round(float(salesAvgPerFemale),2))

# print(f"Average purchase total per player who identifies as female: {salesAvgPerFemaleShow}")

In [37]:
#    c. players of other/non-disclosed gender

In [38]:
#         i. purchase count
salesCountNonBin = SalesCountByGender_df.loc['Other / Non-Disclosed','SN']

# print(f"Number of purchases made by players who do not identify as male or female\nor who did not disclose their gender: {salesCountNonBin}")

In [39]:
#        ii. total purchase value
salesNonBin = SalesByGender_df.loc['Other / Non-Disclosed','Price']
salesNonBinShow = "$"+str(round(float(salesNonBin),2))


# print(f"Total of purchases made by players who do not identify as male or female\nor who did not disclose their gender: {salesNonBinShow}")

In [40]:
#       iii. avg purchase price
salesAvgNonBin = salesNonBin / salesCountNonBin
salesAvgNonBinShow = "$"+str(round(float(salesAvgNonBin),2))


# print(f"Average purchase made by players who do not identify as male or female\nor who did not disclose their gender: {salesAvgNonBinShow}")

In [41]:
#        iv. avg purchase total per person by gender
salesAvgPerNonBin = salesNonBin / nonBinCount
salesAvgPerNonBinShow = "$"+str(round(float(salesAvgPerNonBin),2))
# salesAvgPerNonBinShow


# print(f"Average purchase total per player who does not identify as male or female\nor who did not disclose their gender: {salesAvgPerNonBinShow}")

In [42]:
d = {
    "" : ["Male", "Female", "Other / Non-Disclosed"],
    "Purchase Count" : [salesCountMale, salesCountFemale, salesCountNonBin],
    "Average Purchase Price" : [salesAvgMaleShow, salesAvgFemaleShow, salesAvgNonBinShow],
    "Total Purchase Value" : [salesMaleShow, salesFemaleShow, salesNonBinShow],
    "Avg Total Purchase per Person" : [salesAvgPerMaleShow, salesAvgPerFemaleShow, salesAvgPerNonBinShow],
    }
purchasingAnalysisReport_df = pd.DataFrame(data=d)
purchasingAnalysisReport_df

Unnamed: 0,Unnamed: 1,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
0,Male,652,$3.02,$1967.64,$4.07
1,Female,113,$3.2,$361.94,$4.47
2,Other / Non-Disclosed,15,$3.35,$50.19,$4.56


In [43]:
# 5. Age demographics in bins of 4 years: <10, 10-14, 15-19, und so weiter:
#    a. purchase count
#    b. avg purchase price
#    c. total purchase value
#    d. avg purchase total per person by age group
# (This must use purchaseData_df.)

# This defines bins by age:
bins = [0, 10, 14, 19, 24, 29, 34, 39, 44]
groupNames = ["<10", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40+"]

# This will add a new column to purchaseData_df classifying each player into an age range:
purchaseData_df["AgeRange"] = pd.cut(purchaseData_df["Age"], bins, labels=groupNames, include_lowest=True)
purchaseData_df

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price,AgeRange
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53,20-24
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56,40+
2,2,Ithergue48,24,Male,92,Final Critic,4.88,20-24
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27,20-24
4,4,Iskosia90,23,Male,131,Fury,1.44,20-24
...,...,...,...,...,...,...,...,...
775,775,Aethedru70,21,Female,60,Wolf,3.54,20-24
776,776,Iral74,21,Male,164,Exiled Doomblade,1.63,20-24
777,777,Yathecal72,20,Male,67,"Celeste, Incarnation of the Corrupted",3.46,20-24
778,778,Sisur91,7,Male,92,Final Critic,4.19,<10


In [44]:
# This gets the total count in each age range:
ageRangeTotal = purchaseData_df["AgeRange"].value_counts()
ageRangeTotal.head(8)

20-24    365
15-19    136
25-29    101
30-34     73
35-39     41
<10       32
10-14     19
40+       12
Name: AgeRange, dtype: int64

In [45]:
# calcuate what percent of the total number of players each age range represents
ageRangeAvg = round(ageRangeTotal / PNcount, 2)
ageDemographicSummary = pd.DataFrame({"Total Count": ageRangeTotal, "Percentage of Players": ageRangeAvg})

ageDemographicSummary

Unnamed: 0,Total Count,Percentage of Players
20-24,365,0.63
15-19,136,0.24
25-29,101,0.18
30-34,73,0.13
35-39,41,0.07
<10,32,0.06
10-14,19,0.03
40+,12,0.02


In [46]:
#    a. purchase count
agePurchaseCounts = purchaseData_df.groupby(["AgeRange"]).count()["Price"].rename("Purchase Count")
#    b. avg_purchase_price
ageAvgPurchasePrice= round(purchaseData_df.groupby(["AgeRange"]).mean()["Price"].rename("Avg Purchase Price"),2)
#    c. total purchase value
totalPurchaseValueAge = purchaseData_df.groupby(["AgeRange"]).sum()["Price"].rename("Total Purchase Value")
#Total Purchase Value
purchaseTotalByAge = purchaseData_df.groupby(["AgeRange"]).sum()["Price"].rename("Total Purchase Value")
#    d. avg purchase total per person by age group
avgPurchasePerson = round(purchaseTotalByAge / ageRangeTotal, 2)
agePurchasingAnalysis = pd.DataFrame({"Purchase Count": agePurchaseCounts, "Average Purchase Price": ageAvgPurchasePrice, "Total Purchase Value": totalPurchaseValueAge, "Avg Purchase Total Per Person" :avgPurchasePerson})

agePurchasingAnalysis

Unnamed: 0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Purchase Total Per Person
<10,32,3.4,108.96,3.4
10-14,19,2.68,50.95,2.68
15-19,136,3.04,412.89,3.04
20-24,365,3.05,1114.06,3.05
25-29,101,2.9,293.0,2.9
30-34,73,2.93,214.0,2.93
35-39,41,3.6,147.67,3.6
40+,12,3.04,36.54,3.04


In [47]:
# 6. Top spenders
#    a. identify the top five spenders by total purchase value
# using the dataframe SNsTotalPurchases_df
# and turning it into a new DataFrame:
SNsTop_df = SNsTotalPurchases_df.sort_values(by="Price", ascending=False)
SNsTop_df = SNsTop_df.head()
SNsTop_df = SNsTop_df.sort_index()
# SNsTop_df
# Ignore Purchase ID, Age, and Item ID in the resulting table and only use Price.

In [48]:
# This searches SNsCountPurchases_df for the top five spenders in SNsTop,
# returns how many purchases each of them made, and adds them to SNsTop.
# Now how do I do that?

In [49]:
#    b. and list in a table:
#         i. SN
#        ii. purchase count
#       iii. avg purchase price
#        iv. total purchase value

# I'll keep working with SNsTop_df. I can add a column...
SNsTop_df["Purchase Count"] = 1
# But how to populate it?
# SNsTop_df

In [50]:
# I'd like to retrieve the name of each of the top five players from SNsTop_df,
# but it seems I need to take the index and make it a data column first:
SNsTop_df.reset_index(inplace=True)
# SNsTop_df

In [51]:
# Now I can fetch each the top spenders' SN...:
TopSpndr5 = SNsTop_df.iloc[0,0]
# TopSpndr5

In [52]:
TopSpndr4 = SNsTop_df.iloc[1,0]
# TopSpndr4

In [53]:
TopSpndr3 = SNsTop_df.iloc[2,0]
# TopSpndr3

In [54]:
TopSpndr2 = SNsTop_df.iloc[3,0]
# TopSpndr2

In [55]:
TopSpndr1 = SNsTop_df.iloc[4,0]
# TopSpndr1

In [56]:
TopSpndr5purchCt = int(SNsCountPurchases_df.loc[TopSpndr5,"Purchase ID"])
# TopSpndr5purchCt

In [57]:
# ...but how do I get the Purchase Count column to store each top spender's purchase count?

#test = SNsTop_df.iloc[1, 6]
#test

SNsTop_df.iloc[0,5] = TopSpndr5purchCt
# SNsTop_df

In [58]:
TopSpndr4purchCt = int(SNsCountPurchases_df.loc[TopSpndr4,"Purchase ID"])
SNsTop_df.iloc[1,5] = TopSpndr4purchCt

In [59]:
TopSpndr3purchCt = int(SNsCountPurchases_df.loc[TopSpndr3,"Purchase ID"])
SNsTop_df.iloc[2,5] = TopSpndr3purchCt

In [60]:
TopSpndr2purchCt = int(SNsCountPurchases_df.loc[TopSpndr2,"Purchase ID"])
SNsTop_df.iloc[3,5] = TopSpndr5purchCt

In [61]:
TopSpndr1purchCt = int(SNsCountPurchases_df.loc[TopSpndr1,"Purchase ID"])
SNsTop_df.iloc[4,5] = TopSpndr5purchCt
# SNsTop_df

In [62]:
# I know avg purchase price is just total / count but how do I do that?
SNsTop_df["Avg Purchase Price"] = 1 #value of Price column / value of Purchase Count column
# SNsTop_df

In [63]:
# Read this SN's purchase total from SNsTop_df:
TopSpndr5PurchTotal = SNsTop_df.iloc[0,4]

#Calculate this SN's average purchase:
TopSpndr5AvgPurch = TopSpndr5PurchTotal / TopSpndr5purchCt

#Write this SN's average purchase to SNsTop_df:
SNsTop_df.iloc[0,6] = round(TopSpndr5AvgPurch,2)

In [64]:
TopSpndr4PurchTotal = SNsTop_df.iloc[1,4]
TopSpndr4AvgPurch = TopSpndr4PurchTotal / TopSpndr4purchCt
SNsTop_df.iloc[1,6] = round(TopSpndr4AvgPurch,2)

In [None]:
TopSpndr3PurchTotal = SNsTop_df.iloc[2,4]
TopSpndr3AvgPurch = TopSpndr3PurchTotal / TopSpndr3purchCt
SNsTop_df.iloc[2,6] = round(TopSpndr3AvgPurch,2)

In [None]:
TopSpndr2PurchTotal = SNsTop_df.iloc[3,4]
TopSpndr2AvgPurch = TopSpndr2PurchTotal / TopSpndr2purchCt
SNsTop_df.iloc[3,6] = round(TopSpndr2AvgPurch,2)

In [None]:
TopSpndr1PurchTotal = SNsTop_df.iloc[4,4]
TopSpndr1AvgPurch = TopSpndr1PurchTotal / TopSpndr1purchCt
SNsTop_df.iloc[4,6] = round(TopSpndr1AvgPurch,2)

#rename Price as Total Purchase Value
SNsTop_df = SNsTop_df.rename(columns={"Price" : "Total Purchase Value"})

In [None]:
#rename Purchase ID as [blank]
SNsTop_df = SNsTop_df.rename(columns={"Purchase ID" : ""})
SNsTop_df.iloc[0,1] = ""
SNsTop_df.iloc[1,1] = ""
SNsTop_df.iloc[2,1] = ""
SNsTop_df.iloc[3,1] = ""
SNsTop_df.iloc[4,1] = ""

# SNsTop_df

In [None]:
#rename Age as [blank]
SNsTop_df = SNsTop_df.rename(columns={"Age" : ""})

SNsTop_df.iloc[0,2] = ""
SNsTop_df.iloc[1,2] = ""
SNsTop_df.iloc[2,2] = ""
SNsTop_df.iloc[3,2] = ""
SNsTop_df.iloc[4,2] = ""

SNsTop_df

In [None]:
# 7. Most popular items
#    a. identify the five most popular items purchase count
#    b. and list in a table:
#         i. item ID
#        ii. item name
#       iii. purchase count
#        iv. total purchase value

In [None]:
itemsCount = purchaseData_df.groupby(["Item ID"])
itemsCount_df = pd.DataFrame(itemsCount.count())
# itemsCount_df

In [None]:
itemsTopCount_df = itemsCount_df.sort_values(by="Price", ascending=False)
itemsTopCount_df = itemsTopCount_df.head()
# itemsTopCount_df

In [None]:
itemsSales_df = purchaseData_df.loc[:,["Item ID", "Item Name", "Price"]]
itemsSales_total = itemsSales_df.groupby(["Item ID", "Item Name"]).sum()["Price"]
purchaseCount = itemsSales_df.groupby(["Item ID", "Item Name"]).count()["Price"]
itemsSalesData = pd.DataFrame({"Price":itemsSales_total, "Purchase Count":purchaseCount})
itemsSalesData = itemsSalesData.sort_values(by="Purchase Count", ascending=False)
itemsSalesData.head(5)

In [None]:
# 8 most profitable items
#    a. identify the five most profitable items by total purchase value
#    b. and list in a table:
#         i. item ID
#        ii. item name
#       iii. purchase count
#        iv. item price
#         v. total purchase value

itemsTopSales_df = itemsSalesData.sort_values(by="Price", ascending=False)
itemsTopSales_df = itemsTopSales_df.head()
itemsTopSales_df

## Three observable trends:
### 1. Most players--84.0 percent--identify themselves in Heroes of Pymoji as male and account for most of the sales--82.6 percent.
### 2. However, players who do not identify as male in-game spend considerably more per purchase than players who identify themselves as male in-game.
### 3. And players who do not identify as male in-game spend considerably more on average overall than players who identify themselves as male in-game.

#### Thank you for examining this homework assignment (nunber 4: Pandas challenge)
#### in the U of Minnesota Data Analytics and Visualization Boot Camp, Winter 2021.
#### Please email Paul Bernhardt papadiscobravo@gmail.com with questions.