#### welcome to
# Heroes of Pymoji:
## The Sales Analysis

In [1]:
# dependencies
import pandas as pd
import numpy as np

In [2]:
dataFilepath = "Resources/purchase_data.csv"
dataFilepath

'Resources/purchase_data.csv'

In [3]:
# import the data in the file at dataFilepath and make a data frame of it
purchaseData_df = pd.read_csv(dataFilepath)
# purchaseData_df.head()

In [4]:
# This should be a groupby object of unique SNs:
SNs_df = purchaseData_df.groupby(["SN"])
# SNs_df.head()

In [5]:
# This makes a dataframe from SNs_df that shows each SN's
# total purchases, but which unfortunately leaves out gender
# because it can't be summed:
SNsTotalPurchases_df = pd.DataFrame(SNs_df.sum())
# SNsTotalPurchases_df

In [6]:
# This makes a dataframe from SNs_df that shows how many 
# purchases each SN made:
SNsCountPurchases_df = pd.DataFrame(SNs_df.count())
# SNsCountPurchases_df

In [7]:
# This starts with the same groupby object of unique SNs, but turns
# it into a DataFrame that retains each SN's gender:
SNsTotalByFirst_df = pd.DataFrame(SNs_df.first())
# SNsTotalByFirst_df
# I'll need both SNsTotalPurchases_df and SNsCountPurchases_df later.

In [8]:
# 1. Count unique players
PNcount = len(SNs_df)
print(f"Total number of players: {PNcount}")

Total number of players: 576


In [9]:
# 2. Purchasing analysis
#    a. no. of unique items
itemCount = len(purchaseData_df["Item ID"].value_counts())

print(f"Number of unique items: {itemCount}")

Number of unique items: 179


In [10]:
#    b. avg purchase price
purchaseAvg = purchaseData_df["Price"].sum() / purchaseData_df["Price"].count()
showPurchaseAvg = "$" + str(round(float(purchaseAvg), 2))

print(f"Average purchase price: {showPurchaseAvg}")

Average purchase price: $3.05


In [11]:
#    c. total no. of purchases
purchaseCount = purchaseData_df["SN"].count()

print(f"Total number of purchases: {purchaseCount}")

Total number of purchases: 780


In [12]:
#    d. total revenue
totalRevenue = purchaseData_df["Price"].sum()
showTotalRevenue = "$" + str(totalRevenue)

print(f"Total revenue: {showTotalRevenue}")

Total revenue: $2379.77


In [13]:
# 3. Gender demographics

# This is a groupby object created to get total sales by gender
# (in the Price column).
SalesByGender = purchaseData_df.groupby(["Gender"])
SalesByGender_df = pd.DataFrame(SalesByGender.sum())
# SalesByGender_df
# Ignore the sums of Purchase ID, Age, and Item ID, which are meaningless.

In [14]:
# This is a groupby object of unique PNs by gender, 
# using the dataframe of unique SNs that retains gender...:
genderReport = SNsTotalByFirst_df.groupby(["Gender"])
# ...and making it a DataFrame:
genderReport_df = pd.DataFrame(genderReport.count())
# genderReport_df

In [15]:
#    a. pct and count of male players

In [16]:
# count
maleCount = int(genderReport_df.iloc[1][1])

print(f"Number of players who identify as male: {maleCount}")

Number of players who identify as male: 484


In [17]:
# percent
malePct = maleCount / PNcount
malePctShow = str(round(malePct*100,1))+"%"

print(f"Percent of players who identify as male: {malePctShow}")

Percent of players who identify as male: 84.0%


In [18]:
#    b. pct and count of female players

In [19]:
# count
femaleCount = int(genderReport_df.iloc[0][0])

print(f"Number of players who identify as female: {femaleCount}")

Number of players who identify as female: 81


In [20]:
# percent
femalePct = femaleCount / PNcount
femalePctShow = str(round(femalePct*100,1))+"%"

print(f"Percent of players who identify as female: {femalePctShow}")

Percent of players who identify as female: 14.1%


In [21]:
#    c. pct and count of other/non-disclosed gender players

In [22]:
# count
nonBinCount = int(genderReport_df.iloc[2][2])
nonBinCount

print(f"Number of players who do not identify as male or female\nor who did not disclose their gender: {nonBinCount}")

Number of players who do not identify as male or female
or who did not disclose their gender: 11


In [23]:
# percent
nonBinPct = nonBinCount / PNcount
nonBinPctShow = str(round(nonBinPct*100,1))+"%"

print(f"Pct of players who do not identify as male or female\nor who did not disclose their gender: {nonBinPctShow}")

Pct of players who do not identify as male or female
or who did not disclose their gender: 1.9%


In [24]:
# 4. Purchasing analysis: gender
SalesCountByGender = purchaseData_df.groupby(["Gender"])
SalesCountByGender_df = pd.DataFrame(SalesCountByGender.count())
# SalesCountByGender_df

In [25]:
#    a. male players

In [26]:
#         i. purchase count
salesCountMale = SalesCountByGender_df.loc['Male','SN']
print(f"Number of purchases made by players who identify as male: {salesCountMale}")

Number of purchases made by players who identify as male: 652


In [27]:
#        ii. total purchase value
salesMale = SalesByGender_df.loc['Male','Price']
salesMaleShow = "$"+str(round(float(salesMale),2))

print(f"Total of purchases made by players who identify as male: {salesMaleShow}")

Total of purchases made by players who identify as male: $1967.64


In [28]:
#       iii. avg purchase price
salesAvgMale = salesMale / salesCountMale
salesAvgMaleShow = "$"+str(round(float(salesAvgMale),2))

print(f"Average purchase made by players who identify as male: {salesAvgMaleShow}")

Average purchase made by players who identify as male: $3.02


In [29]:
#        iv. avg purchase total per person by gender
salesAvgPerMale = salesMale / maleCount
salesAvgPerMaleShow = "$"+str(round(float(salesAvgPerMale),2))

print(f"Average purchase total per player who identifies as male: {salesAvgPerMaleShow}")

Average purchase total per player who identifies as male: $4.07


In [30]:
#    b. female players

In [31]:
#         i. purchase count
salesCountFemale = SalesCountByGender_df.loc['Female','SN']

print(f"Number of purchases made by players who identify as female: {salesCountFemale}")

Number of purchases made by players who identify as female: 113


In [32]:
#        ii. total purchase value
salesFemale = SalesByGender_df.loc['Female','Price']
salesFemaleShow = "$"+str(round(float(salesFemale),2))

print(f"Total of purchases made by players who identify as female: {salesFemaleShow}")

Total of purchases made by players who identify as female: $361.94


In [33]:
#       iii. avg purchase price
salesAvgFemale = salesFemale / salesCountFemale
salesAvgFemaleShow = "$"+str(round(float(salesAvgFemale),2))

print(f"Average purchase made by players who identify as female: {salesAvgFemaleShow}")

Average purchase made by players who identify as female: $3.2


In [34]:
#        iv. avg purchase total per person by gender
salesAvgPerFemale = salesFemale / femaleCount
salesAvgPerFemaleShow = "$"+str(round(float(salesAvgPerFemale),2))

print(f"Average purchase total per player who identifies as female: {salesAvgPerFemaleShow}")

Average purchase total per player who identifies as female: $4.47


In [35]:
#    c. players of other/non-disclosed gender

In [36]:
#         i. purchase count
salesCountNonBin = SalesCountByGender_df.loc['Other / Non-Disclosed','SN']

print(f"Number of purchases made by players who do not identify as male or female\nor who did not disclose their gender: {salesCountNonBin}")

Number of purchases made by players who do not identify as male or female
or who did not disclose their gender: 15


In [37]:
#        ii. total purchase value
salesNonBin = SalesByGender_df.loc['Other / Non-Disclosed','Price']
salesNonBinShow = "$"+str(round(float(salesNonBin),2))


print(f"Total of purchases made by players who do not identify as male or female\nor who did not disclose their gender: {salesNonBinShow}")

Total of purchases made by players who do not identify as male or female
or who did not disclose their gender: $50.19


In [38]:
#       iii. avg purchase price
salesAvgNonBin = salesNonBin / salesCountNonBin
salesAvgNonBinShow = "$"+str(round(float(salesAvgNonBin),2))


print(f"Average purchase made by players who do not identify as male or female\nor who did not disclose their gender: {salesAvgNonBinShow}")

Average purchase made by players who do not identify as male or female
or who did not disclose their gender: $3.35


In [39]:
#        iv. avg purchase total per person by gender
salesAvgPerNonBin = salesNonBin / nonBinCount
salesAvgPerNonBinShow = "$"+str(round(float(salesAvgPerNonBin),2))
salesAvgPerNonBinShow



print(f"Average purchase total per player who does not identify as male or female\nor who did not disclose their gender: {salesAvgPerNonBinShow}")

Average purchase total per player who does not identify as male or female
or who did not disclose their gender: $4.56


In [40]:
# 5. Age demographics in bins of 4 years: <10, 10-14, 15-19, und so weiter:
#    a. purchase count
#    b. avg purchase price
#    c. total purchase value
#    d. avg purchase total per person by age group
# (This must use purchaseData_df.)


# This will add a new column to purchaseData_df classifying each player into an age range:
# bins = [0, 10, 14, 19, 24, 29, 34, 39, 44]
# group_names = ["<10", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", ">40"]
# purchaseData_df["Age_Range"] = pd.cut(purchaseData_df["Age"], bins, labels=group_names, include_lowest=True)
# purchaseData_df

bins = [0, 10, 14, 19, 24, 29, 34, 39, 44]
groupNames = ["<10", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", ">40"]
#This will add a new column to purchaseData_df
purchaseData_df["Total_Age"] = pd.cut(purchaseData_df["Age"], bins, labels=groupNames, include_lowest=True)
ageTotal = purchaseData_df["Total_Age"].value_counts()
# ageTotal

In [44]:
#    a. purchase count
agePurchaseCounts = purchaseData_df.groupby(["Total_Age"]).count()["Price"].rename("Purchase Count")
#    b. avg_purchase_price
ageAvgPurchasePrice= round(purchaseData_df.groupby(["Total_Age"]).mean()["Price"].rename("Avg Purchase Price"),2)
#    c. total purchase value
totalPurchaseValueAge = purchaseData_df.groupby(["Total_Age"]).sum()["Price"].rename("Total Purchase Value")
#Total Purchase Value
purchaseTotalByAge = purchaseData_df.groupby(["Total_Age"]).sum()["Price"].rename("Total Purchase Value")
#    d. avg purchase total per person by age group
avgPurchasePerson = round(purchaseTotalByAge  / ageTotal, 2)
ageDemographicSummary = pd.DataFrame({"Purchase Count": agePurchaseCounts, "Average Purchase Price": ageAvgPurchasePrice, "Total Purchase Value": totalPurchaseValueAge, "Avg Purchase Total Per Person" :avgPurchasePerson})

ageDemographicSummary


Unnamed: 0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Purchase Total Per Person
<10,32,3.4,108.96,3.4
10-14,19,2.68,50.95,2.68
15-19,136,3.04,412.89,3.04
20-24,365,3.05,1114.06,3.05
25-29,101,2.9,293.0,2.9
30-34,73,2.93,214.0,2.93
35-39,41,3.6,147.67,3.6
>40,12,3.04,36.54,3.04


In [46]:
# 6. Top spenders
#    a. identify the top five spenders by total purchase value
# using the dataframe SNsTotalPurchases_df
# and turning it into a new DataFrame:
SNsTop_df = SNsTotalPurchases_df.sort_values(by="Price", ascending=False)
SNsTop_df = SNsTop_df.head()
SNsTop_df = SNsTop_df.sort_index()
# SNsTop_df
# Ignore Purchase ID, Age, and Item ID in the resulting table and only use Price.

Unnamed: 0_level_0,Purchase ID,Age,Item ID,Price
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chamjask73,1306,66,339,13.83
Idastidru52,1999,96,527,15.45
Iral74,2285,84,518,13.62
Iskadarya95,713,60,321,13.1
Lisosia93,1630,125,442,18.96


In [47]:
# This searches SNsCountPurchases_df for the top five spenders in SNsTop,
# returns how many purchases each of them made, and adds them to SNsTop.
# Now how do I do that?

In [48]:
#    b. and list in a table:
#         i. SN
#        ii. purchase count
#       iii. avg purchase price
#        iv. total purchase value

# I'll keep working with SNsTop_df. I can add a column...
SNsTop_df["Purchase Count"] = 1
# But how to populate it?
# SNsTop_df

In [49]:
# I'd like to retrieve the name of each of the top five players from SNsTop_df,
# but it seems I need to take the index and make it a data column first:
SNsTop_df.reset_index(inplace=True)
# SNsTop_df

In [50]:
# Now I can fetch each the top spenders' SN...:
TopSpndr5 = SNsTop_df.iloc[0,0]
# TopSpndr5

In [51]:
TopSpndr4 = SNsTop_df.iloc[1,0]
# TopSpndr4

In [52]:
TopSpndr3 = SNsTop_df.iloc[2,0]
# TopSpndr3

In [53]:
TopSpndr2 = SNsTop_df.iloc[3,0]
# TopSpndr2

In [54]:
TopSpndr1 = SNsTop_df.iloc[4,0]
# TopSpndr1

In [55]:
TopSpndr5purchCt = int(SNsCountPurchases_df.loc[TopSpndr5,"Purchase ID"])
# TopSpndr5purchCt

In [56]:
# ...but how do I get the Purchase Count column to store each top spender's purchase count?

#test = SNsTop_df.iloc[1, 6]
#test

SNsTop_df.iloc[0,5] = TopSpndr5purchCt
# SNsTop_df

In [57]:
TopSpndr4purchCt = int(SNsCountPurchases_df.loc[TopSpndr4,"Purchase ID"])
SNsTop_df.iloc[1,5] = TopSpndr4purchCt

In [58]:
TopSpndr3purchCt = int(SNsCountPurchases_df.loc[TopSpndr3,"Purchase ID"])
SNsTop_df.iloc[2,5] = TopSpndr3purchCt

In [59]:
TopSpndr2purchCt = int(SNsCountPurchases_df.loc[TopSpndr2,"Purchase ID"])
SNsTop_df.iloc[3,5] = TopSpndr5purchCt

In [60]:
TopSpndr1purchCt = int(SNsCountPurchases_df.loc[TopSpndr1,"Purchase ID"])
SNsTop_df.iloc[4,5] = TopSpndr5purchCt
# SNsTop_df

In [61]:
# I know avg purchase price is just total / count but how do I do that?
SNsTop_df["Avg Purchase Price"] = 1 #value of Price column / value of Purchase Count column
# SNsTop_df

In [62]:
# Read this SN's purchase total from SNsTop_df:
TopSpndr5PurchTotal = SNsTop_df.iloc[0,4]

#Calculate this SN's average purchase:
TopSpndr5AvgPurch = TopSpndr5PurchTotal / TopSpndr5purchCt

#Write this SN's average purchase to SNsTop_df:
SNsTop_df.iloc[0,6] = round(TopSpndr5AvgPurch,2)

In [63]:
TopSpndr4PurchTotal = SNsTop_df.iloc[1,4]
TopSpndr4AvgPurch = TopSpndr4PurchTotal / TopSpndr4purchCt
SNsTop_df.iloc[1,6] = round(TopSpndr4AvgPurch,2)

In [64]:
TopSpndr3PurchTotal = SNsTop_df.iloc[2,4]
TopSpndr3AvgPurch = TopSpndr3PurchTotal / TopSpndr3purchCt
SNsTop_df.iloc[2,6] = round(TopSpndr3AvgPurch,2)

In [65]:
TopSpndr2PurchTotal = SNsTop_df.iloc[3,4]
TopSpndr2AvgPurch = TopSpndr2PurchTotal / TopSpndr2purchCt
SNsTop_df.iloc[3,6] = round(TopSpndr2AvgPurch,2)

In [66]:
TopSpndr1PurchTotal = SNsTop_df.iloc[4,4]
TopSpndr1AvgPurch = TopSpndr1PurchTotal / TopSpndr1purchCt
SNsTop_df.iloc[4,6] = round(TopSpndr1AvgPurch,2)

#rename Price as Total Purchase Value
SNsTop_df = SNsTop_df.rename(columns={"Price" : "Total Purchase Value"})

In [67]:
#rename Purchase ID as [blank]
SNsTop_df = SNsTop_df.rename(columns={"Purchase ID" : ""})
SNsTop_df.iloc[0,1] = ""
SNsTop_df.iloc[1,1] = ""
SNsTop_df.iloc[2,1] = ""
SNsTop_df.iloc[3,1] = ""
SNsTop_df.iloc[4,1] = ""

# SNsTop_df

Unnamed: 0,SN,Unnamed: 2,Age,Item ID,Total Purchase Value,Purchase Count,Avg Purchase Price
0,Chamjask73,,66,339,13.83,3,4.61
1,Idastidru52,,96,527,15.45,4,3.86
2,Iral74,,84,518,13.62,4,3.4
3,Iskadarya95,,60,321,13.1,3,4.37
4,Lisosia93,,125,442,18.96,3,3.79


In [68]:
#rename Age as [blank]
SNsTop_df = SNsTop_df.rename(columns={"Age" : ""})

SNsTop_df.iloc[0,2] = ""
SNsTop_df.iloc[1,2] = ""
SNsTop_df.iloc[2,2] = ""
SNsTop_df.iloc[3,2] = ""
SNsTop_df.iloc[4,2] = ""

SNsTop_df

Unnamed: 0,SN,Unnamed: 2,Unnamed: 3,Item ID,Total Purchase Value,Purchase Count,Avg Purchase Price
0,Chamjask73,,,339,13.83,3,4.61
1,Idastidru52,,,527,15.45,4,3.86
2,Iral74,,,518,13.62,4,3.4
3,Iskadarya95,,,321,13.1,3,4.37
4,Lisosia93,,,442,18.96,3,3.79


In [69]:
# 7. Most popular items
#    a. identify the five most popular items purchase count
#    b. and list in a table:
#         i. item ID
#        ii. item name
#       iii. purchase count
#        iv. total purchase value

In [70]:
itemsCount = purchaseData_df.groupby(["Item ID"])
itemsCount_df = pd.DataFrame(itemsCount.count())
# itemsCount_df

Unnamed: 0_level_0,Purchase ID,SN,Age,Gender,Item Name,Price,Total_Age
Item ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,4,4,4,4,4,4,4
1,4,4,4,4,4,4,4
2,6,6,6,6,6,6,6
3,6,6,6,6,6,6,6
4,5,5,5,5,5,5,5
...,...,...,...,...,...,...,...
178,12,12,12,12,12,12,12
179,6,6,6,6,6,6,6
181,5,5,5,5,5,5,5
182,3,3,3,3,3,3,3


In [71]:
itemsTopCount_df = itemsCount_df.sort_values(by="Price", ascending=False)
itemsTopCount_df = itemsTopCount_df.head()
# itemsTopCount_df

Unnamed: 0_level_0,Purchase ID,SN,Age,Gender,Item Name,Price,Total_Age
Item ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
92,13,13,13,13,13,13,13
178,12,12,12,12,12,12,12
145,9,9,9,9,9,9,9
132,9,9,9,9,9,9,9
108,9,9,9,9,9,9,9


In [76]:
itemsSales_df = purchaseData_df.loc[:,["Item ID", "Item Name", "Price"]]
itemsSales_total = itemsSales_df.groupby(["Item ID", "Item Name"]).sum()["Price"]
purchaseCount = itemsSales_df.groupby(["Item ID", "Item Name"]).count()["Price"]
itemsSalesData = pd.DataFrame({"Price":itemsSales_total, "Purchase Count":purchaseCount})
itemsSalesData = itemsSalesData.sort_values(by="Purchase Count", ascending=False)
itemsSalesData.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Purchase Count
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1
92,Final Critic,59.99,13
178,"Oathbreaker, Last Hope of the Breaking Storm",50.76,12
145,Fiery Glass Crusader,41.22,9
132,Persuasion,28.99,9
108,"Extraction, Quickblade Of Trembling Hands",31.77,9


In [73]:
# 8 most profitable items
#    a. identify the five most profitable items by total purchase value
#    b. and list in a table:
#         i. item ID
#        ii. item name
#       iii. purchase count
#        iv. item price
#         v. total purchase value

itemsTopSales_df = itemsSalesData.sort_values(by="Price", ascending=False)
itemsTopSales_df = itemsTopSales_df.head()
itemsTopSales_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Purchase Count
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1
92,Final Critic,59.99,13
178,"Oathbreaker, Last Hope of the Breaking Storm",50.76,12
82,Nirvana,44.1,9
145,Fiery Glass Crusader,41.22,9
103,Singed Scalpel,34.8,8


## Three observable trends:
### 1. Most players--84.0 percent--identify themselves in Heroes of Pymoji as male and account for most of the sales--82.6 percent.
### 2. However, players who do not identify as male in-game spend considerably more per purchase than players who identify themselves as male in-game.
### 3. And players who do not identify as male in-game spend considerably more on average overall than players who identify themselves as male in-game.

#### Thank you for examining this homework assignment (nunber 4: Pandas challenge)
#### in the U of Minnesota Data Analytics and Visualization Boot Camp, Winter 2021.
#### Please email Paul Bernhardt papadiscobravo@gmail.com with questions.