welcome to
Heroes of Pymoji: The Sales Analysis

In [1]:
# dependencies
import pandas as pd
import numpy as np

In [None]:
# Relative path to the purchase-log CSV that the rest of the notebook analyses.
dataFilepath = "Resources/purchase_data.csv"
dataFilepath

In [None]:
# Read the CSV at dataFilepath into a DataFrame (one row per purchase) and
# preview its first five rows.
purchaseData_df = pd.read_csv(dataFilepath)
purchaseData_df.head()

In [None]:
# Group the purchase rows by screen name (SN). Despite the "_df" suffix this is
# a GroupBy object; the aggregation cells below turn it into DataFrames.
SNs_df = purchaseData_df.groupby(by=["SN"])
# Note: head() on a GroupBy returns the first five rows of *each* group.
SNs_df.head()

In [None]:
# Per-player totals: sum every numeric column for each SN.
# numeric_only=True keeps text columns (e.g. Gender, Item Name) out of the
# result on every pandas version; recent releases would otherwise concatenate
# the strings and shift the column positions that later .iloc lookups rely on.
# Only the Price total is meaningful here; the summed IDs and ages are not.
SNsTotalPurchases_df = SNs_df.sum(numeric_only=True)
SNsTotalPurchases_df

In [None]:
# Per-player purchase counts: count() tallies the non-null entries of every
# column for each SN, so (assuming no missing values) each column holds that
# player's number of purchases.
SNsCountPurchases_df = SNs_df.count()
SNsCountPurchases_df

In [None]:
# One row per SN built from the same GroupBy: first() keeps the first value of
# every column, so text columns such as Gender survive (unlike with sum()).
SNsTotalByFirst_df = SNs_df.first()
SNsTotalByFirst_df
# SNsTotalPurchases_df and SNsCountPurchases_df are both needed again later.

In [None]:
# 1. Count unique players: one group per distinct SN in the GroupBy.
PNcount = SNs_df.ngroups
PNcount

In [None]:
# 2. Purchasing analysis
#    a. number of unique items (distinct Item ID values)
itemCount = purchaseData_df["Item ID"].nunique()
itemCount

In [None]:
#    b. average purchase price
# mean() is the sum of the non-null prices divided by their count.
purchaseAvg = purchaseData_df["Price"].mean()
# Always show exactly two decimals so a value such as 3.1 renders as "$3.10".
showPurchaseAvg = "${:.2f}".format(purchaseAvg)
showPurchaseAvg

In [None]:
#    c. total number of purchases (non-null SN entries, one per purchase row)
purchaseCount = purchaseData_df.loc[:, "SN"].count()
purchaseCount

In [None]:
#    d. total revenue
totalRevenue = purchaseData_df["Price"].sum()
# Format as currency with thousands separators and exactly two decimals;
# str() of the raw float sum can expose binary floating-point noise or drop a
# trailing zero.
showTotalRevenue = "${:,.2f}".format(totalRevenue)
showTotalRevenue

In [None]:
# 3. Gender demographics

# Total sales by gender: group the purchase rows by Gender and sum the numeric
# columns. numeric_only=True keeps text columns (SN, Item Name) from being
# concatenated into the result on recent pandas versions.
SalesByGender = purchaseData_df.groupby(["Gender"])
SalesByGender_df = SalesByGender.sum(numeric_only=True)
SalesByGender_df
# Only the Price column is used later; the sums of Purchase ID, Age, and
# Item ID are meaningless.

In [None]:
# Unique players per gender: regroup the one-row-per-SN frame (which still
# carries Gender) and count the rows in each group.
genderReport = SNsTotalByFirst_df.groupby(by=["Gender"])
genderReport_df = genderReport.count()
genderReport_df

In [None]:
#    a. pct and count of male players

In [None]:
# count
maleCount = int(genderReport_df.iloc[1][1])
maleCount

In [None]:
# percent
malePct = maleCount / PNcount
malePctShow = str(round(malePct*100,1))+"%"
malePctShow

In [None]:
#    b. pct and count of female players

In [None]:
# count
femaleCount = int(genderReport_df.iloc[0][0])
femaleCount

In [None]:
# percent
femalePct = femaleCount / PNcount
femalePctShow = str(round(femalePct*100,1))+"%"
femalePctShow

In [None]:
#    c. pct and count of other/non-disclosed gender players

In [None]:
# count
nonBinCount = int(genderReport_df.iloc[2][2])
nonBinCount

In [None]:
# percent
nonBinPct = nonBinCount / PNcount
nonBinPctShow = str(round(nonBinPct*100,1))+"%"
nonBinPctShow

In [None]:
# 4. Purchasing analysis: gender
# Count the purchase rows per gender; each column holds the non-null count.
SalesCountByGender = purchaseData_df.groupby(by=["Gender"])
SalesCountByGender_df = SalesCountByGender.count()
SalesCountByGender_df

In [None]:
#    a. male players

In [None]:
#         i. purchase count
salesCountMale = SalesCountByGender_df.loc['Male','SN']
salesCountMale

In [None]:
#        ii. total purchase value
salesMale = SalesByGender_df.loc['Male','Price']
salesMaleShow = "$"+str(round(float(salesMale),2))
salesMaleShow


In [None]:
#       iii. avg purchase price
salesAvgMale = salesMale / salesCountMale
salesAvgMaleShow = "$"+str(round(float(salesAvgMale),2))
salesAvgMaleShow

In [None]:
#        iv. avg purchase total per person by gender
salesAvgPerMale = salesMale / maleCount
salesAvgPerMaleShow = "$"+str(round(float(salesAvgPerMale),2))
salesAvgPerMaleShow

In [None]:
#    b. female players

In [None]:
#         i. purchase count
salesCountFemale = SalesCountByGender_df.loc['Female','SN']
salesCountFemale

In [None]:
#        ii. total purchase value
salesFemale = SalesByGender_df.loc['Female','Price']
salesFemaleShow = "$"+str(round(float(salesFemale),2))
salesFemaleShow

In [None]:
#       iii. avg purchase price
salesAvgFemale = salesFemale / salesCountFemale
salesAvgFemaleShow = "$"+str(round(float(salesAvgFemale),2))
salesAvgFemaleShow

In [None]:
#        iv. avg purchase total per person by gender
salesAvgPerFemale = salesFemale / femaleCount
salesAvgPerFemaleShow = "$"+str(round(float(salesAvgPerFemale),2))
salesAvgPerFemaleShow

In [None]:
#    c. players of other/non-disclosed gender

In [None]:
#         i. purchase count
salesCountNonBin = SalesCountByGender_df.loc['Other / Non-Disclosed','SN']
salesCountNonBin

In [None]:
#        ii. total purchase value
salesNonBin = SalesByGender_df.loc['Other / Non-Disclosed','Price']
salesNonBinShow = "$"+str(round(float(salesNonBin),2))
salesNonBinShow

In [None]:
#       iii. avg purchase price
salesAvgNonBin = salesNonBin / salesCountNonBin
salesAvgNonBinShow = "$"+str(round(float(salesAvgNonBin),2))
salesAvgNonBinShow

In [None]:
#        iv. avg purchase total per person by gender
salesAvgPerNonBin = salesNonBin / nonBinCount
salesAvgPerNonBinShow = "$"+str(round(float(salesAvgPerNonBin),2))
salesAvgPerNonBinShow

In [None]:
# 5. Age demographics in five-year bins: <10, 10-14, 15-19, and so on:
#    a. purchase count
#    b. avg purchase price
#    c. total purchase value
#    d. avg purchase total per person by age group
# (This must use purchaseData_df.)

# pd.cut builds right-closed intervals, so every edge is the *highest* age in
# its bin: [0, 9], (9, 14], (14, 19], ... An edge of 9 (not 10) keeps
# 10-year-olds in "10-14". The last edge is infinity so no age above 39 falls
# outside the bins and becomes NaN; its label is "40+" because the bin
# includes age 40 itself.
bins = [0, 9, 14, 19, 24, 29, 34, 39, np.inf]
groupNames = ["<10", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40+"]
# Adds a Total_Age column to purchaseData_df holding each purchase's age group.
purchaseData_df["Total_Age"] = pd.cut(purchaseData_df["Age"], bins, labels=groupNames, include_lowest=True)
# Number of purchase rows (not unique players) in each age group.
ageTotal = purchaseData_df["Total_Age"].value_counts()
ageTotal

In [None]:
# Select Price before aggregating so text columns are never summed or averaged
# (mean() over string columns raises on recent pandas). observed=False keeps
# empty age groups in the result, matching the long-standing default.
priceByAge = purchaseData_df.groupby("Total_Age", observed=False)["Price"]
#    a. purchase count
agePurchaseCounts = priceByAge.count().rename("Purchase Count")
#    b. avg purchase price
ageAvgPurchasePrice = priceByAge.mean().rename("Avg Purchase Price")
#    c. total purchase value
totalPurchaseValueAge = priceByAge.sum().rename("Total Purchase Value")
# Same totals under the second name this notebook already uses.
purchaseTotalByAge = totalPurchaseValueAge
#    d. avg purchase total per person by age group
# "Per person" means per unique player (distinct SN) in the age group, not per
# purchase row; dividing by the row count would just repeat the average price.
playersByAge = purchaseData_df.groupby("Total_Age", observed=False)["SN"].nunique()
avgPurchasePerson = (purchaseTotalByAge / playersByAge).round(2)
ageDemographicSummary = pd.DataFrame({
    "Purchase Count": agePurchaseCounts,
    "Average Purchase Price": ageAvgPurchasePrice,
    "Total Purchase Value": totalPurchaseValueAge,
    "Avg Purchase Per Person": avgPurchasePerson,
})

ageDemographicSummary


In [None]:
# 6. Top spenders
#    a. identify the top five spenders by total purchase value
# Rank the per-player totals by Price (highest first), keep the first five,
# then order those five rows by SN.
SNsTop_df = (
    SNsTotalPurchases_df
    .sort_values(by="Price", ascending=False)
    .head()
    .sort_index()
)
SNsTop_df
# Only the Price column matters here; ignore Purchase ID, Age, and Item ID.

In [None]:
# This searches SNsCountPurchases_df for the top five spenders in SNsTop,
# returns how many purchases each of them made, and adds them to SNsTop.
# Now how do I do that?

In [None]:
#    b. and list in a table:
#         i. SN
#        ii. purchase count
#       iii. avg purchase price
#        iv. total purchase value

# Add each top spender's number of purchases. SNsTop_df is still indexed by SN
# at this point, so the five SNs select the matching rows of the per-player
# count table, and the assignment aligns the values on that shared index.
SNsTop_df["Purchase Count"] = SNsCountPurchases_df.loc[SNsTop_df.index, "Purchase ID"]
SNsTop_df

In [None]:
# Move the SN index into an ordinary first column so each player's name can be
# read by position in the cells below.
SNsTop_df = SNsTop_df.reset_index()
SNsTop_df

In [None]:
# Now I can fetch each the top spenders' SN...:
TopSpndr5 = SNsTop_df.iloc[0,0]
TopSpndr5

In [None]:
TopSpndr4 = SNsTop_df.iloc[1,0]
TopSpndr4

In [None]:
TopSpndr3 = SNsTop_df.iloc[2,0]
TopSpndr3

In [None]:
TopSpndr2 = SNsTop_df.iloc[3,0]
TopSpndr2

In [None]:
TopSpndr1 = SNsTop_df.iloc[4,0]
TopSpndr1

In [None]:
# Number of purchases made by the player in row 0, looked up by SN in the
# per-player count table.
TopSpndr5purchCt = int(SNsCountPurchases_df.at[TopSpndr5, "Purchase ID"])
TopSpndr5purchCt

In [None]:
# Write row 0's purchase count into column position 5 (presumably the
# "Purchase Count" column after reset_index -- verify if the column layout
# ever changes).
SNsTop_df.iloc[0, 5] = TopSpndr5purchCt
SNsTop_df

In [None]:
TopSpndr4purchCt = int(SNsCountPurchases_df.loc[TopSpndr4,"Purchase ID"])
SNsTop_df.iloc[1,5] = TopSpndr4purchCt

In [None]:
TopSpndr3purchCt = int(SNsCountPurchases_df.loc[TopSpndr3,"Purchase ID"])
SNsTop_df.iloc[2,5] = TopSpndr3purchCt

In [None]:
TopSpndr2purchCt = int(SNsCountPurchases_df.loc[TopSpndr2,"Purchase ID"])
SNsTop_df.iloc[3,5] = TopSpndr5purchCt

In [None]:
TopSpndr1purchCt = int(SNsCountPurchases_df.loc[TopSpndr1,"Purchase ID"])
SNsTop_df.iloc[4,5] = TopSpndr5purchCt
SNsTop_df

In [None]:
# Average purchase price = total spent (Price) / number of purchases
# (Purchase Count), computed for every row at once and rounded to cents.
# This also creates the column as floats, so later float writes into it do not
# have to change an integer column's dtype.
SNsTop_df["Avg Purchase Price"] = (SNsTop_df["Price"] / SNsTop_df["Purchase Count"]).round(2)
SNsTop_df

In [None]:
# Read this SN's purchase total from SNsTop_df:
TopSpndr5PurchTotal = SNsTop_df.iloc[0,4]

#Calculate this SN's average purchase:
TopSpndr5AvgPurch = TopSpndr5PurchTotal / TopSpndr5purchCt

#Write this SN's average purchase to SNsTop_df:
SNsTop_df.iloc[0,6] = round(TopSpndr5AvgPurch,2)

In [None]:
TopSpndr4PurchTotal = SNsTop_df.iloc[1,4]
TopSpndr4AvgPurch = TopSpndr4PurchTotal / TopSpndr4purchCt
SNsTop_df.iloc[1,6] = round(TopSpndr4AvgPurch,2)

In [None]:
TopSpndr3PurchTotal = SNsTop_df.iloc[2,4]
TopSpndr3AvgPurch = TopSpndr3PurchTotal / TopSpndr3purchCt
SNsTop_df.iloc[2,6] = round(TopSpndr3AvgPurch,2)

In [None]:
TopSpndr2PurchTotal = SNsTop_df.iloc[3,4]
TopSpndr2AvgPurch = TopSpndr2PurchTotal / TopSpndr2purchCt
SNsTop_df.iloc[3,6] = round(TopSpndr2AvgPurch,2)

In [None]:
TopSpndr1PurchTotal = SNsTop_df.iloc[4,4]
TopSpndr1AvgPurch = TopSpndr1PurchTotal / TopSpndr1purchCt
SNsTop_df.iloc[4,6] = round(TopSpndr1AvgPurch,2)
SNsTop_df

In [None]:
# 7. Most popular items
#    a. identify the five most popular items purchase count
#    b. and list in a table:
#         i. item ID
#        ii. item name
#       iii. purchase count
#        iv. total purchase value

In [None]:
# Group the purchase rows by Item ID and count the non-null entries per
# column, giving the number of purchases of each item.
itemsCount = purchaseData_df.groupby(by=["Item ID"])
itemsCount_df = itemsCount.count()
itemsCount_df

In [None]:
# Five most-purchased items: order by the per-item count held in the Price
# column (highest first) and keep the first five rows.
itemsTopCount_df = (
    itemsCount_df
    .sort_values(by="Price", ascending=False)
    .head()
)
itemsTopCount_df

# Still to do: drop Purchase ID and Age, and add the item's name.

In [None]:
# Revenue and purchase count for each in-game item, keyed by (Item ID, Item Name).
pricesByItem = purchaseData_df.groupby(["Item ID", "Item Name"])["Price"]
# NOTE: this rebinds purchaseCount, replacing the scalar total computed earlier.
purchaseCount = pricesByItem.count()
# pd.DataFrame needs a dict of column name -> Series; a bare {a, b} is a set
# literal, and Series cannot be set members because they are unhashable.
itemsSales_df = pd.DataFrame({"Price": pricesByItem.sum(), "Purchase Count": purchaseCount})
itemsSales_df

In [None]:
# Keep only the item columns, then total and count the prices for every
# (Item ID, Item Name) pair and combine both results into one table.
itemsSales_df = purchaseData_df[["Item ID", "Item Name", "Price"]]
pricesPerItem = itemsSales_df.groupby(["Item ID", "Item Name"])["Price"]
itemsSales_total = pricesPerItem.sum()
purchaseCount = pricesPerItem.count()
itemsSales_data = pd.DataFrame({"Price": itemsSales_total, "Purchase Count": purchaseCount})
itemsSales_data.head(5)

In [None]:
# 8. Most profitable items
#    a. identify the five most profitable items by total purchase value
#    b. and list in a table:
#         i. item ID
#        ii. item name
#       iii. purchase count
#        iv. item price
#         v. total purchase value

# Order the per-item table by total revenue (the Price column), highest first,
# and keep the first five rows.
itemsTopSales_df = (
    itemsSales_data
    .sort_values(by="Price", ascending=False)
    .head()
)
itemsTopSales_df

In [None]:
# Thank you for examining this homework assignment (number 4: Pandas challenge)
# in the U of Minnesota Data Analytics and Visualization Boot Camp, Winter 2021.
# Please email Paul Bernhardt papadiscobravo@gmail.com with questions.