# Unit 4 Assignment | Pandas, Pandas, Pandas

## Option 1: Heroes of Pymoli


In [411]:
# Import toolset for this analysis

import pandas as pd
import numpy as np

# Pull data from csv into memory
# and set as a dataframe

purchase_data = pd.read_csv('purchase_data.csv')

In [412]:
# Render first row of dataframe

purchase_data.head(1)

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53


## Player Count

* Display the total number of players


In [413]:
# Calculate and print total number of players

print('Total number of players: ' + str(purchase_data.SN.nunique()))

Total number of players: 576


## Purchasing Analysis (Total)

* Run basic calculations to obtain number of unique items, average price, etc.


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame


In [414]:
# Perform calculations for purchasing analysis

item_count = str(purchase_data['Item Name'].nunique())
avg_price = str(purchase_data['Price'].mean())
sum_purchases = str(len(purchase_data['Purchase ID']))
total_revenue = str(purchase_data['Price'].sum())

In [415]:
# Pull it all together

summary = [item_count, avg_price, sum_purchases, total_revenue]
columns = ['Number of Unique Items', 'Average Price', 'Number of Purchases', 'Total Revenue']
test = list(zip(columns, summary))

In [416]:
# Render it as a dataframe

pd.DataFrame(summary,
             index = columns).T

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,179,3.050987179487176,780,2379.77


## Gender Demographics

* Percentage and Count of Male Players


* Percentage and Count of Female Players


* Percentage and Count of Other / Non-Disclosed




In [417]:
# Perform calculations for demographics analysis

male_count =   len(purchase_data[purchase_data['Gender'] == 'Male'])
female_count = len(purchase_data[purchase_data['Gender'] == 'Female'])
other_count =  len(purchase_data[purchase_data['Gender'] == 'Other / Non-Disclosed'])

total = len(purchase_data['Purchase ID'])

male_percentage = (int(male_count) / int(total)) * 100
female_percentage = (int(female_count) / int(total)) * 100
other_percentage = (int(other_count) / int(total)) * 100

In [418]:
# Pull it all together

gender_counts = [male_count, female_count, other_count]
gender_percentages = [male_percentage, female_percentage, other_percentage]
genders = ['Male', 'Female', 'Other / Non-Disclosed']

test = list(zip(gender_counts, gender_percentages))

In [419]:
# Render it as a dataframe

pd.DataFrame(test,
             index = genders,
             columns = ['Total Count', '% of Players'])

Unnamed: 0,Total Count,% of Players
Male,652,83.589744
Female,113,14.487179
Other / Non-Disclosed,15,1.923077



## Purchasing Analysis (Gender)

* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. by gender




* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [420]:
# Group purchase data by gender

gender_group = purchase_data.groupby('Gender')

# Define three of the four requested columns
# and perform the requisite calculations

average_purchase_price = gender_group.mean()['Price']
purchase_count = gender_group.count()['SN']
total_purchase_value = gender_group.sum()['Price']

# Group purchase data by gender and screen name
# and be sure to NOT set is as the index

gender_SN_group = purchase_data.groupby(['Gender', 'SN'], as_index = False).sum()

# Create interim groups by gender and calculate
# average total purchase value by gender
# (the fourth requested column)

interim_SN_group = gender_SN_group[gender_SN_group.Gender == 'Female'].groupby('SN').sum()
female_purchase_value = interim_SN_group.Price.mean()

interim_SN_group = gender_SN_group[gender_SN_group.Gender == 'Male'].groupby('SN').sum()
male_purchase_value = interim_SN_group.Price.mean()

interim_SN_group = gender_SN_group[gender_SN_group.Gender == 'Other / Non-Disclosed'].groupby('SN').sum()
other_nondisclosed_purchase_value = interim_SN_group.Price.mean()

average_total_purchase = pd.Series([female_purchase_value,
                                    male_purchase_value,
                                    other_nondisclosed_purchase_value],
                                    index = ['Female', 'Male', 'Other / Non-Disclosed'])

# Merge columns and transpose

gender_merge = pd.DataFrame([purchase_count,
                             average_purchase_price,
                             total_purchase_value,
                             average_total_purchase]).T

# Rename columns

gender_merge.columns = ['Purchase Count',
                        'Average Purchase Price',
                        'Total Purchase Value',
                        'Average Total Purchase Per Person']

# Render dataframe

gender_merge

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Average Total Purchase Per Person
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,113.0,3.203009,361.94,4.468395
Male,652.0,3.017853,1967.64,4.065372
Other / Non-Disclosed,15.0,3.346,50.19,4.562727


## Age Demographics

* Establish bins for ages


* Categorize the existing players using the age bins. Hint: use pd.cut()


* Calculate the numbers and percentages by age group


* Create a summary data frame to hold the results


* Optional: round the percentage column to two decimal points


* Display Age Demographics Table


In [421]:
# Set age breaks (0, 9 means <10, since 0 and 9 are inclusive),
# which implies the starting point for the next age break (10)

bins = [0,
        9,
        14,
        19,
        24,
        29,
        34,
        39,
        100]

# Create labels for age breaks

group_names = ["<10",
               "10-14",
               "15-19",
               "20-24",
               "25-29",
               "30-34",
               "35-39",
               "40+"]

# Create dataframe of pertinent data,
# (screen name and age, in this case)

interim_df = purchase_data[['SN', 'Age']]

# Drop dupes, given that the same player
# shouldn't be counted twice for these calculations

interim_df = interim_df.drop_duplicates()

# Pull age groups into dataframe as index

interim_df[''] = pd.cut(interim_df['Age'],
                        bins,
                        labels = group_names)

# Define "Total Count" column to set in dataframe

interim_df['Total Count'] = purchase_data.Age

# Group by age and count

age_demographics = interim_df.groupby('').count()

# Define "% of Total Players" and perform calculations

age_demographics['% of Total Players'] = (age_demographics['Total Count'] / len(purchase_data['Purchase ID'])) * 100

# Drop now unecessary columns

age_demographics.drop(['SN', 'Age'],
                      axis = 1,
                      inplace = True)

# Render dataframe

age_demographics

Unnamed: 0,Total Count,% of Total Players
,,
<10,17.0,2.179487
10-14,22.0,2.820513
15-19,107.0,13.717949
20-24,258.0,33.076923
25-29,77.0,9.871795
30-34,52.0,6.666667
35-39,31.0,3.974359
40+,12.0,1.538462


## Purchasing Analysis (Age)

* Bin the purchase_data data frame by age


* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. in the table below


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [422]:
# Create dataframe of pertinent data

interim_df = purchase_data[['Purchase ID', 'SN', 'Age', 'Price']]

# Pull age groups into dataframe as index

interim_df[''] = pd.cut(interim_df['Age'],
                        bins,
                        labels = group_names)

# Group by age and perform requisite calculations

purchase_count = interim_df.groupby('').count()['Purchase ID']
average_purchase_price = interim_df.groupby('').mean()['Price']
total_purchase_value = interim_df.groupby('').sum()['Price']

age_SN_group = purchase_data.groupby(['Age', 'SN'], as_index = False).sum()

# Create interim groups by age group and calculate
# average total purchase value by age group
# (the fourth requested column)

interim_SN_group = age_SN_group[age_SN_group.Age.between(0, 9)].groupby('SN').sum()
under_10_purchase_value = interim_SN_group.Price.mean()

interim_SN_group = age_SN_group[age_SN_group.Age.between(10, 14)].groupby('SN').sum()
ten_to_14_purchase_value = interim_SN_group.Price.mean()

interim_SN_group = age_SN_group[age_SN_group.Age.between(15, 19)].groupby('SN').sum()
fifteen_to_19_purchase_value = interim_SN_group.Price.mean()

interim_SN_group = age_SN_group[age_SN_group.Age.between(20, 24)].groupby('SN').sum()
twenty_to_24_purchase_value = interim_SN_group.Price.mean()

interim_SN_group = age_SN_group[age_SN_group.Age.between(25, 29)].groupby('SN').sum()
twenty_five_to_29_purchase_value = interim_SN_group.Price.mean()

interim_SN_group = age_SN_group[age_SN_group.Age.between(30, 34)].groupby('SN').sum()
thirty_to_34_purchase_value = interim_SN_group.Price.mean()

interim_SN_group = age_SN_group[age_SN_group.Age.between(35, 39)].groupby('SN').sum()
thirty_five_to_40_purchase_value = interim_SN_group.Price.mean()

interim_SN_group = age_SN_group[age_SN_group.Age.between(40, 100)].groupby('SN').sum()
over_40_purchase_value = interim_SN_group.Price.mean()

average_total_purchase_value = pd.Series([under_10_purchase_value,
                                    ten_to_14_purchase_value,
                                    fifteen_to_19_purchase_value,
                                    twenty_to_24_purchase_value,
                                    twenty_five_to_29_purchase_value,
                                    thirty_to_34_purchase_value,
                                    thirty_five_to_40_purchase_value,
                                    over_40_purchase_value],
                                    index = ['<10',
                                             '10-14',
                                             '15-19',
                                             '20-24',
                                             '25-29',
                                             '30-34',
                                             '35-39',
                                             '40+'])

# Set calculations as columns in dataframe

purchasing_analysis_by_age['Purchase Count'] = purchase_count
purchasing_analysis_by_age['Average Purchase Price'] = average_purchase_price
purchasing_analysis_by_age['Total Purchase Value'] = total_purchase_value
purchasing_analysis_by_age['Average Total Purchase Value Per Person'] = average_total_purchase_value

# Render the dataframe

purchasing_analysis_by_age

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,Purchase Count,Average Purchase Price,Total Purchase Value,Average Total Purchase Value Per Person
,,,,
<10,23.0,3.353478,77.13,4.537059
10-14,28.0,2.956429,82.78,3.762727
15-19,136.0,3.035956,412.89,3.858785
20-24,365.0,3.052219,1114.06,4.318062
25-29,101.0,2.90099,293.0,3.805195
30-34,73.0,2.931507,214.0,4.115385
35-39,41.0,3.601707,147.67,4.763548
40+,13.0,2.941538,38.24,3.186667


## Top Spenders

* Run basic calculations to obtain the results in the table below


* Create a summary data frame to hold the results


* Sort the total purchase value column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



In [423]:
interim_df = purchase_data[['Purchase ID',
                            'SN',
                            'Price']]

SN_group_count = interim_df.groupby('SN').count()

SN_group_count.rename(columns = {"Purchase ID" : "Purchase Count"},
                      inplace = True)

SN_group_count.drop(['Price'],
                    axis = 1,
                    inplace = True)

purchase_count = SN_group_count

SN_group_sum = interim_df.groupby('SN').sum()

SN_group_sum.sort_values('Price',
                         ascending = False,
                         inplace = True)

SN_group_sum.rename(columns = {"Price" : "Total Purchase Value"},
                      inplace = True)

SN_group_sum.drop(['Purchase ID'],
                  axis = 1,
                  inplace = True)

total_purchase_value = pd.DataFrame(SN_group_sum)

count_value_merge = purchase_count.join(total_purchase_value)

count_value_merge['Average Purchase Price'] = count_value_merge['Total Purchase Value'] / count_value_merge['Purchase Count']

count_value_merge = count_value_merge[['Purchase Count', 'Average Purchase Price', 'Total Purchase Value']]

count_value_merge.sort_values(by = 'Total Purchase Value',
                              ascending = False,
                              inplace = True)
count_value_merge.head(5)

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lisosia93,5,3.792,18.96
Idastidru52,4,3.8625,15.45
Chamjask73,3,4.61,13.83
Iral74,4,3.405,13.62
Iskadarya95,3,4.366667,13.1


## Most Popular Items

* Retrieve the Item ID, Item Name, and Item Price columns


* Group by Item ID and Item Name. Perform calculations to obtain purchase count, item price, and total purchase value


* Create a summary data frame to hold the results


* Sort the purchase count column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



In [435]:
# Create interim dataframe with relevant columns

interim_df = purchase_data[['Purchase ID',
                            'Item ID',
                            'Item Name',
                            'Price']]

# Group by Item ID and Item Name

item_group = interim_df.groupby(['Item ID', 'Item Name']).count()

# Rename Purchase ID column "Purchase Count"

item_group.rename(columns = {"Purchase ID" : "Purchase Count"},
                             inplace = True)

# Drop Price Column

item_group.drop(['Price'],
                axis = 1,
                inplace = True)

# Rename item_group dataframe "purchase_count"

purchase_count = pd.DataFrame(item_group)

# Group by item price

item_price = interim_df.groupby(['Item ID', 'Item Name']).sum()

# Rename Price column "Item Price"

item_price.rename(columns = {"Price" : "Item Price"},
                             inplace = True)

# Drop Purchase ID column

item_price.drop(['Purchase ID'],
                axis = 1,
                inplace = True)

item_price = pd.DataFrame(item_price)

price_group_sum = interim_df.groupby(['Item ID', 'Item Name']).sum()

price_group_sum.sort_values('Price',
                            ascending = False,
                            inplace = True)

price_group_sum.rename(columns = {"Price" : "Total Purchase Value"},
                       inplace = True)

price_group_sum.drop(['Purchase ID'],
                     axis = 1,
                     inplace = True)

total_purchase_value = pd.DataFrame(price_group_sum.head(5))

count_price_merge = purchase_count.join(item_price)

final_merge = count_price_merge.join(total_purchase_value).head(5)


final_merge.sort_values('Total Purchase Value',
                        ascending = False,
                        inplace = True)

final_merge.head(5)

# No idea why it's not sorting correctly, 
# and no more time to fix it 

Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Splinter,4,5.12,
1,Crucifer,3,9.78,
2,Verdict,6,14.88,
3,Phantomlight,6,14.94,
4,Bloodlord's Fetish,5,8.5,


## Most Profitable Items

* Sort the above table by total purchase value in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the data frame



In [438]:
final_merge.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Splinter,4,5.12,
1,Crucifer,3,9.78,
2,Verdict,6,14.88,
3,Phantomlight,6,14.94,
4,Bloodlord's Fetish,5,8.5,


## Insights


* Players categorized as "Other / Non-Disclosed" with regards to gender outspend male players on average (per person) by 12%, and female players by 2%; female players outspend male players on average (per person) by 10%.


* However, in terms of total purchase volume (as measured in dollars), male players outspend female players by 446%, and players categorized as "Other / Non-Disclosed" with regards to gender by 3820%.


* Topping the "Top 5" list of most valuable game items (in terms of the total value of purchases) is Oathbreaker, Last Hope of the Breaking Storm, which was 46% more valuable than the bottom item on that list, Singed Scalpel. 