In [1]:
# Dependencies and Setup
import pandas as pd
pd.options.display.float_format = '${:,.2f}'.format

In [2]:
# Load and Read purchasing file
dataset = pd.read_csv('Purchase_data.csv')
# Check info
#dataset.info()

## Part 1: Player Count

In [3]:
unique_players = len(pd.unique(dataset['SN']))

In [4]:
part1_df = pd.DataFrame(
    {"Total Players": [unique_players]})
part1_df

Unnamed: 0,Total Players
0,576


## Part 2: Purchasing Analysis (Total)

In [5]:
unique_items = len(pd.unique(dataset['Item Name'])) #unique_items
total_revenue = sum(dataset['Price']) #total_revenue
purchase_count = len(dataset) #purchase_count
average_price = dataset['Price'].mean() #average_price

In [6]:
part2_df = pd.DataFrame(
    {"Number of Unique Items": [unique_items],
     "Average Price": [average_price],
     "Number of Purchases": [purchase_count],
     "Total Revenue": [total_revenue]
     })
part2_df

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,179,$3.05,780,"$2,379.77"


## Part 3: Gender Demographics

In [7]:
dataset_unique = dataset.drop_duplicates(subset='SN', keep='first', inplace=False)
counts = dataset_unique['Gender'].value_counts()
percents = dataset_unique['Gender'].value_counts(normalize = True)

In [8]:
part3_df = pd.DataFrame(counts)
part3_df['Total Counts'] = list(counts)
part3_df['Percentage of Players'] = list(percents)
part3_df['Percentage of Players'] = pd.Series(["{0:.2f}%".format(val * 100) for val in part3_df['Percentage of Players']], index = part3_df.index)
part3_df = part3_df.drop('Gender', axis = 1)
part3_df.head()

Unnamed: 0,Total Counts,Percentage of Players
Male,484,84.03%
Female,81,14.06%
Other / Non-Disclosed,11,1.91%


## Part 4: Purchasing Analysis (Gender)

In [9]:
dataset_unique = dataset.drop_duplicates(subset='SN', keep='first', inplace=False)
# Grouping the dataset by Id
groups = dataset.groupby('SN')
# getting sum of each individual 
price_sum = groups['Price'].sum()

In [10]:
# Assigning the sum values to col7 to replace the original one as it only accounted for the first item make sure we have accurate information
dataset_unique['Price'] = list(price_sum)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
groups = dataset.groupby('Gender')
groups2 = dataset_unique.groupby('Gender')

In [12]:
#Getting statistics for females
female_dataset = groups.get_group('Female')
#female_dataset.head()
female_dataset['Price'].mean()

3.203008849557519

In [13]:
f_count = female_dataset['Price'].count()
#f_count2 = groups2.get_group('Female')['Price'].count()
f_sum = groups.get_group('Female')['Price'].sum()
f_mean = groups.get_group('Female')['Price'].mean()
f_mean2 = groups2.get_group('Female')['Price'].mean()

In [14]:
#Getting statistics for males
m_count = groups.get_group('Male')['Price'].count()
m_sum = round(groups.get_group('Male')['Price'].sum(),2)
m_mean = round(groups.get_group('Male')['Price'].mean(),2)
m_mean2 = round(groups2.get_group('Female')['Price'].mean(),2)

In [15]:
#Getting statistics for others
o_count = groups.get_group('Other / Non-Disclosed')['Price'].count()
o_sum = round(groups.get_group('Other / Non-Disclosed')['Price'].sum(),2)
o_mean = round(groups.get_group('Other / Non-Disclosed')['Price'].mean(),2)
o_mean2 = round(groups2.get_group('Other / Non-Disclosed')['Price'].mean(),2)

In [16]:
all_data = {'Female': (f_count, f_mean, f_sum, f_mean2),
           'Males': (m_count, m_mean, m_sum, m_mean2),
            'Other / Non-Disclosed': (o_count, o_mean, o_sum, o_mean2)}

In [17]:
dataframe4 = pd.DataFrame(all_data, index = ['Purchase Count','Average Purchase Price', 'Total Purchase Value','Average Total Purchase per Person' ])

In [18]:
dataframe4.T


Unnamed: 0,Purchase Count,Average Purchase Price,Total Purchase Value,Average Total Purchase per Person
Female,$113.00,$3.20,$361.94,$4.19
Males,$652.00,$3.02,"$1,967.64",$4.19
Other / Non-Disclosed,$15.00,$3.35,$50.19,$4.65


## Part 5: Age Demographics

In [19]:
# Bins and percentages
bin1 = len(dataset['SN'].loc[dataset.Age< 10].unique())
bin1_percent = round(len(dataset['SN'].loc[dataset.Age < 10].unique())/len(dataset['SN'].unique()),4)*100
bin2 = len(dataset['SN'].loc[dataset.Age.between(10, 14)].unique())
bin2_percent = round(len(dataset['SN'].loc[dataset.Age.between(10, 14)].unique())/len(dataset['SN'].unique()), 2)*100
bin3 = len(dataset['SN'].loc[dataset.Age.between(15, 19)].unique())
bin3_percent = round(len(dataset['SN'].loc[dataset.Age.between(15, 19)].unique())/len(dataset['SN'].unique()), 2)*100
bin4 = len(dataset['SN'].loc[dataset.Age.between(20, 24)].unique())
bin4_percent= round(len(dataset['SN'].loc[dataset.Age.between(20, 24)].unique())/len(dataset['SN'].unique()), 2)*100
bin5 = len(dataset['SN'].loc[dataset.Age.between(25, 29)].unique())
bin5_percent= round(len(dataset['SN'].loc[dataset.Age.between(25, 29)].unique())/len(dataset['SN'].unique()), 2)*100
bin6 = len(dataset['SN'].loc[dataset.Age.between(30, 34)].unique())
bin6_percent= round(len(dataset['SN'].loc[dataset.Age.between(30, 34)].unique())/len(dataset['SN'].unique()), 2)*100
bin7 = len(dataset['SN'].loc[dataset.Age.between(35, 39)].unique())
bin7_percent= round(len(dataset['SN'].loc[dataset.Age.between(35, 39)].unique())/len(dataset['SN'].unique()), 2)*100
bin8 = len(dataset['SN'].loc[dataset.Age >= 40].unique())
bin8_percent= round(len(dataset['SN'].loc[dataset.Age >= 40].unique())/len(dataset['SN'].unique()), 2)*100

In [20]:
dict5 = {'Total_count':(bin1,bin2,bin3,bin4,bin5,bin6,bin7,bin8),
             'Percentage':(bin1_percent,bin2_percent,bin3_percent,bin4_percent,bin5_percent,bin6_percent,bin7_percent,bin8_percent)}

In [21]:
dataframe5 = pd.DataFrame(dict5, index = ['<10', '10-14', '15-19','20-24','25-29',
                                         '30-34', '35-39', '40+'])
dataframe5

Unnamed: 0,Total_count,Percentage
<10,17,$2.95
10-14,22,$4.00
15-19,107,$19.00
20-24,258,$45.00
25-29,77,$13.00
30-34,52,$9.00
35-39,31,$5.00
40+,12,$2.00


## Part 6: Purchasing Analysis (Age)

In [22]:
bin1 = len(dataset.loc[dataset.Age < 10]['Price'])
bin1_total = dataset.loc[dataset.Age < 10]['Price'].sum()
bin1_average = dataset.loc[dataset.Age < 10]['Price'].mean()
bin1_average2 = dataset_unique.loc[dataset.Age < 10]['Price'].mean()

bin2 = len(dataset.loc[dataset.Age.between(10,14)]['Price'])
bin2_total = dataset.loc[dataset.Age.between(10,14)]['Price'].sum()
bin2_average = dataset.loc[dataset.Age.between(10,14)]['Price'].mean()
bin2_average2 = dataset_unique.loc[dataset.Age.between(10,14)]['Price'].mean()

bin3 = len(dataset.loc[dataset.Age.between(15,19)]['Price'])
bin3_total = dataset.loc[dataset.Age.between(15,19)]['Price'].sum()
bin3_average = dataset.loc[dataset.Age.between(15,19)]['Price'].mean()
bin3_average2 = dataset_unique.loc[dataset.Age.between(15,19)]['Price'].mean()

bin4 = len(dataset.loc[dataset.Age.between(20,24)]['Price'])
bin4_total = dataset.loc[dataset.Age.between(20,24)]['Price'].sum()
bin4_average = dataset.loc[dataset.Age.between(20,24)]['Price'].mean()
bin4_average2 = dataset_unique.loc[dataset.Age.between(20,24)]['Price'].mean()

bin5 = len(dataset.loc[dataset.Age.between(25,29)]['Price'])
bin5_total = dataset.loc[dataset.Age.between(25,29)]['Price'].sum()
bin5_average = dataset.loc[dataset.Age.between(25,29)]['Price'].mean()
bin5_average2 = dataset_unique.loc[dataset.Age.between(25,29)]['Price'].mean()

bin6 = len(dataset.loc[dataset.Age.between(30,34)]['Price'])
bin6_total = dataset.loc[dataset.Age.between(30,34)]['Price'].sum()
bin6_average = dataset.loc[dataset.Age.between(30,34)]['Price'].mean()
bin6_average2 = dataset_unique.loc[dataset.Age.between(30,34)]['Price'].mean()

bin7 = len(dataset.loc[dataset.Age.between(35,39)]['Price'])
bin7_total = dataset.loc[dataset.Age.between(35,39)]['Price'].sum()
bin7_average = dataset.loc[dataset.Age.between(35,39)]['Price'].mean()
bin7_average2 = dataset_unique.loc[dataset.Age.between(35,39)]['Price'].mean()

bin8 = len(dataset.loc[dataset.Age > 40]['Price'])
bin8_total = dataset.loc[dataset.Age > 40]['Price'].sum()
bin8_average = dataset.loc[dataset.Age > 40]['Price'].mean()
bin8_average2 = dataset_unique.loc[dataset.Age > 40]['Price'].mean()

In [23]:
dict_6 =  {'Total_count':(bin1,bin2,bin3,bin4,bin5,bin6,bin7, bin8),
          'Total':(bin1_total,bin2_total,bin3_total,bin4_total,bin5_total,bin6_total,bin7_total, bin8_total),
          'average_price': (bin1_average,bin2_average,bin3_average,bin4_average,bin5_average,
           bin6_average,bin7_average,bin8_average), 
          'Average_per_person':(bin1_average2,bin2_average2,bin3_average2,bin4_average2,bin5_average2,
           bin6_average2,bin7_average2,bin8_average2)}

In [24]:
dataframe_6 = pd.DataFrame(dict_6, index = ['<10', '10-14', '15-19','20-24','25-29',
                                         '30-34', '35-39', '40+'])

dataframe_6

Unnamed: 0,Total_count,Total,average_price,Average_per_person
<10,23,$77.13,$3.35,$5.32
10-14,28,$82.78,$2.96,$4.32
15-19,136,$412.89,$3.04,$4.11
20-24,365,"$1,114.06",$3.05,$4.10
25-29,101,$293.00,$2.90,$4.16
30-34,73,$214.00,$2.93,$3.86
35-39,41,$147.67,$3.60,$4.48
40+,7,$21.53,$3.08,$3.15
