In [1]:
# Import the pandas and NumPy libraries.
# NumPy is short for "Numerical Python"; it is the library that provides
# numeric arrays and numeric operations.
import pandas as pd
import numpy as np

# Create a dictionary that maps each column name (key) to a list of values.
# The three lists are parallel: position i in each list describes the same sale.
data1 = {
    'Salesrep': ['Sam', 'Sam', 'Wendy', 'Wendy', 'Dan'],
    'Weekday': ['Monday', 'Tuesday', 'Monday', 'Tuesday', 'Monday'],
    'Salesmade': [200, 100, 300, 200, 400],
}
data1

{'Salesrep': ['Sam', 'Sam', 'Wendy', 'Wendy', 'Dan'],
 'Weekday': ['Monday', 'Tuesday', 'Monday', 'Tuesday', 'Monday'],
 'Salesmade': [200, 100, 300, 200, 400]}

In [2]:
# Create a DataFrame directly from the dictionary `data1`.
# A DataFrame is a 2-D structure where each column in the DataFrame is a Series.
# Each dictionary key becomes a column name and each list becomes that column's values.
df = pd.DataFrame(data1)
# A bare expression on the last line of a cell is displayed as the cell's output.
df

Unnamed: 0,Salesrep,Weekday,Salesmade
0,Sam,Monday,200
1,Sam,Tuesday,100
2,Wendy,Monday,300
3,Wendy,Tuesday,200
4,Dan,Monday,400


In [3]:
# Create a GroupBy object in which the rows of `df` are grouped by 'Salesrep'.
# Therefore, when we later apply an aggregation (mean, sum, std, ...),
# it is computed separately for each group, i.e. for each Salesrep.
repgrp = df.groupby('Salesrep')

In [4]:
# Display the average 'Salesmade' for each 'Salesrep'.
# Selecting the 'Salesmade' column first limits the mean to that numeric column.
repgrp['Salesmade'].mean()

# Older versions of pandas (before 2.0) silently skipped non-numeric columns, so this worked:
#repgrp.mean()
# On current pandas, use repgrp.mean(numeric_only=True) to average only the numeric columns.

Salesrep
Dan      400.0
Sam      150.0
Wendy    250.0
Name: Salesmade, dtype: float64

In [5]:
# This does the same as above, but chains groupby directly on the DataFrame
# instead of going through the variable repgrp.
df.groupby('Salesrep')['Salesmade'].mean()

# Older versions of pandas (before 2.0) silently skipped non-numeric columns, so this worked:
#df.groupby('Salesrep').mean()

Salesrep
Dan      400.0
Sam      150.0
Wendy    250.0
Name: Salesmade, dtype: float64

In [6]:
# Display the total of every non-grouping column for each 'Salesrep'.
# NOTE: 'Weekday' holds strings, so its "sum" is string concatenation
# (e.g. 'MondayTuesday' in the output), which is not a meaningful total.
# Use repgrp.sum(numeric_only=True) to total only the numeric columns.
repgrp.sum()

Unnamed: 0_level_0,Weekday,Salesmade
Salesrep,Unnamed: 1_level_1,Unnamed: 2_level_1
Dan,Monday,400
Sam,MondayTuesday,300
Wendy,MondayTuesday,500


In [7]:
# Display total 'Salesmade' for each 'Salesrep'.
# Selecting the column first means the string column 'Weekday' is not summed.
repgrp['Salesmade'].sum()

Salesrep
Dan      400
Sam      300
Wendy    500
Name: Salesmade, dtype: int64

In [8]:
# Display standard deviation of 'Salesmade' for each 'Salesrep'.
# pandas computes the sample standard deviation by default (ddof=1), which needs
# at least two values; Dan has only one sale, so his result is NaN.
repgrp['Salesmade'].std()

Salesrep
Dan            NaN
Sam      70.710678
Wendy    70.710678
Name: Salesmade, dtype: float64

In [9]:
# Display average 'Salesmade' for each 'Salesrep'.
# This repeats the earlier df.groupby(...) example; it is the base expression
# that the next cells extend with .iloc / .loc to pick out a single result.
df.groupby('Salesrep')['Salesmade'].mean()

# Older versions of pandas (before 2.0) silently skipped non-numeric columns, so this worked:
#df.groupby('Salesrep').mean()

Salesrep
Dan      400.0
Sam      150.0
Wendy    250.0
Name: Salesmade, dtype: float64

In [10]:
# Display average 'Salesmade' for each 'Salesrep', but only show the first row.
# .iloc[0] selects by position; the groups appear in sorted key order,
# so the first row is 'Dan'.
df.groupby('Salesrep')['Salesmade'].mean().iloc[0]

# Older versions of pandas (before 2.0):
#df.groupby('Salesrep').mean().iloc[0]

400.0

In [11]:
# Display average 'Salesmade' for each 'Salesrep', but only show Dan's result.
# .loc['Dan'] selects by index label (the group key) rather than by position.
df.groupby('Salesrep')['Salesmade'].mean().loc['Dan']

# Older versions of pandas (before 2.0):
#df.groupby('Salesrep').mean().loc['Dan']

400.0

In [12]:
# Count the non-missing values in each column, for each 'Salesrep'.
# Unlike sum/mean, count also works on the string column 'Weekday'.
df.groupby('Salesrep').count()

Unnamed: 0_level_0,Weekday,Salesmade
Salesrep,Unnamed: 1_level_1,Unnamed: 2_level_1
Dan,1,1
Sam,2,2
Wendy,2,2


In [13]:
# Display total 'Salesmade' for each 'Salesrep'.
# This is another way to do it: chain groupby directly on the DataFrame
# instead of reusing the variable repgrp.
df.groupby('Salesrep')['Salesmade'].sum()

Salesrep
Dan      400
Sam      300
Wendy    500
Name: Salesmade, dtype: int64

## Self Practice Exercises

1) Which day has a greater average salesmade (per salesrep)?

In [14]:
# Average 'Salesmade' per weekday, written as a step-by-step method chain.
(
    df
    .groupby('Weekday')['Salesmade']  # split rows by day, keep the sales column
    .mean()                           # average within each day
)

Weekday
Monday     300.0
Tuesday    150.0
Name: Salesmade, dtype: float64

2) What's the maximum salesmade for each day (by one sales rep)?

In [15]:
# Largest single 'Salesmade' value per weekday, written as a method chain.
(
    df
    .groupby('Weekday')['Salesmade']  # split rows by day, keep the sales column
    .max()                            # largest value within each day
)

Weekday
Monday     400
Tuesday    200
Name: Salesmade, dtype: int64

3) Display the output of a groupby on both Weekday and Salesrep, as shown below. The numeric column shows the average Salesmade.

In [16]:
# Average 'Salesmade' for every (Weekday, Salesrep) combination.
# Grouping by a list of columns produces a two-level index: Weekday, then Salesrep.
(
    df
    .groupby(['Weekday', 'Salesrep'])['Salesmade']  # split by day, then by rep
    .mean()                                         # average within each pair
)

Weekday  Salesrep
Monday   Dan         400.0
         Sam         200.0
         Wendy       300.0
Tuesday  Sam         100.0
         Wendy       200.0
Name: Salesmade, dtype: float64