https://www.datacamp.com/courses/manipulating-dataframes-with-pandas
# 1. Categoricals and groupby

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [2]:
# Toy sales table: one row per (weekday, city) pair with bread and butter figures.
sales = pd.DataFrame(data={
    'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],
    'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],
    'bread': [139, 237, 326, 456],
    'butter': [20, 45, 70, 98],
})
sales

Unnamed: 0,weekday,city,bread,butter
0,Sun,Austin,139,20
1,Sun,Dallas,237,45
2,Mon,Austin,326,70
3,Mon,Dallas,456,98


## 1) df.groupby('xx')

In [3]:
sales.loc[sales.weekday=='Sun']

Unnamed: 0,weekday,city,bread,butter
0,Sun,Austin,139,20
1,Sun,Dallas,237,45


In [4]:
sales.loc[sales.weekday=='Sun'].count()

weekday    2
city       2
bread      2
butter     2
dtype: int64

In [5]:
sales.groupby('weekday').count()

Unnamed: 0_level_0,city,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mon,2,2,2
Sun,2,2,2


In [6]:
sales.groupby('weekday')['bread'].sum()

weekday
Mon    782
Sun    376
Name: bread, dtype: int64

In [7]:
sales.groupby('weekday')[['bread', 'butter']].sum()

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,782,168
Sun,376,65


In [8]:
sales.groupby(['city', 'weekday']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,bread,butter
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1
Austin,Mon,326,70
Austin,Sun,139,20
Dallas,Mon,456,98
Dallas,Sun,237,45


In [9]:
customers = pd.Series(['Dave', 'Alice', 'Bob', 'Alice'])
customers

0     Dave
1    Alice
2      Bob
3    Alice
dtype: object

In [10]:
sales.groupby(customers)['bread'].sum()

Alice    693
Bob      326
Dave     139
Name: bread, dtype: int64

## 2) Categorical data: use less memory and speed up operations like groupby( )

In [11]:
sales.weekday.unique()

array(['Sun', 'Mon'], dtype=object)

In [12]:
# Convert the weekday labels to the 'category' dtype (replaces the existing column in place).
sales['weekday'] = sales.weekday.astype('category')
sales['weekday']

0    Sun
1    Sun
2    Mon
3    Mon
Name: weekday, dtype: category
Categories (2, object): [Mon, Sun]

# Practice 1
- 'embarked' column indicates at which of the three ports the passenger boarded the Titanic. 'S' stands for Southampton, England, 'C' for Cherbourg, France and 'Q' for Queenstown, Ireland.

In [13]:
tan = pd.read_csv('datasets/titanic.csv')
tan.head(3)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [14]:
byclass = tan.groupby('pclass').count()
byclass

Unnamed: 0_level_0,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,323,323,323,284,323,323,323,323,256,321,201,35,289
2,277,277,277,261,277,277,277,277,23,277,112,31,261
3,709,709,709,501,709,709,709,708,16,709,173,55,195


In [15]:
tan.groupby('pclass')['survived'].count()

pclass
1    323
2    277
3    709
Name: survived, dtype: int64

In [16]:
bymultiple = tan.groupby(['embarked', 'pclass']).count()
bymultiple

Unnamed: 0_level_0,Unnamed: 1_level_0,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,boat,body,home.dest
embarked,pclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C,1,141,141,141,128,141,141,141,141,111,98,14,121
C,2,28,28,28,24,28,28,28,28,4,15,2,27
C,3,101,101,101,60,101,101,101,101,3,36,9,24
Q,1,3,3,3,3,3,3,3,3,3,2,1,3
Q,2,7,7,7,5,7,7,7,7,1,2,0,4
Q,3,113,113,113,42,113,113,113,113,1,34,6,30
S,1,177,177,177,151,177,177,177,177,140,99,20,164
S,2,242,242,242,232,242,242,242,242,18,95,29,230
S,3,495,495,495,399,495,495,495,494,12,103,40,141


In [17]:
tan.groupby(['embarked', 'pclass'])['survived'].count()

embarked  pclass
C         1         141
          2          28
          3         101
Q         1           3
          2           7
          3         113
S         1         177
          2         242
          3         495
Name: survived, dtype: int64

In [18]:
life = pd.read_csv('datasets/gapminder_life.csv', index_col='Country')
life.head(3)

Unnamed: 0_level_0,Year,fertility,life,population,child_mortality,gdp
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,1964,7.671,33.639,10474903.0,339.7,1182.0
Afghanistan,1965,7.671,34.152,10697983.0,334.1,1182.0
Afghanistan,1966,7.671,34.662,10927724.0,328.7,1168.0


In [19]:
regions = pd.read_csv('datasets/gapminder_regions.csv', index_col='Country')
regions.head(3)

Unnamed: 0_level_0,region
Country,Unnamed: 1_level_1
Afghanistan,South Asia
Afghanistan,South Asia
Afghanistan,South Asia


In [20]:
life.groupby(regions['region']).mean()

Unnamed: 0_level_0,Year,fertility,life,population,child_mortality,gdp
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
America,1988.5,3.486061,68.722251,17745720.0,50.513292,11599.921875
East Asia & Pacific,1988.510931,3.725836,66.108632,54686190.0,59.337826,13336.156923
Europe & Central Asia,1988.550781,2.214177,71.931303,16003580.0,30.180168,18442.045417
Middle East & North Africa,1988.5,4.970019,65.194301,11713030.0,69.884533,27510.731579
South Asia,1988.5,5.004162,57.13771,140678200.0,137.76715,2552.65
Sub-Saharan Africa,1988.5,5.956105,51.664426,10509980.0,158.917473,3152.428511


# 2. Groupby & aggregation

In [21]:
sales

Unnamed: 0,weekday,city,bread,butter
0,Sun,Austin,139,20
1,Sun,Dallas,237,45
2,Mon,Austin,326,70
3,Mon,Dallas,456,98


In [22]:
sales.groupby('city')[['bread', 'butter']].max()

Unnamed: 0_level_0,bread,butter
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Austin,326,70
Dallas,456,98


In [23]:
sales.groupby('city')[['bread', 'butter']].agg(['max', 'sum'])

Unnamed: 0_level_0,bread,bread,butter,butter
Unnamed: 0_level_1,max,sum,max,sum
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Austin,326,465,70,90
Dallas,456,693,98,143


In [24]:
def datarange(yes):
    """Return the spread of ``yes``: its largest value minus its smallest."""
    top, bottom = yes.max(), yes.min()
    return top - bottom
sales.groupby('weekday')[['bread', 'butter']].agg(datarange)

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,130,28
Sun,98,25


In [25]:
sales.groupby(customers)[['bread', 'butter']].agg({'bread': 'sum', 'butter': datarange})

Unnamed: 0,bread,butter
Alice,693,53
Bob,326,0
Dave,139,0


# Practice 2

In [26]:
anything = tan.groupby('pclass')[['age', 'fare']].agg(['max', 'median'])
anything

Unnamed: 0_level_0,age,age,fare,fare
Unnamed: 0_level_1,max,median,max,median
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,80.0,39.0,512.3292,60.0
2,70.0,29.0,73.5,15.0458
3,74.0,24.0,69.55,8.05


In [27]:
anything.loc[:, ('age', 'max')]

pclass
1    80.0
2    70.0
3    74.0
Name: (age, max), dtype: float64

In [28]:
anything.loc[:, ('fare', 'median')]

pclass
1    60.0000
2    15.0458
3     8.0500
Name: (fare, median), dtype: float64

### Compute the total population, spread of per capita GDP values and average child mortality rate.

In [29]:
gap = pd.read_csv('datasets/gapminder_tidy.csv', index_col=['Year', 'region', 'Country']).sort_index()
gap.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fertility,life,population,child_mortality,gdp
Year,region,Country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1964,America,Antigua and Barbuda,4.25,63.775,58653.0,72.78,5008.0
1964,America,Argentina,3.068,65.388,21966478.0,57.43,8227.0
1964,America,Aruba,4.059,67.113,57031.0,,5505.0


In [30]:
def a(b):
    """Return the range of ``b`` (its maximum minus its minimum)."""
    hi = b.max()
    lo = b.min()
    return hi - lo


# Column -> aggregation mapping: summed population, mean child mortality, gdp spread.
see = dict(population='sum', child_mortality='mean', gdp=a)

In [31]:
gap.groupby(level=['Year', 'region']).agg(see).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,population,child_mortality,gdp
Year,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1964,America,462195700.0,113.950667,18314.0
1964,East Asia & Pacific,1110668000.0,129.10913,66821.0
1964,Europe & Central Asia,698854500.0,61.585319,28734.0


### df.index.strftime('%a'): to transform the index datetime values to abbreviated days of the week.

In [32]:
techsale = pd.read_csv('datasets/sales/sales-feb-2015.csv', parse_dates=True, index_col='Date')
techsale.head()

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 08:30:00,Hooli,Software,3
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-03 14:00:00,Initech,Software,13
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-04 22:00:00,Acme Coporation,Hardware,14


In [33]:
techsale.groupby(techsale.index.strftime('%a'))['Units'].sum()

Mon    48
Sat     7
Thu    59
Tue    13
Wed    48
Name: Units, dtype: int64

# 3. Groupby & transformation
- z-score: the distance of a value from the mean of its population, measured in units of standard deviation.

In [34]:
def a(b):
    """Standardise ``b``: subtract its mean, then divide by its standard deviation.

    The mean and standard deviation are whatever ``b.mean()`` and ``b.std()``
    return for the input object.

    NOTE: this rebinds the name ``a`` that an earlier cell used for the
    max-minus-min helper.
    """
    centered = b - b.mean()
    return centered / b.std()

In [35]:
auto = pd.read_csv('datasets/auto-mpg.csv')
auto.head(3)

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name,color,size
0,18.0,6,250.0,88,3139,14.5,71,US,ford mustang,blue,15.0
1,9.0,8,304.0,193,4732,18.5,70,US,hi 1200d,blue,20.0
2,36.1,4,91.0,60,1800,16.4,78,Asia,honda civic cvcc,red,10.0


### The second row, -1.85, means the mpg rating for 'hi 1200d' is more than 1 (close to 2) standard deviations below the mean computed over all the automobiles listed from 1970 to 1982.

In [36]:
a(auto.mpg).head()

0   -0.697747
1   -1.850853
2    1.621277
3   -0.633685
4    1.390656
Name: mpg, dtype: float64

### Normalized by year, the mpg rating for 'hi 1200d' is now only about 1.6 standard deviations below the average among cars manufactured in 1970.

In [37]:
# mpg z-score by year
auto.groupby('yr')['mpg'].transform(a).head()

0   -0.466040
1   -1.627511
2    1.745261
3   -0.730243
4    0.072075
Name: mpg, dtype: float64

In [38]:
def c(d):
    """Build a frame from ``d`` holding the standardised 'mpg' column
    (via ``a``), the 'yr' column renamed to 'year', and the 'name' column.
    """
    columns = {
        'mpg': a(d['mpg']),
        'year': d['yr'],
        'name': d['name'],
    }
    return pd.DataFrame(columns)

In [39]:
# group_keys=False keeps auto's original row index on the result instead of
# prepending the 'yr' group key as an extra index level.
# NOTE(review): c reads d['yr'], the grouping column; pandas 2.2+ warns when
# apply operates on grouping columns -- confirm against the installed version.
auto.groupby('yr', group_keys=False).apply(c).head()

Unnamed: 0,mpg,year,name
0,-0.46604,71,ford mustang
1,-1.627511,70,hi 1200d
2,1.745261,78,honda civic cvcc
3,-0.730243,77,ford granada
4,0.072075,80,audi 4000


# Practice 3
### The z-score is also useful for finding outliers: a value whose z-score is beyond +/- 3 is generally considered to be an outlier.

In [40]:
# Select the two columns with a list; tuple-style ['life', 'fertility'] selection
# on a GroupBy is rejected by current pandas.
e = gap.groupby('region')[['life', 'fertility']].transform(zscore)
e.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,life,fertility
Year,region,Country,Unnamed: 3_level_1,Unnamed: 4_level_1
1964,America,Antigua and Barbuda,-0.729776,0.525844
1964,America,Argentina,-0.49184,-0.287766
1964,America,Aruba,-0.237383,0.394373


In [41]:
# to identify outliers
gap.loc[(e['life'] < -3)|(e['fertility'] > 3)].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fertility,life,population,child_mortality,gdp
Year,region,Country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1964,America,Bolivia,6.607,43.913,3668568.0,265.4,2971.0
1964,America,Guatemala,6.434,47.884,4636016.0,201.5,3722.0
1964,America,Haiti,6.233,44.464,4188276.0,273.0,2024.0
1964,East Asia & Pacific,Timor-Leste,6.347,35.724,537278.0,318.95,701.0
1964,Europe & Central Asia,Turkey,6.029,48.313,31109820.0,221.9,5296.0


### Filling missing data (imputation) by group

In [42]:
def ab(cd):
    """Return ``cd`` with its missing entries replaced by the median of ``cd``."""
    fill_value = cd.median()
    return cd.fillna(fill_value)
tan.groupby(['sex', 'pclass'])['age'].transform(ab).head()

0    29.00
1     0.92
2     2.00
3    30.00
4    25.00
Name: age, dtype: float64

In [43]:
def disparity(gr):
    """Summarise the 'gdp' column of ``gr``.

    Returns a DataFrame with two columns:
    'z(gdp)' -- each gdp value minus the mean, divided by the standard deviation;
    'regional spread(gdp)' -- max minus min of gdp, repeated on every row.
    """
    gdp = gr['gdp']
    spread = gdp.max() - gdp.min()
    zscores = (gdp - gdp.mean()) / gdp.std()
    return pd.DataFrame({'z(gdp)': zscores, 'regional spread(gdp)': spread})

# group_keys=False keeps gap's own (Year, region, Country) index on the result
# rather than prepending the group key as an additional index level.
gap.groupby('region', group_keys=False).apply(disparity).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z(gdp),regional spread(gdp)
Year,region,Country,Unnamed: 3_level_1,Unnamed: 4_level_1
1964,America,Antigua and Barbuda,-0.715944,49764.0
1964,America,Argentina,-0.366331,49764.0
1964,America,Aruba,-0.661965,49764.0
1964,America,Bahamas,0.712486,49764.0
1964,America,Barbados,-0.64285,49764.0


# 4. Groupby and filtering
We group all automobiles by year, select the mpg column, and compute the average over each year.

In [44]:
auto.groupby('yr')['mpg'].mean().head()

yr
70    17.689655
71    21.111111
72    18.714286
73    17.100000
74    22.769231
Name: mpg, dtype: float64

In [45]:
split = auto.groupby('yr')
type(split)

pandas.core.groupby.generic.DataFrameGroupBy

In [46]:
type(split.groups)

dict

In [47]:
split.groups.keys()

dict_keys([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82])

# Practice 4
### 1) Calculate the mean survival rates on the 'C' deck:

In [51]:
def c_deck_survival(gr):
    """Return the mean of 'survived' over the rows of ``gr`` whose 'cabin' starts with 'C'.

    Rows with a missing cabin are treated as not being on the C deck.
    """
    # na=False makes missing cabins False in the mask directly, so no
    # object-dtype fillna (and its downcasting warning) is needed.
    c_passengers = gr['cabin'].str.startswith('C', na=False)
    return gr.loc[c_passengers, 'survived'].mean()
# Hand apply only the columns the helper reads, so it never operates on the
# grouping column 'sex'.
tan.groupby('sex')[['cabin', 'survived']].apply(c_deck_survival)

sex
female    0.913043
male      0.312500
dtype: float64

### 2) To find out what fraction of children under 10 survived in each 'pclass'

In [62]:
# Label each passenger by age band. A missing age compares False against 10,
# so those passengers are labelled 'over 10' as well.
under10 = tan['age'].lt(10).map({False: 'over 10', True: 'under 10'})
under10.head()

0     over 10
1    under 10
2    under 10
3     over 10
4     over 10
Name: age, dtype: object

In [60]:
tan.groupby(under10).survived.mean()

age
over 10     0.366748
under 10    0.609756
Name: survived, dtype: float64

In [61]:
tan.groupby([under10, 'pclass'])['survived'].mean()

age       pclass
over 10   1         0.617555
          2         0.380392
          3         0.238897
under 10  1         0.750000
          2         1.000000
          3         0.446429
Name: survived, dtype: float64

### 3) Grouping and filtering with .filter( )

In [53]:
techsale.groupby('Company')['Units'].sum()

Company
Acme Coporation    34
Hooli              30
Initech            30
Mediacore          45
Streeplex          36
Name: Units, dtype: int64

In [56]:
# Keep only the rows of companies whose summed Units exceed 35.
techsale.groupby('Company').filter(lambda grp: grp['Units'].sum() > 35)

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-09 09:00:00,Streeplex,Service,19
2015-02-09 13:00:00,Mediacore,Software,7
2015-02-19 11:00:00,Mediacore,Hardware,16
2015-02-19 16:00:00,Mediacore,Service,10
2015-02-21 05:00:00,Mediacore,Software,3
2015-02-26 09:00:00,Streeplex,Service,4
