In [1]:
import pandas as pd
from urllib.request import urlretrieve

In [3]:
italy_covid_url = 'https://gist.githubusercontent.com/aakashns/f6a004fa20c84fec53262f9a8bfee775/raw/f309558b1cf5103424cef58e2ecb8704dcd4d74c/italy-covid-daywise.csv'

urlretrieve(italy_covid_url, 'italy-covid-daywise.csv')

('italy-covid-daywise.csv', <http.client.HTTPMessage at 0x1b63be62c48>)

In [4]:
data = pd.read_csv(
    'italy-covid-daywise.csv',
    dtype={
        'date': 'object',
        'new_cases': 'float32',
        'new_deaths': 'float32',
        'new_tests': 'float32'
    }
)

#Specifying dtype would reduce the memory usage

#### Below you can observe the significant drop in memory usage when dtype is specified

In [5]:
pd.read_csv(
    'italy-covid-daywise.csv',
    dtype={
        'date': 'object',
        'new_cases': 'float16',
        'new_deaths': 'float16',
        'new_tests': 'float16'
    }
).memory_usage(deep=True)

Index           128
date          16616
new_cases       496
new_deaths      496
new_tests       496
dtype: int64

In [6]:
pd.read_csv(
    'italy-covid-daywise.csv',
).memory_usage(deep=True)

Index           128
date          16616
new_cases      1984
new_deaths     1984
new_tests      1984
dtype: int64

In [7]:
data.head()

Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,


In [8]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        248 non-null    object 
 1   new_cases   248 non-null    float32
 2   new_deaths  248 non-null    float32
 3   new_tests   135 non-null    float32
dtypes: float32(3), object(1)
memory usage: 5.0+ KB


In [9]:
# Set the number of max columns or rows to be displayed

pd.options.display.max_columns = 10

In [10]:
data.memory_usage(deep=True)

Index           128
date          16616
new_cases       992
new_deaths      992
new_tests       992
dtype: int64

In [11]:
# Returns descriptive statistics of the data

data.describe(include='all')

Unnamed: 0,date,new_cases,new_deaths,new_tests
count,248,248.0,248.0,135.0
unique,248,,,
top,2019-12-31,,,
freq,1,,,
mean,,1094.818604,143.133072,31699.673828
std,,1554.508057,227.105545,11622.209961
min,,-148.0,-31.0,7841.0
25%,,123.0,3.0,25259.0
50%,,342.0,17.0,29545.0
75%,,1371.75,175.25,37711.0


In [12]:
# Return 5 records at random

data.sample(5)

Unnamed: 0,date,new_cases,new_deaths,new_tests
119,2020-04-28,1739.0,333.0,37554.0
203,2020-07-21,190.0,13.0,23915.0
28,2020-01-28,0.0,0.0,
66,2020-03-06,769.0,41.0,
34,2020-02-03,0.0,0.0,


### Indexing data using `loc`, `iloc` and `at`

### Using `loc` we can access data by referencing to explict index and using `iloc` we can access data by referencing to implicit index

In [13]:
data.at[147, 'new_cases']

300.0

In [14]:
data.loc[147, :]

date          2020-05-26
new_cases          300.0
new_deaths          92.0
new_tests        33944.0
Name: 147, dtype: object

In [15]:
data.iloc[147, :]

date          2020-05-26
new_cases          300.0
new_deaths          92.0
new_tests        33944.0
Name: 147, dtype: object

#### Above we were able to use 147 for `loc` as well as `iloc` because there the index/label and integer position was equal to 144

In [16]:
new_data = data.sample(5).copy()
new_data

Unnamed: 0,date,new_cases,new_deaths,new_tests
131,2020-05-10,1083.0,194.0,31384.0
66,2020-03-06,769.0,41.0,
60,2020-02-29,238.0,4.0,
77,2020-03-17,4000.0,347.0,
173,2020-06-21,264.0,49.0,24581.0


In [17]:
new_data.index

Int64Index([131, 66, 60, 77, 173], dtype='int64')

In [18]:
# We change index/label, so we will have to specify different values for loc and iloc

new_data.index = ['a', 'b', 'c', 'd', 'e']

In [19]:
new_data

Unnamed: 0,date,new_cases,new_deaths,new_tests
a,2020-05-10,1083.0,194.0,31384.0
b,2020-03-06,769.0,41.0,
c,2020-02-29,238.0,4.0,
d,2020-03-17,4000.0,347.0,
e,2020-06-21,264.0,49.0,24581.0


#### `dataframe.loc['row_label', 'column_label']` 

In [20]:
new_data.loc['a', :] # here `a` is explicit index 

date          2020-05-10
new_cases         1083.0
new_deaths         194.0
new_tests        31384.0
Name: a, dtype: object

#### `dataframe.iloc['integer_position_of_row', 'integer_position_of_column']` 

In [21]:
new_data.iloc[0, 0] # here 0 is implicit index

'2020-05-10'

#### `dataframe.at['row_label', 'column_label']` 

In [22]:
new_data.at['a', 'new_cases']

1083.0

In [23]:
new_data.head()

Unnamed: 0,date,new_cases,new_deaths,new_tests
a,2020-05-10,1083.0,194.0,31384.0
b,2020-03-06,769.0,41.0,
c,2020-02-29,238.0,4.0,
d,2020-03-17,4000.0,347.0,
e,2020-06-21,264.0,49.0,24581.0


In [24]:
# Filter data and few columns without using `dataframe.loc`

data[data.new_cases>250][['new_cases', 'new_deaths']]

Unnamed: 0,new_cases,new_deaths
62,561.0,6.0
63,347.0,17.0
64,466.0,28.0
65,587.0,27.0
66,769.0,41.0
...,...,...
243,1444.0,1.0
244,1365.0,4.0
245,996.0,6.0
246,975.0,8.0


In [25]:
# Filter data and few columns using `dataframe.loc`

data.loc[data.new_cases>250, ['new_cases', 'new_deaths']]

Unnamed: 0,new_cases,new_deaths
62,561.0,6.0
63,347.0,17.0
64,466.0,28.0
65,587.0,27.0
66,769.0,41.0
...,...,...
243,1444.0,1.0
244,1365.0,4.0
245,996.0,6.0
246,975.0,8.0


In [26]:
# Check sum of null values across columns

data.isnull().sum()

date            0
new_cases       0
new_deaths      0
new_tests     113
dtype: int64

In [27]:
data_new = data.copy()

In [28]:
data_new.isnull().sum()

date            0
new_cases       0
new_deaths      0
new_tests     113
dtype: int64

In [29]:
data_new[data_new['new_tests'].isnull()]

Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,
...,...,...,...,...
108,2020-04-17,3786.0,525.0,
109,2020-04-18,3493.0,575.0,
110,2020-04-19,3491.0,480.0,
246,2020-09-02,975.0,8.0,


In [30]:
# Here we are filling it with -1, but we can fill it with mean or something else as well

data_new.fillna(-1, inplace=True)

In [31]:
data_new[data_new.new_tests.isnull()]

Unnamed: 0,date,new_cases,new_deaths,new_tests


In [32]:
# Count of unique values across coulmns

data.nunique()

date          248
new_cases     188
new_deaths    129
new_tests     135
dtype: int64

In [33]:
# Lists all unique values in a column

data.new_tests.unique()

array([   nan,  7841., 28095., 44248., 37083., 95273., 38676., 24113.,
       26678., 37554., 38589., 41441., 43732., 31231., 27047., 22999.,
       32211., 37771., 13665., 45428., 36091., 31384., 25823., 39620.,
       37049., 41131., 39027., 40657., 33505., 26101., 40226., 38617.,
       40644., 42987., 42579., 34206., 20676., 33944., 37299., 39838.,
       38233., 36051., 28948., 18053., 25628., 20035., 27451., 40470.,
       34036., 27894., 16301., 32200., 37865., 32991., 37651., 32880.,
       29545., 17463., 27762., 33957., 32921., 28570., 29875., 24581.,
       16152., 23225., 30237., 29421., 28331., 29721., 21183., 15484.,
       28471., 29325., 29147., 25680., 28946., 21166., 13771., 22490.,
       28679., 29947., 27251., 25449., 23061., 14006., 24222., 28392.,
       28089., 28661., 27569., 20621., 14121., 23915., 29288., 33018.,
       28970., 28059., 25177., 19374., 25341., 30875., 33396., 29686.,
       31905., 24496., 13467., 23491., 29739., 32169., 30392., 26631.,
      

In [34]:
# Counts the occurence of values in a column

data.new_tests.value_counts()

7841.0     1
24222.0    1
25341.0    1
19374.0    1
25177.0    1
          ..
38233.0    1
39838.0    1
37299.0    1
33944.0    1
54395.0    1
Name: new_tests, Length: 135, dtype: int64

In [35]:
d = {
    'A': [1, 2, 3, 4, 5, 6, 1, 2, 3, 4],
    'City': ['Pune', 'Mumbai', 'Mumbai', 'Delhi', 'Bangalore', 'Pune', 'Mumbai', 'Mumbai', 'Delhi', 'Goa'],
    'B' :['A', 'B', 'C', 'W', 'P', 'A', 'O', 'Q', 'E', 'V']
}

In [36]:
dummy_data = pd.DataFrame(d)

In [37]:
dummy_data

Unnamed: 0,A,City,B
0,1,Pune,A
1,2,Mumbai,B
2,3,Mumbai,C
3,4,Delhi,W
4,5,Bangalore,P
5,6,Pune,A
6,1,Mumbai,O
7,2,Mumbai,Q
8,3,Delhi,E
9,4,Goa,V


### To find records which have city that occurs more than once - `'Mumbai', 'Pune', 'Delhi'`

In [38]:
a = dummy_data.City.value_counts() 
city_constraint_list = a[a > 1].index
city_constraint_list

Index(['Mumbai', 'Pune', 'Delhi'], dtype='object')

In [39]:
dummy_data[dummy_data.City.isin(city_constraint_list)]

Unnamed: 0,A,City,B
0,1,Pune,A
1,2,Mumbai,B
2,3,Mumbai,C
3,4,Delhi,W
5,6,Pune,A
6,1,Mumbai,O
7,2,Mumbai,Q
8,3,Delhi,E


In [40]:
dummy_data[dummy_data.groupby('City')['A'].transform('count')>1]

Unnamed: 0,A,City,B
0,1,Pune,A
1,2,Mumbai,B
2,3,Mumbai,C
3,4,Delhi,W
5,6,Pune,A
6,1,Mumbai,O
7,2,Mumbai,Q
8,3,Delhi,E


In [41]:
import requests


In [42]:
url = "https://github.com/chris1610/pbpython/blob/master/data/sales_transactions.xlsx?raw=true"

In [43]:
sales_data = pd.read_excel(url)

In [44]:
sales_data.head()

Unnamed: 0,account,name,order,sku,quantity,unit price,ext price
0,383080,Will LLC,10001,B1-20000,7,33.69,235.83
1,383080,Will LLC,10001,S1-27722,11,21.12,232.32
2,383080,Will LLC,10001,B1-86481,3,35.99,107.97
3,412290,Jerde-Hilpert,10005,S1-06532,48,55.82,2679.36
4,412290,Jerde-Hilpert,10005,S1-82801,21,13.62,286.02


In [45]:
order_data = sales_data.groupby('order')['ext price'].sum().rename('total').reset_index()

In [46]:
order_data

Unnamed: 0,order,total
0,10001,576.12
1,10005,8185.49
2,10006,3724.49


In [47]:
new_df_merged = sales_data.merge(order_data)

In [48]:
new_df_merged

Unnamed: 0,account,name,order,sku,quantity,unit price,ext price,total
0,383080,Will LLC,10001,B1-20000,7,33.69,235.83,576.12
1,383080,Will LLC,10001,S1-27722,11,21.12,232.32,576.12
2,383080,Will LLC,10001,B1-86481,3,35.99,107.97,576.12
3,412290,Jerde-Hilpert,10005,S1-06532,48,55.82,2679.36,8185.49
4,412290,Jerde-Hilpert,10005,S1-82801,21,13.62,286.02,8185.49
5,412290,Jerde-Hilpert,10005,S1-06532,9,92.55,832.95,8185.49
6,412290,Jerde-Hilpert,10005,S1-47412,44,78.91,3472.04,8185.49
7,412290,Jerde-Hilpert,10005,S1-27722,36,25.42,915.12,8185.49
8,218895,Kulas Inc,10006,S1-27722,32,95.66,3061.12,3724.49
9,218895,Kulas Inc,10006,B1-33087,23,22.55,518.65,3724.49


In [49]:
new_df_merged['percent'] = new_df_merged['ext price'] / new_df_merged['total']

In [50]:
new_df_merged

Unnamed: 0,account,name,order,sku,quantity,unit price,ext price,total,percent
0,383080,Will LLC,10001,B1-20000,7,33.69,235.83,576.12,0.409342
1,383080,Will LLC,10001,S1-27722,11,21.12,232.32,576.12,0.403249
2,383080,Will LLC,10001,B1-86481,3,35.99,107.97,576.12,0.187409
3,412290,Jerde-Hilpert,10005,S1-06532,48,55.82,2679.36,8185.49,0.32733
4,412290,Jerde-Hilpert,10005,S1-82801,21,13.62,286.02,8185.49,0.034942
5,412290,Jerde-Hilpert,10005,S1-06532,9,92.55,832.95,8185.49,0.101759
6,412290,Jerde-Hilpert,10005,S1-47412,44,78.91,3472.04,8185.49,0.42417
7,412290,Jerde-Hilpert,10005,S1-27722,36,25.42,915.12,8185.49,0.111798
8,218895,Kulas Inc,10006,S1-27722,32,95.66,3061.12,3724.49,0.82189
9,218895,Kulas Inc,10006,B1-33087,23,22.55,518.65,3724.49,0.139254


In [51]:
order_data

Unnamed: 0,order,total
0,10001,576.12
1,10005,8185.49
2,10006,3724.49


In [52]:
sales_data.join(order_data, on='order', how='left', lsuffix='_l', rsuffix='_r')

Unnamed: 0,account,name,order_l,sku,quantity,unit price,ext price,order_r,total
0,383080,Will LLC,10001,B1-20000,7,33.69,235.83,,
1,383080,Will LLC,10001,S1-27722,11,21.12,232.32,,
2,383080,Will LLC,10001,B1-86481,3,35.99,107.97,,
3,412290,Jerde-Hilpert,10005,S1-06532,48,55.82,2679.36,,
4,412290,Jerde-Hilpert,10005,S1-82801,21,13.62,286.02,,
5,412290,Jerde-Hilpert,10005,S1-06532,9,92.55,832.95,,
6,412290,Jerde-Hilpert,10005,S1-47412,44,78.91,3472.04,,
7,412290,Jerde-Hilpert,10005,S1-27722,36,25.42,915.12,,
8,218895,Kulas Inc,10006,S1-27722,32,95.66,3061.12,,
9,218895,Kulas Inc,10006,B1-33087,23,22.55,518.65,,


In [53]:
sales_data.merge(order_data, on='order', how='inner', suffixes=('_l', '_r'))

Unnamed: 0,account,name,order,sku,quantity,unit price,ext price,total
0,383080,Will LLC,10001,B1-20000,7,33.69,235.83,576.12
1,383080,Will LLC,10001,S1-27722,11,21.12,232.32,576.12
2,383080,Will LLC,10001,B1-86481,3,35.99,107.97,576.12
3,412290,Jerde-Hilpert,10005,S1-06532,48,55.82,2679.36,8185.49
4,412290,Jerde-Hilpert,10005,S1-82801,21,13.62,286.02,8185.49
5,412290,Jerde-Hilpert,10005,S1-06532,9,92.55,832.95,8185.49
6,412290,Jerde-Hilpert,10005,S1-47412,44,78.91,3472.04,8185.49
7,412290,Jerde-Hilpert,10005,S1-27722,36,25.42,915.12,8185.49
8,218895,Kulas Inc,10006,S1-27722,32,95.66,3061.12,3724.49
9,218895,Kulas Inc,10006,B1-33087,23,22.55,518.65,3724.49


## Merge and Join in Pandas

In [54]:
user_usage = pd.read_csv('https://raw.githubusercontent.com/shanealynn/Pandas-Merge-Tutorial/master/user_usage.csv')
android_devices = pd.read_csv('https://raw.githubusercontent.com/shanealynn/Pandas-Merge-Tutorial/master/android_devices.csv')
user_device = pd.read_csv('https://raw.githubusercontent.com/shanealynn/Pandas-Merge-Tutorial/master/user_device.csv')

In [55]:
user_usage.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id
0,21.97,4.82,1557.33,22787
1,1710.08,136.88,7267.55,22788
2,1710.08,136.88,7267.55,22789
3,94.46,35.17,519.12,22790
4,71.59,79.26,1557.33,22792


In [56]:
android_devices.head()

Unnamed: 0,Retail Branding,Marketing Name,Device,Model
0,,,AD681H,Smartfren Andromax AD681H
1,,,FJL21,FJL21
2,,,T31,Panasonic T31
3,,,hws7721g,MediaPad 7 Youth 2
4,3Q,OC1020A,OC1020A,OC1020A


In [57]:
android_devices.Model.value_counts()

LG-P970               23
LG-P920               20
LG-P990               20
LG-E510               19
LG-P690               14
                      ..
Thunder 347            1
KAZAM Thunder 350L     1
KAZAM Thunder 550      1
KAZAM Thunder 550L     1
EASY1                  1
Name: Model, Length: 12883, dtype: int64

In [58]:
user_device.head()

Unnamed: 0,use_id,user_id,platform,platform_version,device,use_type_id
0,22782,26980,ios,10.2,"iPhone7,2",2
1,22783,29628,android,6.0,Nexus 5,3
2,22784,28473,android,5.1,SM-G903F,1
3,22785,15200,ios,10.2,"iPhone7,2",3
4,22786,28239,android,6.0,ONE E1003,1


In [59]:
user_device.device.value_counts()

SM-G900F                  32
iPhone7,2                 27
iPhone6,2                 22
GT-I9505                  13
ONEPLUS A3003              9
                          ..
Nexus 5                    1
HTC Desire 530             1
E6653                      1
D5803                      1
Vodafone Smart ultra 6     1
Name: device, Length: 72, dtype: int64

`user_device` and `user_usage` can be merged on `use_id` column, `user_device` and `android_devices` can be merged on `Model` and `device` column as these 2 columns hold similar values

**Problem Statement** - We would like to determine if the usage patterns for users differ between different devices. For example, do users using Samsung devices use more call minutes than those using  LG devices? 

In [60]:
merge1 = pd.merge(user_usage,
                  user_device[['use_id', 'platform', 'device']],
                  on='use_id') # by default inner join is used
merge1.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,platform,device
0,21.97,4.82,1557.33,22787,android,GT-I9505
1,1710.08,136.88,7267.55,22788,android,SM-G930F
2,1710.08,136.88,7267.55,22789,android,SM-G930F
3,94.46,35.17,519.12,22790,android,D2303
4,71.59,79.26,1557.33,22792,android,SM-G361F


In [61]:
# count of `use_id` present in both user_usage and user_device = 159, below code can validate that
user_usage['use_id'].isin(user_device['use_id']).value_counts()

True     159
False     81
Name: use_id, dtype: int64

In [62]:
# merged df should also have 159 records (rows)
merge1.shape

(159, 6)

In [63]:
# Try outer join
outer_merge = pd.merge(user_usage,
                       user_device[['use_id', 'platform', 'device']],
                       on='use_id',
                       how='outer',
                      indicator=True)

# Using `indicator` a new column `_merge` is added which tells from where did that row come, left df, righ df or both
outer_merge.sample(5)

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,platform,device,_merge
137,101.59,84.41,5191.12,23018,android,Moto G (4),both
185,300.3,11.66,3251.33,23676,,,left_only
255,,,,22811,ios,"iPad3,1",right_only
284,,,,22872,android,GT-I9300,right_only
0,21.97,4.82,1557.33,22787,android,GT-I9505,both


In [64]:
# Solving the problem statement
merge1 = pd.merge(user_usage,
                   user_device[['use_id', 'platform', 'device']],
                   on='use_id',
                   how='left')

merge1.head()

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,platform,device
0,21.97,4.82,1557.33,22787,android,GT-I9505
1,1710.08,136.88,7267.55,22788,android,SM-G930F
2,1710.08,136.88,7267.55,22789,android,SM-G930F
3,94.46,35.17,519.12,22790,android,D2303
4,71.59,79.26,1557.33,22792,android,SM-G361F


In [65]:
merge1.shape, user_usage.shape

((240, 6), (240, 4))

In [66]:
android_devices.rename(columns={'Retail Branding': 'Manufacturer'}, inplace=True)

android_devices.columns

Index(['Manufacturer', 'Marketing Name', 'Device', 'Model'], dtype='object')

In [67]:
android_devices.head()

Unnamed: 0,Manufacturer,Marketing Name,Device,Model
0,,,AD681H,Smartfren Andromax AD681H
1,,,FJL21,FJL21
2,,,T31,Panasonic T31
3,,,hws7721g,MediaPad 7 Youth 2
4,3Q,OC1020A,OC1020A,OC1020A


In [68]:
del merge2

NameError: name 'merge2' is not defined

In [None]:
merge2 = pd.merge(merge1,
                  android_devices[['Manufacturer', 'Model', 'Marketing Name']],
                  how='left',
                  left_on='device',
                  right_on='Model')

merge2.head()

In [None]:
merge2.groupby("Manufacturer").agg({
    "outgoing_mins_per_month": "mean",
    "outgoing_sms_per_month": "mean",
    "monthly_mb": "mean",
    "use_id": "count"
})

References -<br> https://pbpython.com/pandas_transform.html <br>
https://towardsdatascience.com/when-to-use-pandas-transform-function-df8861aa0dcf <br>
https://drawingfromdata.com/pandas/grouping/pandas-groupby-transform-aggregate-filter.html <br>
https://www.datacamp.com/community/tutorials/joining-dataframes-pandas <br>
https://www.analyticsvidhya.com/blog/2020/02/joins-in-pandas-master-the-different-types-of-joins-in-python/ <br>
https://www.shanelynn.ie/merge-join-dataframes-python-pandas-index-1/ <br>

## Data School - Youtube channel

#### When to use groupby

In [70]:
drinks_data = pd.read_csv("http://bit.ly/drinksbycountry")

In [71]:
drinks_data.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [88]:
# What is the avergae beer servings across all continents

drinks_data.groupby('continent').agg({'beer_servings': 'mean'})

Unnamed: 0_level_0,beer_servings
continent,Unnamed: 1_level_1
Africa,61.471698
Asia,37.045455
Europe,193.777778
North America,145.434783
Oceania,89.6875
South America,175.083333


In [89]:
drinks_data.groupby('continent')['beer_servings'].mean()

continent
Africa            61.471698
Asia              37.045455
Europe           193.777778
North America    145.434783
Oceania           89.687500
South America    175.083333
Name: beer_servings, dtype: float64

In [91]:
drinks_data.groupby('continent')['beer_servings'].agg(['mean', 'max', 'min', 'count'])

# Count is specifying how many countries are there in africa

Unnamed: 0_level_0,mean,max,min,count
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,61.471698,376,0,53
Asia,37.045455,247,0,44
Europe,193.777778,361,0,45
North America,145.434783,285,1,23
Oceania,89.6875,306,0,16
South America,175.083333,333,93,12


In [92]:
# If you don't specify coulmn then it would calculate mean across all numeric columns
drinks_data.groupby('continent').mean()

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,61.471698,16.339623,16.264151,3.007547
Asia,37.045455,60.840909,9.068182,2.170455
Europe,193.777778,132.555556,142.222222,8.617778
North America,145.434783,165.73913,24.521739,5.995652
Oceania,89.6875,58.4375,35.625,3.38125
South America,175.083333,114.75,62.416667,6.308333


### Map in Pandas
Map can only be used with Series, where we can `dict` and `Series` as parameters. 

In [109]:
d = {v:i for i, v in enumerate(drinks_data['continent'].unique())}

In [104]:
d

{'Asia': 0,
 'Europe': 1,
 'Africa': 2,
 'North America': 3,
 'South America': 4,
 'Oceania': 5}

In [105]:
continent_series = pd.Series(d)

continent_series

Asia             0
Europe           1
Africa           2
North America    3
South America    4
Oceania          5
dtype: int64

In [107]:
# The value taht we want to map, in this case the Continents should the index of dataframe.
drinks_data['continent_num'] = drinks_data['continent'].map(continent_series)

In [108]:
drinks_data.loc[0:4, ['continent', 'continent_num']]

Unnamed: 0,continent,continent_num
0,Asia,0
1,Europe,1
2,Africa,2
3,Europe,1
4,Africa,2


In [126]:
# Same thing can be done by passing a dictionary, where key should be the value we want to map
drinks_data['continent'].map(d)

0      0
1      1
2      2
3      1
4      2
      ..
188    4
189    0
190    0
191    2
192    2
Name: continent, Length: 193, dtype: int64

In [127]:
drinks_data['continent'].map(len)

0       4
1       6
2       6
3       6
4       6
       ..
188    13
189     4
190     4
191     6
192     6
Name: continent, Length: 193, dtype: int64

In [128]:
drinks_data['continent'].apply(len)

0       4
1       6
2       6
3       6
4       6
       ..
188    13
189     4
190     4
191     6
192     6
Name: continent, Length: 193, dtype: int64