## EDA with Pandas part 2

In [1]:
import pandas as pd
import os
import numpy as np
from pathlib import Path

In [2]:
# list the files in the data directory that sits next to the working directory
data_dir = Path.cwd().parent / 'data'
os.listdir(data_dir)

['Production.ProductSubcategory.csv',
 'drinks.csv',
 'imdb_1000.csv',
 'ufo.csv',
 'Sales.SalesOrderHeader.csv',
 'titanic.csv',
 'u.user',
 'Sales.SalesOrderDetail.csv',
 'Production.Product.csv']

In [4]:
# load the drinks dataset and peek at three random rows
path2data = data_dir / 'drinks.csv'
drinks = pd.read_csv(path2data)
drinks.sample(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
32,Canada,240,122,100,8.2,
2,Algeria,25,0,14,0.7,AF
110,Micronesia,62,50,18,2.3,OC


In [5]:
# how big is my df? .shape is a (n_rows, n_columns) tuple
drinks.shape

(193, 6)

In [6]:
# what are the column types? .dtypes lists one dtype per column
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [8]:
# quick reminder about the two ways to call a column in Pandas

# bracket notation (best practice)
# value_counts() sorts by descending count by default, so ascending= is not needed
drinks['country'].value_counts().head()

Swaziland      1
Togo           1
Philippines    1
Madagascar     1
Costa Rica     1
Name: country, dtype: int64

In [9]:
# dot (attribute) notation: same column, same result as the bracket form
drinks.country.value_counts().head()

Swaziland      1
Togo           1
Philippines    1
Madagascar     1
Costa Rica     1
Name: country, dtype: int64

In [12]:
# pass a list of column names to select two (or more) variables
drinks.loc[:, ['country', 'beer_servings']].head()

Unnamed: 0,country,beer_servings
0,Afghanistan,0
1,Albania,89
2,Algeria,25
3,Andorra,245
4,Angola,217


## Exploration

In [18]:
# filter to include only countries in Oceania (continent code 'OC')
drinks[drinks['continent']=='OC'].head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
8,Australia,261,72,212,10.4,OC
40,Cook Islands,0,254,74,5.9,OC
59,Fiji,77,35,1,2.0,OC
89,Kiribati,21,34,1,1.0,OC
106,Marshall Islands,0,0,0,0.0,OC


In [23]:
# double filter: only European countries with high wine consumption (> 270 servings)
# each condition is a boolean Series; & keeps the rows where both are True
drinks.loc[drinks['continent'].eq('EU') & drinks['wine_servings'].gt(270)].tail()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
61,France,127,151,370,11.8,EU
99,Luxembourg,236,133,271,11.4,EU
136,Portugal,194,67,339,11.0,EU
156,Slovenia,270,51,276,10.6,EU
166,Switzerland,185,100,280,10.2,EU


In [28]:
# what's the average beer consumption for all Europe?
drinks['beer_servings'].mean()  # whole world, for comparison (not displayed)
eu_beer = drinks.loc[drinks['continent'] == 'EU', 'beer_servings'].mean()  # EU rows only
round(eu_beer, 3)  # rounded to 3 decimals

193.778

In [36]:
# which 5 countries have the highest pure alcohol consumption?
# sort descending on the alcohol column, keep the first five country names as a plain list
drinks.sort_values('total_litres_of_pure_alcohol', ascending=False)['country'].head(5).tolist()

['Belarus', 'Lithuania', 'Andorra', 'Grenada', 'Czech Republic']

## Column Activities

In [43]:
# rename columns: pass a {old_name: new_name} mapping.
# rename() returns a new DataFrame, so assign the result back; doing this once is
# enough (an additional inplace=True call would just repeat the same rename).
drinks = drinks.rename(columns={'beer_servings': 'beer', 'wine_servings': 'wine'})
drinks.head()

Unnamed: 0,country,beer,spirit_servings,wine,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF


In [50]:
# what if I just want to replace all column names?
# header=0 marks the file's first row as the header to be replaced;
# names= supplies the replacement labels, one per column
new_cols = ['country', 'beer', 'spirit', 'wine', 'liters', 'continent']
drinks = pd.read_csv(path2data, names=new_cols, header=0)
drinks.head(3)

Unnamed: 0,country,beer,spirit,wine,liters,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF


In [49]:
# you can also replace column names without re-reading the dataset:
# assign a list with exactly one label per column to the .columns attribute.
# This is demonstrated on a copy so that `drinks` keeps its descriptive column
# names ('beer', 'liters', 'continent', ...), which the following cells rely on.
new_list = ['a', 'b', 'c', 'd', 'e', 'f']
drinks_abc = drinks.copy()
drinks_abc.columns = new_list
drinks_abc.head(3)

Unnamed: 0,a,b,c,d,e,f
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF


In [51]:
# what if I want to remove some columns? first, a look at the current columns
drinks.head(3)

Unnamed: 0,country,beer,spirit,wine,liters,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF


In [54]:
# use the drop method; columns='liters' is equivalent to passing 'liters' with axis=1
drinks = drinks.drop(columns='liters')
drinks.sample(3)

Unnamed: 0,country,beer,spirit,wine,continent
34,Chad,15,1,1,AF
185,Uruguay,115,35,220,SA
46,North Korea,0,0,0,AS


In [56]:
# drop multiple columns by passing a list of labels
# (the result is only displayed, not assigned, so drinks itself is unchanged)
drinks.drop(columns=['beer', 'spirit', 'continent'])

Unnamed: 0,country,wine
0,Afghanistan,0
1,Albania,54
2,Algeria,14
3,Andorra,312
4,Angola,45
5,Antigua & Barbuda,45
6,Argentina,221
7,Armenia,11
8,Australia,212
9,Austria,191


## Missing Values!

In [57]:
# does my dataframe have missing values? count the NaN cells in each column
drinks.isna().sum()

country       0
beer          0
spirit        0
wine          0
continent    23
dtype: int64

In [60]:
# what are they? dropna=False makes value_counts report NaN as its own category
drinks['continent'].value_counts(dropna=False)

AF     53
EU     45
AS     44
NaN    23
OC     16
SA     12
Name: continent, dtype: int64

In [61]:
# just a reminder: notna() is the complement -- count the non-missing cells per column
drinks.notna().sum()

country      193
beer         193
spirit       193
wine         193
continent    170
dtype: int64

In [67]:
# let's look at the rows with a missing continent
# isna() builds a boolean mask that is True where continent is NaN
drinks.loc[drinks['continent'].isna()].head(3)

Unnamed: 0,country,beer,spirit,wine,continent
5,Antigua & Barbuda,102,128,45,
11,Bahamas,122,176,51,
14,Barbados,143,173,36,


In [119]:
# why is it North America?
# read_csv treats the literal string 'NA' as a missing-value marker by default, so a
# continent code of 'NA' is parsed as NaN. na_filter=True is the default, so this
# re-read keeps that behaviour; it also brings back the 'liters' column dropped earlier.
# NOTE(review): presumably drinks.csv stores North America as 'NA' -- confirm in the raw file.
drinks = pd.read_csv(path2data, header=0, names=new_cols, na_filter=True)
drinks.sample(10)

Unnamed: 0,country,beer,spirit,wine,liters,continent
44,Cyprus,192,154,113,8.2,EU
177,Turkmenistan,19,71,32,2.2,AS
13,Bangladesh,0,0,0,0.0,AS
134,Philippines,71,186,1,4.6,AS
138,South Korea,140,16,9,9.8,AS
85,Japan,77,202,16,7.0,AS
82,Israel,63,69,9,2.5,AS
10,Azerbaijan,21,46,5,1.3,EU
2,Algeria,25,0,14,0.7,AF
126,Norway,169,71,129,6.7,EU


In [74]:
# what if we just drop our missing data?
# dropna() returns a new DataFrame. inplace=True is deliberately avoided: it would
# permanently delete the rows with a missing continent from `drinks`, and the
# fillna / groupby cells further down still need those rows.
print(drinks.shape)            # before dropping
print(drinks.dropna().shape)   # after dropping every row that has a NaN

(193, 6)
(170, 6)


In [76]:
# the how= argument controls when a row is dropped
# (neither result is assigned back, so drinks itself is unchanged;
#  only the last expression of the cell is displayed)
drinks.dropna(how='all') # drops a row if ENTIRE row is missing
drinks.dropna(how='any') # drops a row if ANY CELL of row is missing

Unnamed: 0,country,beer,spirit,wine,liters,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
6,Argentina,193,25,221,8.3,SA
7,Armenia,21,179,11,3.8,EU
8,Australia,261,72,212,10.4,OC
9,Austria,279,75,191,9.7,EU
10,Azerbaijan,21,46,5,1.3,EU


In [82]:
# much more useful is to fill NA with a reasonable value
# Assign the filled column back to the frame: calling fillna(inplace=True) on the
# selected column is chained in-place mutation, which warns in recent pandas and
# does not update `drinks` at all under Copy-on-Write.
drinks['continent'] = drinks['continent'].fillna('NA')
drinks.sample(10)

Unnamed: 0,country,beer,spirit,wine,liters,continent
104,Mali,5,1,1,0.6,AF
176,Turkey,51,22,7,1.4,AS
109,Mexico,238,68,5,5.5,
63,Gambia,8,0,1,2.4,AF
159,South Africa,225,76,81,8.2,AF
142,Rwanda,43,2,0,6.8,AF
89,Kiribati,21,34,1,1.0,OC
66,Ghana,31,3,10,1.8,AF
172,Togo,36,2,19,1.3,AF
121,New Zealand,203,79,175,9.3,OC


## Groupby techniques

In [85]:
# calculate average beer by continent
# overall mean first, for comparison (not displayed: only the last expression is shown)
drinks['beer'].mean()
# split the rows by continent, then average the beer column within each group
drinks.groupby('continent')['beer'].mean()

continent
AF     61.471698
AS     37.045455
EU    193.777778
NA    145.434783
OC     89.687500
SA    175.083333
Name: beer, dtype: float64

In [86]:
# you can leave out the column selection to average every numeric column per continent
# numeric_only=True skips text columns such as 'country'; without it, recent pandas
# versions raise a TypeError when they try to average strings
drinks.groupby('continent').mean(numeric_only=True)

Unnamed: 0_level_0,beer,spirit,wine,liters
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AF,61.471698,16.339623,16.264151,3.007547
AS,37.045455,60.840909,9.068182,2.170455
EU,193.777778,132.555556,142.222222,8.617778
,145.434783,165.73913,24.521739,5.995652
OC,89.6875,58.4375,35.625,3.38125
SA,175.083333,114.75,62.416667,6.308333


In [93]:
# you can groupby using multiple aggregation methods
# each line below is a valid single-statistic aggregation; in a notebook cell only the
# last expression is displayed, so run them one at a time to see their output
drinks.groupby('continent')['beer'].mean()
drinks.groupby('continent')['beer'].median()
drinks.groupby('continent')['beer'].min()
drinks.groupby('continent')['beer'].max()
drinks.groupby('continent')['beer'].std()
drinks.groupby('continent')['beer'].count()
drinks.groupby('continent')['beer'].quantile(.25)
# agg() with a list of names computes several statistics at once, one column per statistic
drinks.groupby('continent')['beer'].agg(['mean', 'median', 'min'])

Unnamed: 0_level_0,mean,median,min
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AF,61.471698,32.0,0
AS,37.045455,17.5,0
EU,193.777778,219.0,0
,145.434783,143.0,1
OC,89.6875,52.5,0
SA,175.083333,162.5,93


In [97]:
# this becomes its own dataframe and pandas methods can be called on it
# (rows are continents, columns are the requested statistics)
shortdf = drinks.groupby('continent')['beer'].agg(['mean', 'median', 'min'])
# order the continents from lowest to highest mean beer servings
shortdf.sort_values('mean')

Unnamed: 0_level_0,mean,median,min
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AS,37.045455,17.5,0
AF,61.471698,32.0,0
OC,89.6875,52.5,0
,145.434783,143.0,1
SA,175.083333,162.5,93
EU,193.777778,219.0,0


In [99]:
# you can group by multiple categorical variables
# passing a list of keys makes one group per (continent, country) pair;
# the result is indexed by both keys
drinks.groupby(['continent', 'country'])['beer'].mean().head(2)

continent  country
AF         Algeria     25
           Angola     217
Name: beer, dtype: int64

In [100]:
# you can aggregate multiple continuous variables at once
# select them with a LIST of column names (double brackets); a bare tuple such as
# ['beer', 'wine'] without the inner list is no longer accepted by recent pandas
drinks.groupby('continent')[['beer', 'wine']].mean()

Unnamed: 0_level_0,beer,wine
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
AF,61.471698,16.264151
AS,37.045455,9.068182
EU,193.777778,142.222222
,145.434783,24.521739
OC,89.6875,35.625
SA,175.083333,62.416667


## A few other common features of pandas

In [104]:
# before changing the data type of a column, inspect the current types
drinks['beer'].head()  # not displayed: only print() output and the last expression are shown
print(drinks['beer'].dtype)
drinks['liters'].dtype

int64


dtype('float64')

In [106]:
# change int to float
# astype() returns a converted copy and has no inplace option, so the result
# must be assigned back to the column
drinks['beer'] = drinks['beer'].astype('float')
drinks['beer'].dtype

dtype('float64')

In [109]:
# what if I want it to be a string?
# the converted values are stored in a new column, leaving 'beer' itself untouched
drinks['beerstring'] = drinks['beer'].astype('str')
drinks.head()

Unnamed: 0,country,beer,spirit,wine,liters,continent,beerstring
0,Afghanistan,0.0,0,0,0.0,AS,0.0
1,Albania,89.0,132,54,4.9,EU,89.0
2,Algeria,25.0,0,14,0.7,AF,25.0
3,Andorra,245.0,138,312,12.4,EU,245.0
4,Angola,217.0,57,45,5.9,AF,217.0


## Remap values

In [114]:
# map method: translate every continent code through a lookup dict
# good for categorical variables with not too many values
drinks['new_continent'] = drinks['continent'].map({
    'AS': 'Asia',
    'EU': 'Europe',
    'AF': 'Africa',
    'SA': 'South America',
    'NA': 'North America',
    'OC': 'Oceania',
})

In [121]:
# using .loc with bins
# start with every row labelled 'low', then overwrite the rows that fall in a higher bin
drinks['beer_level']='low'
# between() includes both endpoints by default: 101..200 -> 'med', 201..400 -> 'high'
# (any value outside these two ranges keeps the 'low' label)
drinks.loc[drinks['beer'].between(101,200), 'beer_level'] = 'med'
drinks.loc[drinks['beer'].between(201,400), 'beer_level'] = 'high'
drinks.head()  # not displayed: only the last expression of the cell is shown
drinks['beer_level'].value_counts()

low     114
high     40
med      39
Name: beer_level, dtype: int64

In [125]:
# yet another method for recoding variables: replace()
# good for replacing a single value -- every other value is carried over unchanged
drinks['continent'].value_counts()  # the codes before recoding (not displayed)
drinks['newcont'] = drinks['continent'].replace({'OC': 'SP'})
drinks.sample(10)

Unnamed: 0,country,beer,spirit,wine,liters,continent,beer_level,newcont
46,North Korea,0,0,0,0.0,AS,low,AS
28,Cote d'Ivoire,37,1,7,4.0,AF,low,AF
107,Mauritania,0,0,0,0.0,AF,low,AF
58,Ethiopia,20,3,0,0.7,AF,low,AF
53,Egypt,6,4,1,0.2,AF,low,AF
47,DR Congo,32,3,1,2.3,AF,low,AF
8,Australia,261,72,212,10.4,OC,high,SP
140,Romania,297,122,167,10.4,EU,high,EU
111,Monaco,0,0,0,0.0,EU,low,EU
154,Singapore,60,12,11,1.5,AS,low,AS


In [127]:
# last method: good for binary outputs
# np.where(condition, value_if_true, value_if_false) works element-wise on the column
# (numpy is already imported as np in the import cell at the top of the notebook)
drinks['is_in_Africa'] = np.where(drinks['continent'] == 'AF', 'yes', 'no')
drinks.head()

Unnamed: 0,country,beer,spirit,wine,liters,continent,beer_level,newcont,is_in_Africa
0,Afghanistan,0,0,0,0.0,AS,low,AS,no
1,Albania,89,132,54,4.9,EU,low,EU,no
2,Algeria,25,0,14,0.7,AF,low,AF,yes
3,Andorra,245,138,312,12.4,EU,high,EU,no
4,Angola,217,57,45,5.9,AF,high,AF,yes
