# Py: Tidy data analysis - Penguins

In [None]:
# Loading libraries 
import datatable as dt
from datatable import f,by,count,update,sort
import altair as alt
import pandas as pd
import numpy as np

In [2]:
# Configuring a set of datatable display options
dt.init_styles()
# Truncated frames are printed with 4 rows from the top and 4 rows from the bottom
dt.options.display.head_nrows=4
dt.options.display.tail_nrows=4

In [3]:
# Import the penguins CSV directly from the TidyTuesday (2020-07-28) GitHub repository.
# This cell needs network access.
penguins_dt = dt.fread('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-28/penguins.csv')

In [4]:
# Glance at the data: the frame display shows the first and last few rows
penguins_dt

Unnamed: 0_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,Adelie,Torgersen,39.1,18.7,181,3750,male,2007
1,Adelie,Torgersen,39.5,17.4,186,3800,female,2007
2,Adelie,Torgersen,40.3,18,195,3250,female,2007
3,Adelie,Torgersen,,,,,,2007
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
340,Chinstrap,Dream,43.5,18.1,202,3400,female,2009
341,Chinstrap,Dream,49.6,18.2,193,3775,male,2009
342,Chinstrap,Dream,50.8,19,210,4100,male,2009
343,Chinstrap,Dream,50.2,18.7,198,3775,female,2009


In [5]:
# Storage type (stype) of every column, in column order
penguins_dt.stypes

(stype.str32,
 stype.str32,
 stype.float64,
 stype.float64,
 stype.int32,
 stype.int32,
 stype.str32,
 stype.int32)

In [6]:
# Count the missing values (NA) in every column
penguins_dt.countna()

Unnamed: 0_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,0,0,2,2,2,2,11,0


In [7]:
# Number of unique values per column
# (sex reports 2, so missing values are apparently not counted as a value)
penguins_dt.nunique()

Unnamed: 0_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,3,3,164,80,55,94,2,3


In [8]:
# Print each column name next to its storage type
for position, column_name in enumerate(penguins_dt.names):
    column_stype = penguins_dt.stypes[position]
    print(f'{column_name}- is a type of: {column_stype} ')

species- is a type of: stype.str32 
island- is a type of: stype.str32 
bill_length_mm- is a type of: stype.float64 
bill_depth_mm- is a type of: stype.float64 
flipper_length_mm- is a type of: stype.int32 
body_mass_g- is a type of: stype.int32 
sex- is a type of: stype.str32 
year- is a type of: stype.int32 


In [9]:
# First five rows of the columns at positions 2 to 5 (0-based; the slice end 6 is exclusive),
# i.e. the four measurement columns bill_length_mm .. body_mass_g
penguins_dt[:5,2:6]

Unnamed: 0_level_0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
3,,,,
4,36.7,19.3,193.0,3450.0


In [10]:
# Last five rows, all columns
penguins_dt[-5:,:]

Unnamed: 0_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,Chinstrap,Dream,55.8,19.8,207,4000,male,2009
1,Chinstrap,Dream,43.5,18.1,202,3400,female,2009
2,Chinstrap,Dream,49.6,18.2,193,3775,male,2009
3,Chinstrap,Dream,50.8,19.0,210,4100,male,2009
4,Chinstrap,Dream,50.2,18.7,198,3775,female,2009


In [11]:
# All rows for the last 3 columns (body_mass_g, sex, year)
penguins_dt[:,-3:]

Unnamed: 0_level_0,body_mass_g,sex,year
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,3750,male,2007
1,3800,female,2007
2,3250,female,2007
3,,,2007
⋮,⋮,⋮,⋮
340,3400,female,2009
341,3775,male,2009
342,4100,male,2009
343,3775,female,2009


In [12]:
# Select the rows where sex is missing but body_mass_g is present
# (this keeps the NA-sex rows; it does not remove them)
penguins_dt[(dt.isna(f.sex) & ~dt.isna(f.body_mass_g)),:]

Unnamed: 0_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,Adelie,Torgersen,34.1,18.1,193,3475,,2007
1,Adelie,Torgersen,42.0,20.2,190,4250,,2007
2,Adelie,Torgersen,37.8,17.1,186,3300,,2007
3,Adelie,Torgersen,37.8,17.3,180,3700,,2007
4,Adelie,Dream,37.5,18.9,179,2975,,2007
5,Gentoo,Biscoe,44.5,14.3,216,4100,,2007
6,Gentoo,Biscoe,46.2,14.4,214,4650,,2008
7,Gentoo,Biscoe,47.3,13.8,216,4725,,2009
8,Gentoo,Biscoe,44.5,15.7,217,4875,,2009


In [146]:
# Mean of every numeric measurement (int32 columns except year, plus float64 columns) per sex.
# Rows with a missing sex are dropped first so no NA group appears in the result.
penguins_dt[~dt.isna(f.sex),:
           ][:,dt.mean((f[dt.int32].remove(f.year),f[dt.float64])),by(f.sex)]

Unnamed: 0_level_0,sex,flipper_length_mm,body_mass_g,bill_length_mm,bill_depth_mm
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,female,197.364,3862.27,42.097,16.4255
1,male,204.506,4545.68,45.8548,17.8911


In [14]:
# step - 1 : flag, within each sex group, the rows whose body mass equals the group's maximum.
# This adds a helper column `temp` to penguins_dt in place; rows with missing sex form their own group.
penguins_dt[:,update(temp=f.body_mass_g==dt.max(f.body_mass_g)),by(f.sex)]

In [15]:
# step - 2 : keep only the flagged rows (heaviest penguins per sex) and hide the helper column `temp`
penguins_dt[f.temp==1,f[:].remove(f.temp)]

Unnamed: 0_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,Gentoo,Biscoe,49.2,15.2,221,6300,male,2007
1,Gentoo,Biscoe,46.5,14.8,217,5200,female,2008
2,Gentoo,Biscoe,44.5,15.7,217,4875,,2009
3,Gentoo,Biscoe,45.2,14.8,212,5200,female,2009


In [16]:
# step - 1 : flag, within each sex group, the rows whose body mass equals the group's minimum.
# This overwrites the helper column `temp` in penguins_dt in place.
penguins_dt[:,update(temp=f.body_mass_g==dt.min(f.body_mass_g)),by(f.sex)]

In [17]:

# step - 2 : keep only the flagged rows (lightest penguins per sex) and hide the helper column `temp`
penguins_dt[f.temp==1,f[:].remove(f.temp)]

Unnamed: 0_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,Adelie,Dream,37.5,18.9,179,2975,,2007
1,Chinstrap,Dream,46.9,16.6,192,2700,female,2008
2,Chinstrap,Dream,51.5,18.7,187,3250,male,2009


In [18]:
# Remove the helper column `temp` added by the max/min cells.
# This cell fails if `temp` is absent, e.g. when it is run twice in a row.
del penguins_dt["temp"]

In [19]:
# Keep only the rows where sex is known; the rest of the analysis uses this frame
penguins_tidy_dt = penguins_dt[~dt.isna(f.sex),:]

In [20]:
# Number of penguins (with known sex) per year and island
penguins_year_island = penguins_tidy_dt[:,{'total':count()},by(f.year,f.island)]

In [21]:
# Grand total per year, obtained by summing the per-island counts
penguins_year = penguins_year_island[:,{'gr_total':dt.sum(f.total)},by(f.year)]

In [22]:
# Inspect the yearly totals
penguins_year

Unnamed: 0_level_0,year,gr_total
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪
0,2007,103
1,2008,113
2,2009,117


In [23]:
# Key the yearly totals on `year`; the join in the next cell matches rows on this key
penguins_year.key="year"

In [24]:
# Attach each year's grand total (gr_total) to the per-island rows.
# NOTE(review): this reassigns penguins_year_island from itself, so re-running the cell
# joins onto an already-joined frame; re-create penguins_year_island before re-running.
penguins_year_island = penguins_year_island[:,:,dt.join(penguins_year)]

In [25]:
# Share of each year's penguins found on each island; adds the column `perc` in place
penguins_year_island[:,update(perc=f.total/f.gr_total)]

In [26]:
# Inspect the per-island counts together with the yearly totals and shares
penguins_year_island

Unnamed: 0_level_0,year,island,total,gr_total,perc
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,2007,Biscoe,43,103,0.417476
1,2007,Dream,45,103,0.436893
2,2007,Torgersen,15,103,0.145631
3,2008,Biscoe,63,113,0.557522
4,2008,Dream,34,113,0.300885
5,2008,Torgersen,16,113,0.141593
6,2009,Biscoe,57,117,0.487179
7,2009,Dream,44,117,0.376068
8,2009,Torgersen,16,117,0.136752


In [27]:
# Bar chart of penguin counts per year, coloured by island.
# The chart title is user-facing text; its spelling ("existence") is corrected here.
alt.Chart(penguins_year_island.to_pandas()).mark_bar(
).encode(
    alt.Y('year:O'),
    alt.X('total'),
    alt.Color('island')
).properties(title='Island existence over the years'
            )

In [28]:
# Number of penguins (with known sex) per year, island and species
penguins_island_spec_summary = penguins_tidy_dt[:,{'total':count()},by(f.year,f.island,f.species)]

In [29]:
# Yearly penguin counts coloured by island, one panel per species (two panels per row)
(
    alt.Chart(penguins_island_spec_summary.to_pandas())
    .mark_bar()
    .encode(y='year:O', x='total', color='island')
    .facet('species', columns=2)
)

In [33]:
# Number of penguins per sex, island and species
penguins_gender_species_summary = penguins_tidy_dt[:,{'total':count()},by(f.sex,f.island,f.species)]

In [71]:
# Penguin counts per species coloured by sex, one panel per island (three panels per row)
(
    alt.Chart(penguins_gender_species_summary.to_pandas())
    .mark_bar()
    .encode(x='species', y='total', color='sex')
    .properties(height=300, width=350, title='Penguins over different Islands')
    .facet('island', columns=3)
)

In [63]:
# Columns used by the flipper-vs-bill scatter plot, converted to pandas for Altair
penguins_sel_dt_1 = penguins_tidy_dt[
    :,
    [f.flipper_length_mm, f.bill_length_mm, f.sex, f.body_mass_g, f.species]
].to_pandas()

In [72]:
# Flipper length against bill length, coloured by sex, one panel per species
(
    alt.Chart(penguins_sel_dt_1)
    .mark_point()
    .encode(x='flipper_length_mm', y='bill_length_mm', color='sex')
    .properties(height=200, width=250)
    .facet('species')
    .configure_mark(opacity=0.6)
)

In [89]:
# Select only the numeric measurement columns (int32 and float64), leaving out `year`
penguins_numericos_dt = penguins_tidy_dt[:,(f[dt.int32].remove(f.year),f[dt.float64])]

In [94]:
# Inspect the numeric-only frame
penguins_numericos_dt

Unnamed: 0_level_0,flipper_length_mm,body_mass_g,bill_length_mm,bill_depth_mm
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,181,3750,39.1,18.7
1,186,3800,39.5,17.4
2,195,3250,40.3,18
3,193,3450,36.7,19.3
⋮,⋮,⋮,⋮,⋮
329,202,3400,43.5,18.1
330,193,3775,49.6,18.2
331,210,4100,50.8,19
332,198,3775,50.2,18.7


In [91]:
# Scatter plot of bill length against bill depth
(
    alt.Chart(penguins_numericos_dt.to_pandas())
    .mark_point()
    .encode(x='bill_length_mm', y='bill_depth_mm')
)

In [111]:
# Scatter-plot matrix of the four numeric measurements
peng_names = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm','body_mass_g']

# Rows follow peng_names and columns follow the reversed list;
# each panel plots the column variable on x against the row variable on y
alt.Chart(penguins_numericos_dt.to_pandas()).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"),type="quantitative")
).properties(width=200,height=200).repeat(
    row=peng_names,
    column=peng_names[::-1]
).interactive()

In [112]:
# Inspect the frame with known sex only
penguins_tidy_dt

Unnamed: 0_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,Adelie,Torgersen,39.1,18.7,181,3750,male,2007
1,Adelie,Torgersen,39.5,17.4,186,3800,female,2007
2,Adelie,Torgersen,40.3,18,195,3250,female,2007
3,Adelie,Torgersen,36.7,19.3,193,3450,female,2007
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
329,Chinstrap,Dream,43.5,18.1,202,3400,female,2009
330,Chinstrap,Dream,49.6,18.2,193,3775,male,2009
331,Chinstrap,Dream,50.8,19,210,4100,male,2009
332,Chinstrap,Dream,50.2,18.7,198,3775,female,2009


In [116]:
def py_tidy_descriptive_stats(DT):
    """Generate summary statistics for every column of a numeric datatable Frame.

    Parameters
    ----------
    DT : datatable.Frame
        Frame whose columns are all numeric. Missing values (NA) are ignored
        consistently by every statistic, including the quartiles and the
        sample size used for the standard error.

    Returns
    -------
    datatable.Frame
        A `descriptive_stats` label column (Mean, Median, Min, Max, Std, Q1,
        Q3, IQR, SE) followed by one column per input column, with every
        value rounded to 3 decimals. Std is `np.nanstd` with its default
        ddof, and SE is Std divided by the square root of the number of
        non-missing values.
    """
    stat_labels = ['Mean','Median','Min','Max','Std','Q1','Q3','IQR','SE']

    def _quartiles(values):
        # NumPy >= 1.22 names this keyword `method`; older releases only
        # accept the now-deprecated `interpolation`.
        try:
            return np.nanpercentile(values, [25, 75], method='midpoint')
        except TypeError:
            return np.nanpercentile(values, [25, 75], interpolation='midpoint')

    def _column_stats(raw_values):
        # to_dict() yields plain lists in which NA is None; a float array
        # turns those into NaN so the nan-aware functions can skip them.
        values = np.asarray(raw_values, dtype=float)
        n_valid = np.count_nonzero(~np.isnan(values))
        q1, q3 = _quartiles(values)
        std = np.nanstd(values)
        return [np.nanmean(values),
                np.nanmedian(values),
                np.nanmin(values),
                np.nanmax(values),
                std,
                q1,
                q3,
                q3 - q1,
                std / np.sqrt(n_valid)]

    rounded_stats = {
        col_name: [float(np.round(stat, 3)) for stat in _column_stats(col_values)]
        for col_name, col_values in DT.to_dict().items()
    }
    labels_dt = dt.Frame({'descriptive_stats': stat_labels})
    return dt.cbind(labels_dt, dt.Frame(rounded_stats))

In [117]:
# Summary statistics for the four numeric measurement columns
py_tidy_descriptive_stats(penguins_numericos_dt)

Unnamed: 0_level_0,descriptive_stats,flipper_length_mm,body_mass_g,bill_length_mm,bill_depth_mm
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,Mean,200.967,4207.057,43.993,17.165
1,Median,197.0,4050.0,44.5,17.3
2,Min,172.0,2700.0,32.1,13.1
3,Max,231.0,6300.0,59.6,21.5
4,Std,13.995,804.006,5.46,1.966
5,Q1,190.0,3550.0,39.5,15.6
6,Q3,213.0,4775.0,48.6,18.7
7,IQR,23.0,1225.0,9.1,3.1
8,SE,0.767,44.059,0.299,0.108


In [122]:
# Numeric measurements with the sex column appended back.
# Both parts are column selections of penguins_tidy_dt, so they have the same rows.
penguins_sex_num = dt.cbind(penguins_numericos_dt,penguins_tidy_dt[:,f.sex])

In [133]:
# Binned histogram of flipper length, one panel per sex
(
    alt.Chart(penguins_sex_num[:, (f.flipper_length_mm, f.sex)].to_pandas())
    .mark_bar()
    .encode(x=alt.X('flipper_length_mm', bin=True), y='count()')
    .facet('sex')
)

In [140]:
# Binned histogram of bill length, one panel per sex
(
    alt.Chart(penguins_sex_num[:, (f.bill_length_mm, f.sex)].to_pandas())
    .mark_bar()
    .encode(x=alt.X('bill_length_mm', bin=True), y='count()')
    .facet('sex')
)

In [141]:
# Binned histogram of body mass, one panel per sex
(
    alt.Chart(penguins_sex_num[:, (f.body_mass_g, f.sex)].to_pandas())
    .mark_bar()
    .encode(x=alt.X('body_mass_g', bin=True), y='count()')
    .facet('sex')
)