In [1]:
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# apply seaborn's defaults, using the 'Noto Sans CJK TC' font for text
sns.set(font='Noto Sans CJK TC')
# sns.set also sets the plot style, so switch to ggplot only afterwards
mpl.style.use('ggplot')  # must come after sns.set

## DataFrame

Just like an Excel sheet.

- Series: A column is a series.
- Index: The values that identify the rows or the columns. Sometimes "index" refers specifically to the row labels.
    - Label: a value in an index.

### Create

In [2]:
# one inner list per person; the values are height (cm), weight (kg), age
# and a male flag (1 or 0), in the order of the column names used below
health_lists = [
    [152, 48, 63, 1],
    [157, 53, 41, 1],
    [140, 37, 63, 0],
    [137, 32, 65, 0],
]

In [3]:
# build the DataFrame from the nested lists: one inner list per row
health_pd = pd.DataFrame(
    health_lists,
    # real data usually has many rows, so we would normally omit `index`
    # and let pandas generate one; explicit labels are given here
    index=['A', 'B', 'C', 'D'],
    columns=['height_cm', 'weight_kg', 'age', 'male_yn']
)
health_pd

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1
B,157,53,41,1
C,140,37,63,0
D,137,32,65,0


### Read

#### Overview

In [4]:
# the labels along each of the two axes
display(
    # column index
    health_pd.columns,
    # row index
    health_pd.index,
)

Index(['height_cm', 'weight_kg', 'age', 'male_yn'], dtype='object')

Index(['A', 'B', 'C', 'D'], dtype='object')

In [5]:
display(
    # (number of rows, number of columns)
    health_pd.shape,
    # data type of each column
    health_pd.dtypes,
)

(4, 4)

height_cm    int64
weight_kg    int64
age          int64
male_yn      int64
dtype: object

In [6]:
# the underlying data as a NumPy ndarray (n-dimensional array), without the labels
health_pd.values

array([[152,  48,  63,   1],
       [157,  53,  41,   1],
       [140,  37,  63,   0],
       [137,  32,  65,   0]])

#### Dict-Like Access

In [7]:
# a column name -> that single column, as a Series
health_pd['height_cm']

A    152
B    157
C    140
D    137
Name: height_cm, dtype: int64

In [8]:
# shortcut: attribute access, same result as health_pd['height_cm'];
# it only works when the column name is a valid Python identifier
# and does not clash with an existing DataFrame attribute or method
health_pd.height_cm

A    152
B    157
C    140
D    137
Name: height_cm, dtype: int64

In [9]:
# -> a single value: select the column first, then the row label
health_pd['height_cm']['A']

152

In [10]:
# a list of column names -> those columns, as a DataFrame
health_pd[['height_cm', 'weight_kg']]

Unnamed: 0,height_cm,weight_kg
A,152,48
B,157,53
C,140,37
D,137,32


#### List-Like Access

In [11]:
# a slice selects rows -> here the first two rows, as a DataFrame
health_pd[:2]

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1
B,157,53,41,1


In [12]:
# .loc[rows, columns] selects by label: all rows, the two named columns
health_pd.loc[:, ['height_cm', 'weight_kg']]

Unnamed: 0,height_cm,weight_kg
A,152,48
B,157,53
C,140,37
D,137,32


In [13]:
# all rows, columns up to and including 'weight_kg'
# (unlike positional slices, a label slice includes its end label)
health_pd.loc[:, :'weight_kg']

Unnamed: 0,height_cm,weight_kg
A,152,48
B,157,53
C,140,37
D,137,32


In [14]:
# rows up to and including 'B', columns up to and including 'weight_kg'
health_pd.loc[:'B', :'weight_kg']

Unnamed: 0,height_cm,weight_kg
A,152,48
B,157,53


In [15]:
# .iloc selects by integer position (end excluded):
# the first two rows, the first two columns
health_pd.iloc[:2, :2]

Unnamed: 0,height_cm,weight_kg
A,152,48
B,157,53


#### Boolean Indexing

In [16]:
# the comparison yields a boolean Series; used as a mask it keeps
# only the rows where male_yn is 1
health_pd[health_pd.male_yn == 1]

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1
B,157,53,41,1


In [17]:
# combine conditions with & (element-wise "and"); each condition needs its
# own parentheses because & binds more tightly than == and >
health_pd[(health_pd.male_yn == 1) & (health_pd.age > 60)]

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1


#### Transpose

In [18]:
# swap rows and columns: the row labels become the column labels and vice versa
health_pd.T

Unnamed: 0,A,B,C,D
height_cm,152,157,140,137
weight_kg,48,53,37,32
age,63,41,63,65
male_yn,1,1,0,0


### Update

In [19]:
# work on a copy so that adding columns to it leaves health_pd untouched
health_pd_2 = health_pd.copy()
health_pd_2

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1
B,157,53,41,1
C,140,37,63,0
D,137,32,65,0


#### BMI

$ BMI = \dfrac{weight}{height^{2}} $

Where:

- $ weight $: weight in kg
- $ height $: height in m

In [20]:
# BMI: weight in kg divided by the square of the height in m (height_cm / 100);
# the arithmetic is element-wise, giving one value per row
bmi_s = health_pd_2.weight_kg / (health_pd_2.height_cm/100)**2
# or
#bmi_s = health_pd_2.weight_kg / (health_pd_2.height_cm/100).pow(2)
# or (np.power is available in every NumPy version; np.pow only from NumPy 2.0)
#bmi_s = health_pd_2.weight_kg / np.power(health_pd_2.height_cm/100, 2)
bmi_s

A    20.775623
B    21.501886
C    18.877551
D    17.049390
dtype: float64

#### BMR

$ P = 10m + 6.25h - 5a + s $

Where:

- $ P $: BMR, kcal / day
- $ m $: weight in kg
- $ h $: height in cm
- $ a $: age in years
- $ s $: +5 for males, -161 for females

$ \equiv $

$ P = 10m + 6.25h - 5a + 5g - 161(1-g) $

Where:

- $ g $: gender in int, 0 is female, 1 is male.

In [21]:
# female_yn is the complement of male_yn, i.e. the (1 - g) term of the BMR formula
health_pd_2['female_yn'] = 1 - health_pd_2.male_yn
health_pd_2

Unnamed: 0,height_cm,weight_kg,age,male_yn,female_yn
A,152,48,63,1,0
B,157,53,41,1,0
C,140,37,63,0,1
D,137,32,65,0,1


In [22]:
# DataFrame.sum(): same as .sum(axis=0)
# .sum(axis=0): sum along the 0th axis = down the rows, one result per column (variable)
# .sum(axis=1): sum along the 1st axis = across the columns, one result per row (sample)

# Coefficients of the BMR formula, in the same order as the columns they multiply.
bmr_cols = ['height_cm', 'weight_kg', 'age', 'male_yn', 'female_yn']
bmr_coefs = np.array([6.25, 10, -5, 5, -161])

# Select the columns by name first, so the result depends neither on the column
# order of health_pd_2 nor on extra columns (e.g. bmi, bmr) added to it later;
# the 1-D array is then broadcast across every row before summing each row.
bmr_s = (health_pd_2[bmr_cols] * bmr_coefs).sum(axis=1)
bmr_s

A    1120.00
B    1311.25
C     769.00
D     690.25
dtype: float64

In [23]:
# store the two computed Series as new columns; the values are matched
# to the rows by their labels
health_pd_2['bmi'] = bmi_s
health_pd_2['bmr'] = bmr_s
health_pd_2

Unnamed: 0,height_cm,weight_kg,age,male_yn,female_yn,bmi,bmr
A,152,48,63,1,0,20.775623,1120.0
B,157,53,41,1,0,21.501886,1311.25
C,140,37,63,0,1,18.877551,769.0
D,137,32,65,0,1,17.04939,690.25


### Delete

In [24]:
# start from a fresh copy of the original data so health_pd itself is not changed
health_pd_3 = health_pd.copy()
health_pd_3

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1
B,157,53,41,1
C,140,37,63,0
D,137,32,65,0


In [25]:
# remove the column from health_pd_3 in place
del health_pd_3['male_yn']
health_pd_3

Unnamed: 0,height_cm,weight_kg,age
A,152,48,63
B,157,53,41
C,140,37,63
D,137,32,65


## Dig More

- [DataFrame – Pandas](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
- [Mathematical functions – NumPy](https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.math.html)
- [Broadcasting – NumPy](https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html)