In [1]:
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Apply seaborn's defaults and set the plotting font to 'Noto Sans CJK TC'.
sns.set(font='Noto Sans CJK TC')
# Switch to matplotlib's 'ggplot' style. Keep this call after sns.set():
# presumably sns.set() applies its own style and would override an earlier one.
mpl.style.use('ggplot')  # must come after sns.set

In [2]:
# Sample records, one inner list per person.
# Column order: height in cm, weight in kg, age in years, male flag (1 = male, 0 = female).
health_lists = [
    [152, 48, 63, 1],
    [157, 53, 41, 1],
    [140, 37, 63, 0],
    [137, 32, 65, 0],
]

## DataFrame

Just like an Excel sheet.

- Series: A column is a series.
- Index: the labels that identify the rows or the columns. Sometimes "index" refers specifically to the row labels (the row axis).
    - Label: a value in an index.

### Create

In [3]:
# Build the frame from the nested lists: each inner list becomes one row.
health_df = pd.DataFrame(
    data=health_lists,
    columns=['height_cm', 'weight_kg', 'age', 'male_yn'],
    # Explicit row labels are given here for illustration only; with many
    # rows we would usually omit `index` and let pandas generate it.
    index=list('ABCD'),
)
health_df

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1
B,157,53,41,1
C,140,37,63,0
D,137,32,65,0


### Read

#### Overview

In [4]:
# A DataFrame has one Index per axis: the column labels first, then the row labels.
display(health_df.columns, health_df.index)

Index(['height_cm', 'weight_kg', 'age', 'male_yn'], dtype='object')

Index(['A', 'B', 'C', 'D'], dtype='object')

In [5]:
# (number of rows, number of columns), followed by the data type of every column.
display(health_df.shape, health_df.dtypes)

(4, 4)

height_cm    int64
weight_kg    int64
age          int64
male_yn      int64
dtype: object

In [6]:
# The frame's data as a NumPy ndarray (n-dimensional array), without the row/column labels.
health_df.values

array([[152,  48,  63,   1],
       [157,  53,  41,   1],
       [140,  37,  63,   0],
       [137,  32,  65,   0]])

#### Dict-Like Access

In [7]:
# A single column label returns that column as a Series.
health_df['height_cm']

A    152
B    157
C    140
D    137
Name: height_cm, dtype: int64

In [8]:
# Attribute access is a shortcut for the same lookup as health_df['height_cm'].
health_df.height_cm

A    152
B    157
C    140
D    137
Name: height_cm, dtype: int64

In [9]:
# Index the column Series by a row label to get a single value.
health_df['height_cm']['A']

152

In [10]:
# A list of column labels returns those columns as a DataFrame.
health_df[['height_cm', 'weight_kg']]

Unnamed: 0,height_cm,weight_kg
A,152,48
B,157,53
C,140,37
D,137,32


#### List-Like Access

In [11]:
# A slice selects rows: here the first two rows, returned as a DataFrame.
health_df[:2]

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1
B,157,53,41,1


In [12]:
# .loc[rows, columns] selects by label: all rows, the two listed columns.
health_df.loc[:, ['height_cm', 'weight_kg']]

Unnamed: 0,height_cm,weight_kg
A,152,48
B,157,53
C,140,37
D,137,32


In [13]:
# All rows, columns up to and including 'weight_kg' (a label slice includes its end label).
health_df.loc[:, :'weight_kg']

Unnamed: 0,height_cm,weight_kg
A,152,48
B,157,53
C,140,37
D,137,32


In [14]:
# Rows up to and including 'B', columns up to and including 'weight_kg'.
health_df.loc[:'B', :'weight_kg']

Unnamed: 0,height_cm,weight_kg
A,152,48
B,157,53


In [15]:
# .iloc selects by integer position: the first two rows, the first two columns.
health_df.iloc[:2, :2]

Unnamed: 0,height_cm,weight_kg
A,152,48
B,157,53


#### Boolean Indexing

In [16]:
# Keep only the rows where the boolean mask is True, i.e. where male_yn equals 1.
health_df[health_df.male_yn == 1]

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1
B,157,53,41,1


In [17]:
# Combine two masks element-wise with &. Each comparison needs its own
# parentheses because & binds more tightly than == and >.
health_df[(health_df.male_yn == 1) & (health_df.age > 60)]

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1


#### Transpose

In [18]:
# Swap the rows and the columns.
health_df.T

Unnamed: 0,A,B,C,D
height_cm,152,157,140,137
weight_kg,48,53,37,32
age,63,41,63,65
male_yn,1,1,0,0


### Update

#### BMI

$ BMI = \dfrac{weight}{height^{2}} $

Where:

- $ weight $: weight in kg
- $ height $: height in m

In [19]:
# Height is converted from cm to m before squaring; the arithmetic is applied
# element-wise, giving one BMI value per row.
bmi_s = health_df.weight_kg / (health_df.height_cm/100)**2
# or
#bmi_s = health_df.weight_kg / (health_df.height_cm/100).pow(2)
# or (np.power is available in every NumPy version; np.pow is not)
#bmi_s = health_df.weight_kg / np.power(health_df.height_cm/100, 2)
bmi_s

A    20.775623
B    21.501886
C    18.877551
D    17.049390
dtype: float64

#### BMR

$ P = 10m + 6.25h - 5a + s $

Where:

- $ P $: BMR, kcal / day
- $ m $: weight in kg
- $ h $: height in cm
- $ a $: age in years
- $ s $: +5 for males, -161 for females

$ \equiv $

$ P = 10m + 6.25h - 5a + 5g - 161(1-g) $

Where:

- $ g $: gender in int, 0 is female, 1 is male.

In [20]:
# Work on a copy so that the other cells, which use health_df, are not affected.
tmp_df = health_df.copy()
# Female indicator: 1 where male_yn is 0, and 0 where male_yn is 1.
tmp_df['female_yn'] = 1 - health_df.male_yn
tmp_df

Unnamed: 0,height_cm,weight_kg,age,male_yn,female_yn
A,152,48,63,1,0
B,157,53,41,1,0
C,140,37,63,0,1
D,137,32,65,0,1


In [21]:
# Select the columns used by the BMR formula explicitly, so the result does not
# depend on the column order and the cell still works when tmp_df has extra
# columns (e.g. when it is re-run after 'bmi' and 'bmr' have been added).
bmr_cols = ['height_cm', 'weight_kg', 'age', 'male_yn', 'female_yn']
# On a DataFrame:
# .sum() or .sum(axis=0): sum down the rows    -> one total per column (variable)
# .sum(axis=1):           sum across the columns -> one total per row (sample)
# The coefficients are listed in the same order as bmr_cols.
bmr_s = (tmp_df[bmr_cols] * [6.25, 10, -5, 5, -161]).sum(axis=1)
# or
#bmr_s = tmp_df[bmr_cols].dot([6.25, 10, -5, 5, -161])
bmr_s

A    1120.00
B    1311.25
C     769.00
D     690.25
dtype: float64

In [22]:
# Assigning a Series to a new column label adds it to the frame as a column.
tmp_df['bmi'] = bmi_s
tmp_df['bmr'] = bmr_s
tmp_df

Unnamed: 0,height_cm,weight_kg,age,male_yn,female_yn,bmi,bmr
A,152,48,63,1,0,20.775623,1120.0
B,157,53,41,1,0,21.501886,1311.25
C,140,37,63,0,1,18.877551,769.0
D,137,32,65,0,1,17.04939,690.25


### Delete

In [23]:
# Start again from a fresh copy, so the deletion below leaves health_df intact.
tmp_df = health_df.copy()
tmp_df

Unnamed: 0,height_cm,weight_kg,age,male_yn
A,152,48,63,1
B,157,53,41,1
C,140,37,63,0
D,137,32,65,0


In [24]:
# del removes the column from tmp_df itself (no new frame is returned).
del tmp_df['male_yn']
tmp_df

Unnamed: 0,height_cm,weight_kg,age
A,152,48,63
B,157,53,41
C,140,37,63
D,137,32,65


## NumPy

Let's borrow the [math notation of linear regression](https://en.wikipedia.org/wiki/Linear_regression#Introduction) to calculate the BMR.

$ {\displaystyle \{y_{i},\,x_{i1},\ldots ,x_{ip}\}_{i=1}^{n}} $

* $ y_i $: the dependent variable of the $ i $-th statistical unit.
* $ x_{ip} $: the $ p $-th independent variable of the $ i $-th statistical unit.
* $ n $: the number of statistical units.

$ {\displaystyle y_{i}=\beta _{0}1+\beta _{1}x_{i1}+\cdots +\beta _{p}x_{ip}+\varepsilon _{i}=\mathbf {x} _{i}^{\top }{\boldsymbol {\beta }}+\varepsilon _{i},\qquad i=1,\ldots ,n,} $

* $ \beta $: the parameter.
* $ \varepsilon $: the error term or noise, an unobserved random variable.
* $ \mathbf {x}_{i} $, $ {\boldsymbol {\beta }} $: vectors.

Stack these $ n $ equations:

$ {\displaystyle \mathbf {y} =X{\boldsymbol {\beta }}+{\boldsymbol {\varepsilon }},\,} $

Where:

$ \mathbf {y} ={\begin{pmatrix}y_{1}\\y_{2}\\\vdots \\y_{n}\end{pmatrix}},\quad $

$ {\displaystyle X={\begin{pmatrix}\mathbf {x} _{1}^{\top }\\\mathbf {x} _{2}^{\top }\\\vdots \\\mathbf {x} _{n}^{\top }\end{pmatrix}}={\begin{pmatrix}1&x_{11}&\cdots &x_{1p}\\1&x_{21}&\cdots &x_{2p}\\\vdots &\vdots &\ddots &\vdots \\1&x_{n1}&\cdots &x_{np}\end{pmatrix}},} $

$ {\displaystyle {\boldsymbol {\beta }}={\begin{pmatrix}\beta _{0}\\\beta _{1}\\\beta _{2}\\\vdots \\\beta _{p}\end{pmatrix}},\quad {\boldsymbol {\varepsilon }}={\begin{pmatrix}\varepsilon _{1}\\\varepsilon _{2}\\\vdots \\\varepsilon _{n}\end{pmatrix}}.} $

### Link the Math and Python – `mat`

In [25]:
# Build a 2-D matrix object from the nested lists.
# np.asmatrix is the function that np.mat aliased; np.mat itself was removed
# in NumPy 2.0, so the explicit name works on both old and new versions.
health_m = np.asmatrix(health_lists)

In [26]:
# Append one more column on the right: 1 minus the last column (male_yn),
# i.e. the female indicator used by the BMR formula.
X_m = np.hstack((health_m, 1-health_m[:, -1:]))
X_m

matrix([[152,  48,  63,   1,   0],
        [157,  53,  41,   1,   0],
        [140,  37,  63,   0,   1],
        [137,  32,  65,   0,   1]])

In [27]:
# Column vector (5x1) of BMR coefficients, in the same order as the columns of X_m.
# np.asmatrix is the function that np.mat aliased; np.mat itself was removed
# in NumPy 2.0, so the explicit name works on both old and new versions.
beta_m = np.asmatrix([[6.25, 10, -5, 5, -161]]).T
beta_m

matrix([[   6.25],
        [  10.  ],
        [  -5.  ],
        [   5.  ],
        [-161.  ]])

In [28]:
# With matrix objects, * is matrix multiplication: (4x5) * (5x1) -> (4x1).
X_m*beta_m

matrix([[ 1120.  ],
        [ 1311.25],
        [  769.  ],
        [  690.25]])

### But Usually We Don't Care About the Math – `array`

In [29]:
# The same data as a plain 2-D ndarray.
health_a = np.array(health_lists)

In [30]:
# Append one more column on the right: 1 minus the last column (male_yn).
# The [:, -1:] slice keeps the column 2-D so that hstack can join it.
X_a = np.hstack((health_a, 1-health_a[:, -1:]))
X_a

array([[152,  48,  63,   1,   0],
       [157,  53,  41,   1,   0],
       [140,  37,  63,   0,   1],
       [137,  32,  65,   0,   1]])

In [31]:
# A 1-D coefficient vector. Multiplying it with the 2-D X_a relies on
# broadcasting, which applies the vector to every row.
beta_a = np.array([6.25, 10, -5, 5, -161])
beta_a

array([   6.25,   10.  ,   -5.  ,    5.  , -161.  ])

In [32]:
# With ndarrays, * is element-wise: every row of X_a is multiplied by beta_a.
X_a*beta_a

array([[ 950.  ,  480.  , -315.  ,    5.  ,   -0.  ],
       [ 981.25,  530.  , -205.  ,    5.  ,   -0.  ],
       [ 875.  ,  370.  , -315.  ,    0.  , -161.  ],
       [ 856.25,  320.  , -325.  ,    0.  , -161.  ]])

In [33]:
# Summing each row of the element-wise products gives one BMR value per person.
(X_a*beta_a).sum(axis=1)

array([ 1120.  ,  1311.25,   769.  ,   690.25])

In [34]:
# .dot does the same multiply-and-sum in a single step.
X_a.dot(beta_a)

array([ 1120.  ,  1311.25,   769.  ,   690.25])

But when you are confused, use `mat` to represent the math in Python.

## Dig More

- [DataFrame – Pandas](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
- [10 Minutes to pandas – Pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html)
- [Mathematical functions – NumPy](https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.math.html)
- [Broadcasting – NumPy](https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html)
- [Linear algebra – NumPy](https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.linalg.html)