### 讀取資料
首先，我們用 pandas 讀取最主要的資料 application_train.csv

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
# 設定 data_path
dir_data = './data/'

#### 用 pd.read_csv 來讀取資料

In [4]:
f_app = os.path.join(dir_data, 'application_train.csv')
print('Path of read in data: %s' % (f_app))
app_train = pd.read_csv(f_app)

Path of read in data: ./data/application_train.csv


#### Note: 在 jupyter notebook 中，可以使用 `?` 來調查函數的定義

In [5]:
# for example
?pd.read_csv

#### 接下來我們可以用 .head() 這個函數來觀察前 5 row 資料

In [6]:
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


## 練習時間
資料的操作有很多，接下來的馬拉松中我們會介紹常被使用到的操作，參加者不妨先自行想像一下，第一次看到資料，我們一般會想知道什麼訊息？

#### Ex: 如何知道資料的 row 數以及 column 數、有什麼欄位、多少欄位、如何截取部分的資料等等

有了對資料的好奇之後，我們又怎麼通過程式碼來達成我們的目的呢？

#### 可參考該[基礎教材](https://bookdata.readthedocs.io/en/latest/base/01_pandas.html#DataFrame-%E5%85%A5%E9%97%A8)或自行 google

#### 資料的 row 數以及 column 數

In [14]:
row_number, col_number = app_train.shape
print('Number of data row : %i' % (row_number))
print('Number of data column : %i' % (col_number))

Number of data row : 307511
Number of data column : 122


#### 列出所有欄位

In [16]:
print('Column of data : %s' % (app_train.columns))

Column of data : Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)


#### 截取部分資料 (總收入大於 200000 且沒有小孩)

In [23]:
app_train[(app_train['AMT_INCOME_TOTAL'] > 200000.0) & (app_train['CNT_CHILDREN'] == 0)] 

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
7,100010,0,Cash loans,M,Y,Y,0,360000.0,1530000.0,42075.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
49,100056,0,Cash loans,M,Y,Y,0,360000.0,1506816.0,49927.5,...,0,0,0,0,,,,,,
59,100070,0,Cash loans,M,Y,Y,0,540000.0,1227901.5,46899.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
62,100073,0,Cash loans,M,Y,Y,0,324000.0,1130760.0,40189.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
84,100099,0,Cash loans,F,N,Y,0,360000.0,733315.5,41076.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
94,100112,1,Cash loans,M,Y,Y,0,315000.0,953460.0,64107.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
188,100218,0,Cash loans,M,Y,Y,0,337500.0,876078.0,49050.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
219,100255,0,Cash loans,M,Y,Y,0,315000.0,824544.0,52825.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
225,100262,0,Cash loans,M,Y,Y,0,315000.0,1575000.0,41548.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
236,100274,0,Cash loans,M,Y,Y,0,308250.0,1305000.0,38281.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
