## 簡介
除了 Python 原生的 dict、list 之外，介紹各類型 Data Structure 在 numpy、pandas 中的對應用法

- vector    **array    numpy.array**
- list      **pandas.Series**
- class     **class**
- scalar
- matrix    **numpy.matrix**
- array     **numpy.array**
- factors   **enum**
- DataFrame **pandas.DataFrame**



In [1]:
import numpy as np
import matplotlib as mp
import pandas as pd
%matplotlib inline
import sklearn
import os
import sys

In [2]:
# Scalar: a plain floating-point value.
a = np.pi
# Vector: a one-dimensional float array.
b = np.array([2.7, np.pi])
# String vector. `np.str` was only an alias of the builtin `str`; it was
# deprecated in NumPy 1.20 and removed in 1.24, so pass `str` directly.
c = np.array(["a", "b"], dtype=str)

In [41]:
np.array([1,"1"])

array(['1', '1'],
      dtype='<U21')

### [factor](https://stackoverflow.com/questions/34682420/python-how-to-convert-a-string-array-to-a-factor-list)


In [12]:
# Emulate an R factor: the unique labels play the role of levels and the
# inverse indices are the integer codes that map each element to its level.
classes = np.array(['a', 'b', 'c', 'c', 'b', 'a', 'a', 'd'])
classnames, indices = np.unique(classes, return_inverse=True)
# Show the levels, the codes, and the labels rebuilt from the codes.
print(classnames, indices, classnames[indices], sep="\n")

['a' 'b' 'c' 'd']
[0 1 2 2 1 0 0 3]
['a' 'b' 'c' 'c' 'b' 'a' 'a' 'd']


## List

In [31]:
d = {"A":"a","B": 1}
pd.Series(d)

A    a
B    1
dtype: object

## DataFrame

In [32]:
pd.DataFrame(data = {"x":np.arange(3),"y":["2"]*3})

Unnamed: 0,x,y
0,0,2
1,1,2
2,2,2


## [Date](https://docs.scipy.org/doc/numpy-1.12.0/reference/arrays.datetime.html)

In [34]:
np.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64')

array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64[D]')

In [35]:
np.datetime64('2009-01-01') - np.datetime64('2008-01-01')

numpy.timedelta64(366,'D')

## [Matrix](https://docs.scipy.org/doc/numpy-1.12.0/reference/generated/numpy.matrix.html)

In [36]:
np.matrix('1 2; 3 4')

matrix([[1, 2],
        [3, 4]])

In [37]:
 np.matrix([[1, 2], [3, 4]])

matrix([[1, 2],
        [3, 4]])

In [40]:

np.matrix(np.arange(12).reshape((3,4)))

matrix([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

## 4.1 附加資料到向量

In [42]:
a = np.array([1,2,3])

In [46]:
np.append(a,1)

array([1, 2, 3, 1])

In [47]:
np.append(a,[3,4])

array([1, 2, 3, 3, 4])

## 4.2 將資料插入向量
請參考 [np.insert](https://docs.scipy.org/doc/numpy/reference/generated/numpy.insert.html#numpy.insert)

In [50]:
np.insert(a,1,5)

array([1, 5, 2, 3])

## 4.3 循環規則
Python 原生的 list 沒有這個性質；NumPy 的 broadcasting 是最接近的機制，但只會延展長度為 1 的維度，不像 R 會循環重複較短的向量。

## 4.4 產生因子
 - enumerate
 - [numpy.ndenumerate](https://docs.scipy.org/doc/numpy-1.12.0/reference/generated/numpy.ndenumerate.html) 

目前還沒有看見更好的解法
有接近的[解法](https://stackoverflow.com/questions/15124439/closest-equivalent-of-a-factor-variable-in-python-pandas)

In [52]:
b = np.array([[1, 2], [3, 4]])
np.ndenumerate(b)

<numpy.lib.index_tricks.ndenumerate at 0x10e94d668>

In [62]:
s = pd.Series(["a","b","c","a"], dtype="category")
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

## 4.5 合併多向量成為單一向量與對應因子

In [68]:
data = {"a":[1,2],"b":['abc','zxy']}
pd.DataFrame(pd.DataFrame(data).stack())

Unnamed: 0,Unnamed: 1,0
0,a,1
0,b,abc
1,a,2
1,b,zxy


## 4.6 建立list


In [72]:
a = pd.Series({"a":1,"b":"3sdf","c":np.mean})

In [73]:
a["a"]

1

In [76]:
a["c"]([1,2])

1.5

## 4.7 選取list中的元素
- index
- name


In [77]:
# Select by position explicitly: `a` is label-indexed ("a", "b", "c"), and
# integer keys falling back to positions in Series[...] is deprecated in pandas 2.1+.
a.iloc[0]

1

In [78]:
# Second element by position; .iloc avoids the deprecated positional
# fallback of Series[...] on a label-indexed Series.
a.iloc[1]

'3sdf'

In [79]:
# Third element by position; .iloc avoids the deprecated positional
# fallback of Series[...] on a label-indexed Series.
a.iloc[2]

<function numpy.core.fromnumeric.mean>

In [100]:
df = pd.DataFrame(data = a).T
df.index = ["1asdf"]
df

Unnamed: 0,a,b,c
1asdf,1,3sdf,<function mean at 0x10ac18e18>


In [101]:
#https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-integer
df.iloc[0,:2] # 用row index

a       1
b    3sdf
Name: 1asdf, dtype: object

## 5.8 用名稱選取列表元素

In [104]:
df.loc["1asdf",:] #用row name

a                                 1
b                              3sdf
c    <function mean at 0x10ac18e18>
Name: 1asdf, dtype: object

## 5.9 建立一個key-value list

In [105]:
pd.Series({"a":31312,"b":["GG"]})

a    31312
b     [GG]
dtype: object

## 5.10 從列表中移出元素
可以參考 [DataFrame.drop](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html) 或者是[範例](https://stackoverflow.com/questions/14661701/how-to-drop-a-list-of-rows-from-pandas-dataframe)

## 5.11 將列表轉換為向量


In [107]:
df.values

array([[1, '3sdf', <function mean at 0x10ac18e18>]], dtype=object)

## 5.12 移除列表中na值

In [110]:
# 2x2 frame with one missing value in the first row.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
a = pd.DataFrame(data=[[1, np.nan], [2, 2]])
a

In [112]:
a.dropna()

Unnamed: 0,0,1
1,2,2.0


## 5.13 條件式移除列表元素

In [117]:
# Three rows of nine values each: all zeros, 2..10, and 100..108.
a = pd.DataFrame(
    data=[
        np.zeros(9),
        2 + np.arange(9),
        np.linspace(100, 108, num=9),
    ]
)

In [118]:
a

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0
2,100.0,101.0,102.0,103.0,104.0,105.0,106.0,107.0,108.0


In [121]:
a == 0 

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,True,True,True,True,True,True,True,True,True
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False


In [122]:
a [ a > 5 ]

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,,,,,,,,,
1,,,,,6.0,7.0,8.0,9.0,10.0
2,100.0,101.0,102.0,103.0,104.0,105.0,106.0,107.0,108.0


In [123]:
a [ a > 5 ] = -1

In [124]:
a

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,3.0,4.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


## 5.14 初始化矩陣

In [128]:
np.matrix(np.arange(6)).reshape(2,3)

matrix([[0, 1, 2],
        [3, 4, 5]])

## 5.15 矩陣運算
- [np.linalg](https://docs.scipy.org/doc/numpy-1.12.0/reference/routines.linalg.html)

In [135]:
# Left operand: 2x2 integer matrix [[0, 1], [2, 3]].
a = np.matrix(np.arange(4).reshape(2, 2))
# Right operand: 2x2 float ndarray with every entry equal to 2.0.
b = 2 * np.ones((2, 2))

In [138]:
a

matrix([[0, 1],
        [2, 3]])

In [141]:
b

array([[ 2.,  2.],
       [ 2.,  2.]])

In [137]:
# transpose
a.T

matrix([[0, 2],
        [1, 3]])

In [139]:
#inverse matrix
np.linalg.inv(a)

matrix([[-1.5,  0.5],
        [ 1. ,  0. ]])

In [158]:
# multiply
np.matmul(a,b)

matrix([[  2.,   2.],
        [ 10.,  10.]])

## 4.16 給DataFrame 欄名稱與列名稱

In [7]:
a = pd.DataFrame(data=[[1,2],[3,4]])
a

Unnamed: 0,0,1
0,1,2
1,3,4


In [12]:
# column name
a.columns = ['A','B']
a

Unnamed: 0,A,B
a,1,2
b,3,4


In [11]:
# row name
a.index = ['a','b']
a

Unnamed: 0,A,B
a,1,2
b,3,4


## 4.17從矩陣中選擇一列或一行資料

In [17]:
M = np.matrix(np.arange(4).reshape(2,2))

In [21]:
# 第二列資料
M[1,:]

matrix([[2, 3]])

In [20]:
# 第一行資料
M[:,0]

matrix([[0],
        [2]])

In [29]:
# Flatten the first column into a 1-D ndarray with .A1 (.A would keep it 2-D)
M[:,0].A1

array([0, 2])

## 4.18 從一堆行資料組成DataFrame

In [96]:
# Column-oriented input: each list below becomes one DataFrame column.
year = list(range(2017, 2020))
month = list(range(1, 4))
Name = ['you', 'me', 'him']
pd.DataFrame.from_dict(dict(year=year, month=month, Name=Name))

Unnamed: 0,Name,month,year
0,you,1,2017
1,me,2,2018
2,him,3,2019


## 4.19從一堆列資料組成DataFrame
類似R do.call的做法,[參考連結](https://stackoverflow.com/questions/37709265/python-equivalent-for-do-callrbind-lapply-from-r)

In [99]:
# Row-oriented input: one dict per record; only the "B" value differs between rows.
a = [{"A": 1, "B": b_value, "C": 321} for b_value in (23, 11, 44)]
b = pd.DataFrame.from_records(a)
b

Unnamed: 0,A,B,C
0,1,23,321
1,1,11,321
2,1,44,321


## 4.20 增加一列到DataFrame

In [82]:
c = pd.DataFrame(b.iloc[0]).T
c

Unnamed: 0,A,B,C
0,1,23,321


In [88]:
#避免使用，效能不好
pd.concat([b,c])

Unnamed: 0,A,B,C
0,1,23,321
0,1,11,321
0,1,44,321
0,1,23,321


## 4.21 預先配置DataFrame memory
沒找到QQ
倒是找到指定特定欄位資訊

## 4.22 根據位置選擇資料框架裡的資料


In [103]:
#第二行資料  Series
b.iloc[:,1]

0    23
1    11
2    44
Name: B, dtype: int64

In [105]:
#第二行資料
b.iloc[:,1].to_frame()

Unnamed: 0,B
0,23
1,11
2,44


In [111]:
#第二,三行資料 , 直接是DataFrame
b.iloc[:,1:3]

Unnamed: 0,B,C
0,23,321
1,11,321
2,44,321


In [112]:
# First and third columns by position; a list of positions returns a DataFrame
b.iloc[:,[0,2]]

Unnamed: 0,A,C
0,1,321
1,1,321
2,1,321


## 4.23 從名稱選取行資料
- [[]] 會取出DataFrame
- []會取出 Series

In [119]:
b.loc[:,["A","C"]]

Unnamed: 0,A,C
0,1,321
1,1,321
2,1,321


In [120]:
b.loc[:,"A"]

0    1
1    1
2    1
Name: A, dtype: int64

## 4.24 選取子集
使用 dataframe loc 與 iloc就能做到

## 4.25 變更col name
```python
dataFrame.columns = [xxxxxx]
```

## 4.26 直接透過編輯器編輯DataFrame內容
好像沒有看見, 可以參考此[連結](https://stackoverflow.com/questions/10636024/python-pandas-gui-for-viewing-a-dataframe-or-matrix)

## 4.27 移除遺失值
小心對統計數值的影響

In [125]:
# Cast column "A" (the first column) to float before inserting the missing
# value: NaN cannot be stored in an integer column, and recent pandas versions
# deprecate the silent int -> float upcast on assignment.
b["A"] = b["A"].astype(float)
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
b.iloc[0, 0] = np.nan
b

Unnamed: 0,A,B,C
0,,23,321
1,1.0,11,321
2,1.0,44,321


In [126]:
b.dropna()

Unnamed: 0,A,B,C
1,1.0,11,321
2,1.0,44,321


## 4.28 根據名稱排除變數

In [133]:
# 踢掉行
b.drop(labels= 'C',axis=1)

Unnamed: 0,A,B
0,,23
1,1.0,11
2,1.0,44


In [134]:
# 踢掉列
b.drop(0)

Unnamed: 0,A,B,C
1,1.0,11,321
2,1.0,44,321


## 4.29 結合兩個框架

In [137]:
# Stack vertically (row-wise). DataFrame.append was deprecated in pandas 1.4
# and removed in 2.0; pd.concat is the supported equivalent.
pd.concat([b, b], axis=0)


Unnamed: 0,A,B,C
0,,23,321
1,1.0,11,321
2,1.0,44,321
0,,23,321
1,1.0,11,321
2,1.0,44,321


In [139]:
# Concatenate horizontally (column-wise)
pd.concat([b,b],axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
0,,23,321,,23,321
1,1.0,11,321,1.0,11,321
2,1.0,44,321,1.0,44,321


## 4.30  dataframe合併處理法
[Merge, join, and concatenate 參考文件](https://pandas.pydata.org/pandas-docs/stable/merging.html)

In [7]:
# Sample frames for the concat examples.
# df1, df2 and df3 share columns A-D and use consecutive, non-overlapping index labels.
df1 = pd.DataFrame(
    data={
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    },
    index=[0, 1, 2, 3],
)

df2 = pd.DataFrame(
    data={
        'A': ['A4', 'A5', 'A6', 'A7'],
        'B': ['B4', 'B5', 'B6', 'B7'],
        'C': ['C4', 'C5', 'C6', 'C7'],
        'D': ['D4', 'D5', 'D6', 'D7'],
    },
    index=[4, 5, 6, 7],
)

df3 = pd.DataFrame(
    data={
        'A': ['A8', 'A9', 'A10', 'A11'],
        'B': ['B8', 'B9', 'B10', 'B11'],
        'C': ['C8', 'C9', 'C10', 'C11'],
        'D': ['D8', 'D9', 'D10', 'D11'],
    },
    index=[8, 9, 10, 11],
)

# df4 shares only columns B and D with df1, and only index labels 2 and 3.
df4 = pd.DataFrame(
    data={
        'B': ['B2', 'B3', 'B6', 'B7'],
        'D': ['D2', 'D3', 'D6', 'D7'],
        'F': ['F2', 'F3', 'F6', 'F7'],
    },
    index=[2, 3, 6, 7],
)

In [3]:
pd.concat([df1,df2])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [5]:
pd.concat([df1,df2],keys=['x','y'])

Unnamed: 0,Unnamed: 1,A,B,C,D
x,0,A0,B0,C0,D0
x,1,A1,B1,C1,D1
x,2,A2,B2,C2,D2
x,3,A3,B3,C3,D3
y,4,A4,B4,C4,D4
y,5,A5,B5,C5,D5
y,6,A6,B6,C6,D6
y,7,A7,B7,C7,D7


In [8]:
# default is an outer join
pd.concat([df1, df4], axis=1)

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3
6,,,,,B6,D6,F6
7,,,,,B7,D7,F7


In [9]:
pd.concat([df1, df4], axis=1,join='inner')

Unnamed: 0,A,B,C,D,B.1,D.1,F
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [10]:
# Left join on df1's index. The `join_axes` argument was removed in pandas 1.0;
# reindexing the outer-joined result to df1.index keeps exactly df1's rows.
pd.concat([df1, df4], axis=1).reindex(df1.index)

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [11]:
# Two frames sharing an identical 'key' column, used for the merge example.
left = pd.DataFrame(
    data={
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right = pd.DataFrame(
    data={
        'key': ['K0', 'K1', 'K2', 'K3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)

In [12]:
pd.merge(left, right, on='key')

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A1,B1,K1,C1,D1
2,A2,B2,K2,C2,D2
3,A3,B3,K3,C3,D3
