# 重塑和轴向旋转

In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

## df.stack() / df.unstack() - 重塑层次化索引


In [2]:
# Build a small 2x3 example frame. Both the row index ('state') and the
# column index ('number') are named so that later stack/unstack calls can
# refer to a level by name.
data = DataFrame(
    data=np.arange(6).reshape(2, 3),
    index=pd.Index(['Ohio', 'Colorado'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [3]:
# Pivot the column labels into the innermost row-index level, producing a
# Series indexed by (state, number).
result = data.stack(level=-1)
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64

In [4]:
# Move the innermost row-index level back into the columns. unstack()
# defaults to level=-1 (the last level); for this two-level index that is
# the same level as position 1.
result.unstack(level=-1)


number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [7]:
# The innermost level is unstacked by default, but any level can be chosen
# explicitly: here the outer level (position 0) is moved to the columns.
result.unstack(level=0)


state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [8]:
# The level to unstack can also be selected by its name instead of its
# position.
result.unstack(level='state')


state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [9]:
# Two Series with partially overlapping labels, concatenated under the outer
# keys 'one' and 'two' so the result carries a two-level index.
s1 = Series([0, 1, 2, 3], index=list('abcd'))
s2 = Series([4, 5, 6], index=list('cde'))
data2 = pd.concat(objs=[s1, s2], keys=['one', 'two'])
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [10]:
# Labels that exist under only one outer key have no value for the other
# key; those cells are filled with NaN automatically.
data2.unstack(level=-1)

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [15]:
# Intentional failure: unstack() needs a MultiIndex. s1 has a flat index, so
# this call raises "ValueError: index must be a MultiIndex to unstack".
s1.unstack(level=-1)

ValueError: index must be a MultiIndex to unstack, <class 'pandas.core.indexes.base.Index'> was passed

In [11]:
# Round trip: stack() drops the NaN cells that unstack() introduced.
data2.unstack(level=-1).stack(level=-1)


one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [16]:
# Same round trip, but dropna=False keeps the NaN placeholders.
# NOTE(review): newer pandas releases add a `future_stack` option to stack();
# confirm `dropna` is still accepted on the pandas version in use.
data2.unstack(level=-1).stack(level=-1, dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

In [17]:
# Put `result` and a copy shifted by 5 side by side as two columns; the
# column index itself is named 'side'.
df = DataFrame(
    data={'left': result, 'right': result + 5},
    columns=pd.Index(['left', 'right'], name='side'),
)
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [18]:
 # "stack" literally means piling up, so "unstack" can be read as "spreading
 # out": the 'state' labels, previously piled up along the rows, are spread
 # out across the columns. 'state' ends up beneath 'side' as the innermost
 # column level. Other levels can be spread out in exactly the same way.
 df.unstack(level='state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [19]:
# After spreading 'state' across the columns, stack the 'side' column level
# back into the rows; it becomes a row-index level, placed innermost.
df.unstack(level='state').stack(level='side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


## df.melt - 将“宽格式”旋转为“长格式”


注意：melt 并不是简单的转置（`df.T`）。它保留 `id_vars` 指定的标识列，把其余各列“融化”成（变量名, 值）两列，因此行数会相应增加；其逆操作是 pivot。

In [32]:
# Load the macrodata CSV (path relative to the working directory) and
# preview its first five rows.
ldata = pd.read_csv(filepath_or_buffer='data/macrodata.csv')
ldata.head(n=5)

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [33]:
# Build a quarterly PeriodIndex from the year/quarter columns, then convert
# each quarter to the first hour of its last month
# (quarter -> last month of the quarter -> first hour of that month).
# These two lines are subtle but come in handy.
# NOTE(review): the field-based PeriodIndex constructor and the 'H' alias are
# reportedly deprecated in recent pandas releases; confirm against the pandas
# version in use.
date = pd.PeriodIndex(year=ldata['year'], quarter=ldata['quarter'], freq='Q')
ldata['date'] = date.asfreq('M', how='e').asfreq('H', how='s').values


官方文档`PeriodIndex` 相关：https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.PeriodIndex.html?highlight=periodindex#pandas.PeriodIndex


In [29]:
# cc：之前有遇到的需求，这里作为一个例子：

# 先来构造一下数据；
# Fabricate a month column for the example: assume every row is in July.
ldata['month'] = 7

# Assemble a monthly period from the separate year and month columns.
date_new = pd.PeriodIndex(year=ldata['year'], month=ldata['month'], freq='M')
# This already satisfies the requirement; the official docs list further
# options. Time-series topics will get a notebook of their own later.
ldata['date_new'] = date_new
ldata.head(n=5)


Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint,date,month,date_new
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0,1959-03-01 00:00,7,1959-07
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74,1959-06-01 00:00,7,1959-07
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09,1959-09-01 00:00,7,1959-07
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06,1959-12-01 00:00,7,1959-07
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19,1960-03-01 00:00,7,1960-07


In [30]:
# Keep only the identifier column and the three measures to be reshaped.
ldata = ldata.loc[:, ['date', 'realgdp', 'infl', 'unemp']]

# melt turns columns into rows (wide -> long): 'date' is kept as the
# identifier, and each of the three measure columns becomes a block of rows
# labelled by 'item' with its number in 'value'. melt is the inverse of pivot.
ldata = pd.melt(
    ldata,
    id_vars=['date'],
    value_vars=['realgdp', 'infl', 'unemp'],
    var_name='item',
)

ldata

Unnamed: 0,date,item,value
0,1959-03-01 00:00,realgdp,2710.349
1,1959-06-01 00:00,realgdp,2778.801
2,1959-09-01 00:00,realgdp,2775.488
3,1959-12-01 00:00,realgdp,2785.204
4,1960-03-01 00:00,realgdp,2847.699
...,...,...,...
604,2008-09-01 00:00,unemp,6.000
605,2008-12-01 00:00,unemp,6.900
606,2009-03-01 00:00,unemp,8.100
607,2009-06-01 00:00,unemp,9.200


## df.pivot 数据透视表

官方文档：https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html

In [12]:
# Turn the distinct values of 'item' back into columns, one row per 'date',
# filled from the 'value' column.
# pivot takes three parameters: index, columns and values. They are passed by
# keyword because pandas 2.0 made them keyword-only; positional arguments
# raise TypeError there, while keywords work on every version.
pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted.head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-01 00:00,0.0,2710.349,5.8
1959-06-01 00:00,2.34,2778.801,5.1
1959-09-01 00:00,2.74,2775.488,5.3
1959-12-01 00:00,0.27,2785.204,5.6
1960-03-01 00:00,2.31,2847.699,5.2


In [13]:
# Same pivot without `values`: every remaining column is pivoted, so the
# result gets a two-level column index (original column name, item).
# Keyword arguments are used because pandas 2.0 made pivot's parameters
# keyword-only.
pivoted = ldata.pivot(index='date', columns='item')
pivoted.head()

Unnamed: 0_level_0,value,value,value
item,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1959-03-01 00:00,0.0,2710.349,5.8
1959-06-01 00:00,2.34,2778.801,5.1
1959-09-01 00:00,2.74,2775.488,5.3
1959-12-01 00:00,0.27,2785.204,5.6
1960-03-01 00:00,2.31,2847.699,5.2


In [21]:
# Add a second value column filled with random normal draws.
ldata['value2'] = np.random.randn(len(ldata))

# 'date' becomes the row index. Each (date, item) pair now carries two value
# columns, so the original column names ('value', 'value2') form the outer
# column level and the entries of 'item' form the inner one.
# Keyword arguments are used because pandas 2.0 made pivot's parameters
# keyword-only.
pivoted = ldata.pivot(index='date', columns='item')
pivoted.head()

Unnamed: 0_level_0,value,value,value,value2,value2,value2,month,month,month
item,infl,realgdp,unemp,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1959-03-01 00:00,0.0,2710.349,5.8,-0.302494,-0.114487,-1.275114,7,7,7
1959-06-01 00:00,2.34,2778.801,5.1,-0.845628,-1.451921,-0.475165,7,7,7
1959-09-01 00:00,2.74,2775.488,5.3,1.060127,-0.137157,0.470683,7,7,7
1959-12-01 00:00,0.27,2785.204,5.6,-0.1836,1.36326,-0.89956,7,7,7
1960-03-01 00:00,2.31,2847.699,5.2,-0.645694,-0.447991,0.60756,7,7,7


In [16]:
# set_index followed by unstack is equivalent to
# pivot(index='date', columns='item').
unstacked = ldata.set_index(keys=['date', 'item']).unstack(level='item')
unstacked.head(n=5)

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-01 00:00,0.0,2710.349,5.8,-0.051205,-0.025836,1.258929
1959-06-01 00:00,2.34,2778.801,5.1,-1.602371,-0.180203,-0.848458
1959-09-01 00:00,2.74,2775.488,5.3,-1.945448,1.573056,0.236818
1959-12-01 00:00,0.27,2785.204,5.6,0.300125,-0.466428,0.735348
1960-03-01 00:00,2.31,2847.699,5.2,0.044653,0.916515,-1.07272


In [19]:
# This is the "stacked" layout: the entries of 'item' are still piled up
# along the rows. Calling unstack on 'item' is what turns those rows into
# columns.
ldata.set_index(keys=['date', 'item'])

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value2
date,item,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-01 00:00,realgdp,2710.349,-0.025836
1959-06-01 00:00,realgdp,2778.801,-0.180203
1959-09-01 00:00,realgdp,2775.488,1.573056
1959-12-01 00:00,realgdp,2785.204,-0.466428
1960-03-01 00:00,realgdp,2847.699,0.916515
...,...,...,...
2008-09-01 00:00,unemp,6.000,1.125638
2008-12-01 00:00,unemp,6.900,-2.230287
2009-03-01 00:00,unemp,8.100,-1.290653
2009-06-01 00:00,unemp,9.200,-1.472111
