# Tidy Data
整洁数据（tidy data）满足三条相互关联的规则：
- 每个变量（variable）都是一列（column）；每一列都是一个变量
- 每个观测（observation）都是一行（row）；每一行都是一个观测
- 每个值（value）都是一个单元格（cell）；每个单元格都是一个值

![concept](./pic/tidy-1.png)

In [8]:
import pandas as pd
import numpy as np

# Melt

![melt](./pic/reshaping_melt.png)

In [5]:
# Wide table: one row per person, each measurement in its own column.
df = pd.DataFrame(
    dict(
        first=["John", "Mary"],
        last=["Doe", "Bo"],
        job=["Nurse", "Economist"],
        height=[5.5, 6.0],
        weight=[130, 150],
    )
)

print("\n Unmelted: ")
print(df)

# melt() keeps the id columns and folds the listed measurement columns into
# (quantity, value) pairs. "job" appears in neither list, so it is dropped.
print("\n Melted: ")
df.melt(
    id_vars=["first", "last"],
    value_vars=["height", "weight"],
    var_name="quantity",
)


 Unmelted: 
  first last        job  height  weight
0  John  Doe      Nurse     5.5     130
1  Mary   Bo  Economist     6.0     150

 Melted: 


Unnamed: 0,first,last,quantity,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


# Wide to long

In [11]:
# Wide-format frame: the stubs "A" and "B" each carry a year suffix (1970/1980).
# Every column is given as a {row_label: value} mapping over rows 0..2.
df = pd.DataFrame(
    {
        "A1970": dict(enumerate("abc")),
        "A1980": dict(enumerate("def")),
        "B1970": dict(enumerate([2.5, 1.2, 0.7])),
        "B1980": dict(enumerate([3.2, 1.3, 0.1])),
        # Unseeded standard-normal draws, so X changes on every run.
        "X": dict(enumerate(np.random.randn(3))),
        "id": {row: row for row in range(3)},
    }
)
df

Unnamed: 0,A1970,A1980,B1970,B1980,X,id
0,a,d,2.5,3.2,0.17142,0
1,b,e,1.2,1.3,-0.09799,1
2,c,f,0.7,0.1,-1.343753,2


In [12]:
# Split "A1970"-style headers into a stub ("A"/"B") and a numeric suffix.
# The suffix becomes the "year" index level next to "id"; the non-stub
# column "X" is carried along. `sep` and `suffix` are spelled out at their
# default values to show how the headers are parsed.
pd.wide_to_long(
    df,
    stubnames=["A", "B"],
    i="id",
    j="year",
    sep="",
    suffix=r"\d+",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,X,A,B
id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1970,0.17142,a,2.5
1,1970,-0.09799,b,1.2
2,1970,-1.343753,c,0.7
0,1980,0.17142,d,3.2
1,1980,-0.09799,e,1.3
2,1980,-1.343753,f,0.1


# Stack and Unstack

![Stack](./pic/reshaping_stack.png)
![Unstack](./pic/reshaping_unstack.png)

In [13]:
# Eight (first, second) label pairs: each outer key paired with "one" and "two".
tuples = [
    (outer, inner)
    for outer in ("bar", "baz", "foo", "qux")
    for inner in ("one", "two")
]
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

# 8 x 2 frame of unseeded standard-normal draws, so values differ per run.
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.858637,0.924552
bar,two,2.701723,-1.541327
baz,one,0.333957,-0.685638
baz,two,-0.763916,0.736411
foo,one,-1.631907,-1.07411
foo,two,-0.129327,0.108215
qux,one,-0.304164,0.51652
qux,two,-0.635694,1.062862


In [14]:
# Fold the column labels ("A"/"B") into a new innermost index level.
# `df` is rebound from the DataFrame to the resulting Series, and the
# unstack cells below operate on that Series.
df = df.stack(level=-1)
df

first  second   
bar    one     A    1.858637
               B    0.924552
       two     A    2.701723
               B   -1.541327
baz    one     A    0.333957
               B   -0.685638
       two     A   -0.763916
               B    0.736411
foo    one     A   -1.631907
               B   -1.074110
       two     A   -0.129327
               B    0.108215
qux    one     A   -0.304164
               B    0.516520
       two     A   -0.635694
               B    1.062862
dtype: float64

![unstack](./pic/reshaping_unstack_0.png)

In [15]:
# Pivot the outermost index level (level 0, named "first") into columns.
df.unstack(level="first")

Unnamed: 0_level_0,first,bar,baz,foo,qux
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,A,1.858637,0.333957,-1.631907,-0.304164
one,B,0.924552,-0.685638,-1.07411,0.51652
two,A,2.701723,-0.763916,-0.129327,-0.635694
two,B,-1.541327,0.736411,0.108215,1.062862


In [17]:
# Pivot the middle index level (level 1, named "second") into columns.
df.unstack(level="second")

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,1.858637,2.701723
bar,B,0.924552,-1.541327
baz,A,0.333957,-0.763916
baz,B,-0.685638,0.736411
foo,A,-1.631907,-0.129327
foo,B,-1.07411,0.108215
qux,A,-0.304164,-0.635694
qux,B,0.51652,1.062862


# Pivot
![pivot](./pic/reshaping_pivot.png)

In [18]:
# Ten month-end dates starting January 2000. The offset object is used rather
# than the "M" string alias because newer pandas releases deprecate "M" in
# favour of "ME", which older releases do not accept. MonthEnd() yields the
# same dates on every version.
month_ends = list(
    pd.date_range("1/1/2000", periods=10, freq=pd.offsets.MonthEnd())
)

# Long-format records: ten dated rows for variable "A" followed by ten for
# "B", both covering the same dates. value/category are unseeded random draws.
data = {
    "value": np.random.randn(20),
    "variable": ["A"] * 10 + ["B"] * 10,
    "category": np.random.choice(["type1", "type2", "type3", "type4"], 20),
    "date": month_ends * 2,
}
df = pd.DataFrame(data, columns=["date", "variable", "category", "value"])
df.sample(5)

Unnamed: 0,date,variable,category,value
13,2000-04-30,B,type3,-1.836811
6,2000-07-31,A,type2,0.212516
4,2000-05-31,A,type3,1.631146
8,2000-09-30,A,type3,1.400936
12,2000-03-31,B,type1,-0.190758


In [23]:
# Long -> wide: one row per date and one column per variable ("A", "B").
# Then shift every value down one row, so each date shows the previous
# month's value and the first row becomes NaN.
df2 = (
    df.pivot(index="date", columns="variable", values="value")
    .shift(periods=1)
)
df2

variable,A,B
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-31,,
2000-02-29,-0.988989,0.913258
2000-03-31,0.645503,0.676891
2000-04-30,0.688413,-0.190758
2000-05-31,1.115083,-1.836811
2000-06-30,1.631146,-0.299397
2000-07-31,-0.170198,-1.130065
2000-08-31,0.212516,-1.016637
2000-09-30,-0.914829,1.280708
2000-10-31,1.400936,0.252451


In [24]:
# Wide -> long again: unstack() turns the frame into a Series keyed by
# (variable, date). reset_index() then makes both levels ordinary columns
# and leaves the values in a column labelled 0.
(
    df2
    .unstack(level=-1)
    .reset_index()
)

Unnamed: 0,variable,date,0
0,A,2000-01-31,
1,A,2000-02-29,-0.988989
2,A,2000-03-31,0.645503
3,A,2000-04-30,0.688413
4,A,2000-05-31,1.115083
5,A,2000-06-30,1.631146
6,A,2000-07-31,-0.170198
7,A,2000-08-31,0.212516
8,A,2000-09-30,-0.914829
9,A,2000-10-31,1.400936
