 # Data Wrangling

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Nine uniform draws (rounded to 2 decimals) labelled by a two-level index:
# an outer letter level and an inner integer level.
ser1 = pd.Series(
  np.random.uniform(size=9).round(2),
  index=[
    list("aaabbccdd"),
    [1, 2, 3, 2, 3, 1, 2, 1, 3],
  ],
)
ser1

a  1    0.35
   2    0.17
   3    0.97
b  2    0.34
   3    0.10
c  1    0.22
   2    0.86
d  1    0.21
   3    0.53
dtype: float64

In [3]:
ser1.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 1),
            ('d', 3)],
           )

 ### Selecting data in a MultiIndex Series

In [4]:
ser1.loc["a"]

1    0.35
2    0.17
3    0.97
dtype: float64

In [5]:
ser1.loc[["a","d"]]

a  1    0.35
   2    0.17
   3    0.97
d  1    0.21
   3    0.53
dtype: float64

In [6]:
# Two equivalent ways to read the scalar at outer label "a", inner label 1.
# Only the last expression of the cell is displayed.
ser1.loc["a", 1]
# OR: chained lookup - select the "a" sub-Series first, then label 1 within it
ser1["a"][1]

0.35

In [7]:
ser1.loc["b":"d"]

b  2    0.34
   3    0.10
c  1    0.22
   2    0.86
d  1    0.21
   3    0.53
dtype: float64

In [8]:
# Select only the inner level:
ser1.loc[:, 2]

a    0.17
b    0.34
c    0.86
dtype: float64

 Note: Hierarchical indexing can be used to form a pivot table.

In [9]:
ser1.unstack()

Unnamed: 0,1,2,3
a,0.35,0.17,0.97
b,,0.34,0.1
c,0.22,0.86,
d,0.21,,0.53


In [10]:
# Inverse of unstack is:
ser1.unstack().stack()

a  1    0.35
   2    0.17
   3    0.97
b  2    0.34
   3    0.10
c  1    0.22
   2    0.86
d  1    0.21
   3    0.53
dtype: float64

 Note: Either axis can have MultiIndex or hierarchical index.

In [11]:
df1 = pd.DataFrame(
  np.random.standard_normal(size=(4,3)).round(3),
  index=[list("aabb"),[1,2,1,2]],
  columns=[["Ohio", "Ohio", "Texas"],["blue", "red", "blue"]]
)
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Texas
Unnamed: 0_level_1,Unnamed: 1_level_1,blue,red,blue
a,1,-0.44,0.084,0.325
a,2,2.179,0.097,0.441
b,1,0.216,-1.191,1.309
b,2,1.359,1.178,0.813


In [12]:
# Give names to the levels on both axes. These assignments modify the
# index objects attached to df1 in place, so df1 itself changes here.
df1.index.names = ["key1","key2"]
df1.columns.names = ["state","color"]
df1

Unnamed: 0_level_0,state,Ohio,Ohio,Texas
Unnamed: 0_level_1,color,blue,red,blue
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,-0.44,0.084,0.325
a,2,2.179,0.097,0.441
b,1,0.216,-1.191,1.309
b,2,1.359,1.178,0.813


In [13]:
# check the levels of index or columns
print(f"Index levels: {df1.index.nlevels}")
print(f"Column levels: {df1.columns.nlevels}")

Index levels: 2
Column levels: 2


In [14]:
# Subset of the data:
df1["Ohio"].loc["b"]

color,blue,red
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.216,-1.191
2,1.359,1.178


 A more intuitive way to create MultiIndex:

In [15]:
pd.MultiIndex.from_arrays(
  [["Ohio","Ohio","Texas"],["blue","red","blue"]],
  names=["state","color"]
)

MultiIndex([( 'Ohio', 'blue'),
            ( 'Ohio',  'red'),
            ('Texas', 'blue')],
           names=['state', 'color'])

 Note: Sometimes we need to swap or reorder the levels of a MultiIndex.

In [16]:
# Swap the two row-index levels by position (this result is not displayed,
# only the last expression of the cell is):
df1.swaplevel(0,1)
# OR, when the levels are named, swap them by name:
df1.swaplevel("key1","key2")

Unnamed: 0_level_0,state,Ohio,Ohio,Texas
Unnamed: 0_level_1,color,blue,red,blue
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,-0.44,0.084,0.325
2,a,2.179,0.097,0.441
1,b,0.216,-1.191,1.309
2,b,1.359,1.178,0.813


 Note: `swaplevel` returns a new object with the levels interchanged; the data itself remains unaltered.

 `sort_index` by default sorts the data lexicographically using all index levels, but we can specify a single level or a subset of levels.

In [17]:
# level= Name (str) || Number (int) || List
# Example: level="key2" || level=1
df1.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Texas
Unnamed: 0_level_1,color,blue,red,blue
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,-0.44,0.084,0.325
b,1,0.216,-1.191,1.309
a,2,2.179,0.097,0.441
b,2,1.359,1.178,0.813


In [18]:
df1.swaplevel(0,1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Texas
Unnamed: 0_level_1,color,blue,red,blue
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,-0.44,0.084,0.325
1,b,0.216,-1.191,1.309
2,a,2.179,0.097,0.441
2,b,1.359,1.178,0.813


In [19]:
# Aggregate over one row-index level: sum the rows that share a "key2" value.
df1.groupby(level="key2").sum()
# OR refer to the same level by its position:
df1.groupby(level=1).sum()

state,Ohio,Ohio,Texas
color,blue,red,blue
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,-0.224,-1.107,1.634
2,3.538,1.275,1.254


In [20]:
# Sum the columns that share the same "color" label.
# `groupby(..., axis=1)` is deprecated in recent pandas, so transpose, group
# on what is now a row level, and transpose back; the resulting table is the same.
df1.T.groupby(level="color").sum().T

Unnamed: 0_level_0,color,blue,red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-0.115,0.084
a,2,2.62,0.097
b,1,1.525,-1.191
b,2,2.172,1.178


 ### Create index using a column in a DataFrame

In [21]:
df2 = pd.DataFrame({
  "a": np.random.standard_normal(7).round(2),
  "b": np.random.standard_normal(7).round(2),
  "c": ["one", "one", "one", "two", "two", "three", "three"],
  "d": [1,2,3,1,2,2,3]
})
df2

Unnamed: 0,a,b,c,d
0,1.61,0.33,one,1
1,0.29,0.93,one,2
2,-0.62,-0.55,one,3
3,-0.83,0.9,two,1
4,-0.01,0.3,two,2
5,-0.36,-0.63,three,2
6,1.1,-0.38,three,3


In [22]:
# Setting a MultiIndex using two cols
df3 = df2.set_index(["c","d"])
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,1.61,0.33
one,2,0.29,0.93
one,3,-0.62,-0.55
two,1,-0.83,0.9
two,2,-0.01,0.3
three,2,-0.36,-0.63
three,3,1.1,-0.38


 Note: By default, `set_index` removes the columns it uses for the index from the DataFrame; pass `drop=False` to keep them as regular columns too.

In [23]:
df2.set_index(["c","d"],drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,1,1.61,0.33,one,1
one,2,0.29,0.93,one,2
one,3,-0.62,-0.55,one,3
two,1,-0.83,0.9,two,1
two,2,-0.01,0.3,two,2
three,2,-0.36,-0.63,three,2
three,3,1.1,-0.38,three,3


In [24]:
df3.reset_index()

Unnamed: 0,c,d,a,b
0,one,1,1.61,0.33
1,one,2,0.29,0.93
2,one,3,-0.62,-0.55
3,two,1,-0.83,0.9
4,two,2,-0.01,0.3
5,three,2,-0.36,-0.63
6,three,3,1.1,-0.38


 ## Combining & Merging Datasets

 Here are few ways to combine data:

 - `pd.merge`: Connect rows in DataFrames based on one or more keys, similar to SQL JOIN operation
 - `pd.concat`: Concatenate or "stack" DataFrames along an axis
 - `DataFrame.combine_first` (also available on Series): Splice together overlapping data to fill in missing values in one object with values from another

In [25]:
df4 = pd.DataFrame({
  "key": list("abbcaab"),
  "data": np.random.standard_normal(7).round(2)
})
df4

Unnamed: 0,key,data
0,a,1.03
1,b,-1.35
2,b,-1.92
3,c,0.53
4,a,0.75
5,a,0.89
6,b,-0.55


In [26]:
df5 = pd.DataFrame({
  "key": list("cad"),
  "data": np.random.standard_normal(3).round(2)
})
df5

Unnamed: 0,key,data
0,c,1.06
1,a,-1.35
2,d,-0.92


In [27]:
pd.merge(left=df4,right=df5,on="key")

Unnamed: 0,key,data_x,data_y
0,a,1.03,-1.35
1,a,0.75,-1.35
2,a,0.89,-1.35
3,c,0.53,1.06


 Note: The merge method drops the keys which aren't included in both objects, it defaults to inner join meaning that it is an intersection of both data tables.

 This is an example of a many-to-one join: `df4` has multiple rows for the keys `a` and `b`, whereas `df5` has exactly one row per key.

 Also Note: The column `data` got renamed by adding a suffix.

In [28]:
pd.merge(df4,df5,on="key",how="outer")

Unnamed: 0,key,data_x,data_y
0,a,1.03,-1.35
1,a,0.75,-1.35
2,a,0.89,-1.35
3,b,-1.35,
4,b,-1.92,
5,b,-0.55,
6,c,0.53,1.06
7,d,,-0.92


 |Option|Behavior|
 |---|---|
 |`how="inner"`|only the key combinations found on both tables|
 |`how="left"`|all key combinations found in the left table|
 |`how="right"`|all key combinations found in the right table|
 |`how="outer"`|all key combinations found in either table (the union)|

 Here is a diagram to illustrate how two tables are merged:

 ![pd.merge, how parameter](imgs/types-of-join.png)

 Here is an example of many-to-many joining of tables:

In [29]:
# Left table of the join example: repeated keys plus a nullable-integer payload.
df6 = pd.DataFrame(
  {"key": list("ababca"), "data": np.arange(6)}
).astype({"data": "Int64"})
df6

Unnamed: 0,key,data
0,a,0
1,b,1
2,a,2
3,b,3
4,c,4
5,a,5


In [30]:
# Right table of the join example: its keys repeat as well.
df7 = pd.DataFrame(
  {"key": list("aadbb"), "data": np.arange(5)}
).astype({"data": "Int64"})
df7

Unnamed: 0,key,data
0,a,0
1,a,1
2,d,2
3,b,3
4,b,4


In [31]:
pd.merge(df6,df7,on="key")

Unnamed: 0,key,data_x,data_y
0,a,0,0
1,a,0,1
2,a,2,0
3,a,2,1
4,a,5,0
5,a,5,1
6,b,1,3
7,b,1,4
8,b,3,3
9,b,3,4


 Note: The above is the Cartesian product of the matching keys.

 Since there are three "a" rows in the left table and two in the right, the result is the product, that is six "a" rows.

In [32]:
df8 = pd.DataFrame({
  "key1": ["one", "two", "two", "three", "four"],
  "key2": list("abdcd"),
  "data": np.random.standard_normal(5).round(3)
})
df9 = pd.DataFrame({
  "key1": ["one", "three", "two"],
  "key2": list("acd"),
  "data": np.random.standard_normal(3).round(3)
})

pd.merge(df8,df9,on=["key1","key2"],how="outer")

Unnamed: 0,key1,key2,data_x,data_y
0,one,a,-0.995,-0.477
1,two,b,1.236,
2,two,d,1.316,-1.159
3,three,c,0.146,-1.203
4,four,d,1.048,


 ### Merging on Index

In [33]:
left1 = pd.DataFrame({
  "key":list("abaabc"),
  "data": np.random.standard_normal(6).round(3)
})
left1

Unnamed: 0,key,data
0,a,1.361
1,b,-0.494
2,a,-0.525
3,a,-0.235
4,b,-0.059
5,c,1.587


In [34]:
right1 = pd.DataFrame({
  "data": np.random.standard_normal(2).round(3)
}, index=["a", "c"])
right1

Unnamed: 0,data
a,-0.315
c,-2.376


In [35]:
pd.merge(left1,right1,left_on="key",right_index=True)

Unnamed: 0,key,data_x,data_y
0,a,1.361,-0.315
2,a,-0.525,-0.315
3,a,-0.235,-0.315
5,c,1.587,-2.376


In [36]:
pd.merge(left1,right1,left_on="key",right_index=True,how="outer")

Unnamed: 0,key,data_x,data_y
0,a,1.361,-0.315
2,a,-0.525,-0.315
3,a,-0.235,-0.315
1,b,-0.494,
4,b,-0.059,
5,c,1.587,-2.376


 Note: If we want to use index of a table as the key, then use left_index or right_index parameter accordingly.

 The following is an example of how to merge two tables, one of which has its keys as a MultiIndex:

In [37]:
left2 = pd.DataFrame({
  "state": ["MH", "MH", "UK", "UK", "MH"],
  "year": [2000,2001,2000,2001,2002],
  "data": np.random.standard_normal(5).round(3)
})
left2

Unnamed: 0,state,year,data
0,MH,2000,-1.007
1,MH,2001,0.327
2,UK,2000,0.874
3,UK,2001,-0.325
4,MH,2002,0.945


In [38]:
right_index = pd.MultiIndex.from_arrays([
  ["MH","UK","UK","UK","MH"],
  [2000,2000,2001,2002,2001]
])

In [39]:
# Build both event columns on the shared (state, year) MultiIndex, then sort
# the finished frame once instead of sorting each column separately.
right2 = pd.DataFrame(
  {
    "event1": np.random.standard_normal(5).round(3),
    "event2": np.random.standard_normal(5).round(3),
  },
  index=right_index,
).sort_index(level=0)
right2

Unnamed: 0,Unnamed: 1,event1,event2
MH,2000,-0.013,0.833
MH,2001,0.812,-0.742
UK,2000,-0.841,0.789
UK,2001,-0.188,-0.243
UK,2002,0.295,0.307


In [40]:
pd.merge(left2,right2,how="inner",left_on=["state","year"],right_index=True)

Unnamed: 0,state,year,data,event1,event2
0,MH,2000,-1.007,-0.013,0.833
1,MH,2001,0.327,0.812,-0.742
2,UK,2000,0.874,-0.841,0.789
3,UK,2001,-0.325,-0.188,-0.243


 Merging using two indices as keys is also possible:

In [41]:
def random_numbers(n, scale=8, decimals=3):
  """Return rounded, scaled draws from the standard normal distribution.

  Parameters
  ----------
  n : int or tuple of int
    Output shape, forwarded to ``np.random.standard_normal``.
  scale : float, default 8
    Factor every draw is multiplied by.
  decimals : int, default 3
    Number of decimal places kept after rounding.

  Returns
  -------
  numpy.ndarray
    Array of shape ``n``. Values come from NumPy's global random state, so
    they differ between runs unless that state has been seeded.
  """
  return (np.random.standard_normal(n) * scale).round(decimals)

In [42]:
left3 = pd.DataFrame({
  "NY": random_numbers(4),
  "LA": random_numbers(4)
},index=list("abcd"))
left3

Unnamed: 0,NY,LA
a,4.825,11.356
b,-5.033,-4.005
c,0.983,8.53
d,2.957,3.684


In [43]:
right3 = pd.DataFrame({
  "WA": random_numbers(3),
  "SF": random_numbers(3)
},index=list("abc"))
right3

Unnamed: 0,WA,SF
a,-2.241,-2.206
b,8.139,11.485
c,3.922,12.418


In [44]:
pd.merge(left3,right3,how="outer",right_index=True,left_index=True)

Unnamed: 0,NY,LA,WA,SF
a,4.825,11.356,-2.241,-2.206
b,-5.033,-4.005,8.139,11.485
c,0.983,8.53,3.922,12.418
d,2.957,3.684,,


 DataFrame has a `join` method, to simplify merging by index:

In [45]:
left3.join(right3,how="outer")

Unnamed: 0,NY,LA,WA,SF
a,4.825,11.356,-2.241,-2.206
b,-5.033,-4.005,8.139,11.485
c,0.983,8.53,3.922,12.418
d,2.957,3.684,,


 Note: By default, `join` performs a left join, matching the caller's index (or the column passed via `on`) against the index of the other frame.

In [46]:
left1

Unnamed: 0,key,data
0,a,1.361
1,b,-0.494
2,a,-0.525
3,a,-0.235
4,b,-0.059
5,c,1.587


In [47]:
right1

Unnamed: 0,data
a,-0.315
c,-2.376


In [51]:
# Attach right1 (indexed by key) to left1 through left1's "key" column.
# Both frames have a "data" column, so suffixes keep them apart; the rows
# are then ordered by key and renumbered for display.
(
  left1
  .join(right1, on="key", lsuffix="_x", rsuffix="_y")
  .sort_values("key")
  .reset_index(drop=True)
)

Unnamed: 0,key,data_x,data_y
0,a,1.361,-0.315
1,a,-0.525,-0.315
2,a,-0.235,-0.315
3,b,-0.494,
4,b,-0.059,
5,c,1.587,-2.376


In [52]:
df10 = pd.DataFrame({
  "SE": random_numbers(2),
  "DT": random_numbers(2)
},index=list("bd"))
df10

Unnamed: 0,SE,DT
b,-5.198,11.587
d,-10.399,-8.632


In [58]:
# Join several frames on their index in a single call; the outer join keeps
# every row label that appears in any of them.
cities = left3.join([right3,df10],how="outer")
# Fill the gaps in each column with that column's own mean
# (mean(axis=0) averages down the rows, giving one value per column).
cities.fillna(cities.mean(axis=0)).round(3)

Unnamed: 0,NY,LA,WA,SF,SE,DT
a,4.825,11.356,-2.241,-2.206,-7.798,1.478
b,-5.033,-4.005,8.139,11.485,-5.198,11.587
c,0.983,8.53,3.922,12.418,-7.798,1.478
d,2.957,3.684,3.273,7.232,-10.399,-8.632


In [59]:
# NumPy counterpart of concatenation: a 4x3 array glued to a copy of itself
# side by side (axis=1 joins along the columns), giving a 4x6 array.
d1 = np.arange(12).reshape(4,3)
np.concatenate([d1,d1],axis=1)

array([[ 0,  1,  2,  0,  1,  2],
       [ 3,  4,  5,  3,  4,  5],
       [ 6,  7,  8,  6,  7,  8],
       [ 9, 10, 11,  9, 10, 11]])

In [60]:
# Three small nullable-integer Series with partially overlapping labels
# (s1 and s2 share "b"; s2 and s3 share "d").
s1 = pd.Series([0, 1], index=["a", "b"], dtype="Int64")
s2 = pd.Series([0, 1, 2], index=["b", "c", "d"], dtype="Int64")
s3 = pd.Series([0, 1], index=["d", "e"], dtype="Int64")

In [61]:
pd.concat([s1,s2,s3],axis=0)

a    0
b    1
b    0
c    1
d    2
d    0
e    1
dtype: Int64

In [62]:
pd.concat([s1,s2,s3],axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,0.0,
c,,1.0,
d,,2.0,0.0
e,,,1.0


 ### Hierarchical index on concatenation

In [63]:
h_ser1 = pd.concat([s1,s2,s3],axis=0,keys=["one","two","three"])
h_ser1

one    a    0
       b    1
two    b    0
       c    1
       d    2
three  d    0
       e    1
dtype: Int64

In [64]:
h_ser1.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,,,
two,,0.0,1.0,2.0,
three,,,,0.0,1.0


 Note: When concatenating along `axis="columns"`, the keys become the column headers.

In [65]:
pd.concat([s1,s2,s3],axis=1,keys=["one","two","three"])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,0.0,
c,,1.0,
d,,2.0,0.0
e,,,1.0


 We can also concatenate DataFrames:

In [66]:
df11 = pd.DataFrame({
  "one": random_numbers(3),
  "two": random_numbers(3)
},index=list("abc"))

df12 = pd.DataFrame({
  "three": random_numbers(2),
  "four": random_numbers(2)
},index=list("ac"))

pd.concat([df11,df12],axis=1,join="outer",keys=["lvl1","lvl2"])

Unnamed: 0_level_0,lvl1,lvl1,lvl2,lvl2
Unnamed: 0_level_1,one,two,three,four
a,-10.627,10.037,1.58,-2.431
b,-4.252,-7.953,,
c,-1.065,14.012,-4.025,-0.762


 Note: Here the `keys` arg is used to create a hierarchical column.

 We can even pass in a dict in the concat method, the dict keys will be used for keys option.

In [67]:
pd.concat({
  "lvl1": df11,
  "lvl2": df12
},axis=1)

Unnamed: 0_level_0,lvl1,lvl1,lvl2,lvl2
Unnamed: 0_level_1,one,two,three,four
a,-10.627,10.037,1.58,-2.431
b,-4.252,-7.953,,
c,-1.065,14.012,-4.025,-0.762


In [68]:
df13 = pd.DataFrame(random_numbers((3,4)),columns=list("abcd"))
df14 = pd.DataFrame(random_numbers((2,3)),columns=list("bca"))

pd.concat([df13,df14],axis=0,ignore_index=True)

Unnamed: 0,a,b,c,d
0,7.346,-5.713,12.393,-3.027
1,1.203,2.441,10.009,0.565
2,-11.194,-7.053,-6.433,-6.561
3,-0.638,5.082,-12.197,
4,12.297,-11.342,10.323,


 ### Combine Data with Overlap

In [70]:
# Two float Series over the same six labels, stored in different orders and
# with NaNs at different labels.
ser2 = pd.Series(
  {"c": np.nan, "a": 2.2, "b": 1.5, "d": 8.3, "f": 4.8, "e": np.nan}
)

ser3 = pd.Series(
  {"a": 6.8, "b": np.nan, "c": 7.6, "d": np.nan, "e": 4.1, "f": 3.9}
)

 For each position in `ser2`: if the value is `NaN`, take the value at the same position from `ser3`; otherwise keep the value from `ser2`:

In [71]:
# Position-by-position fallback: where ser2 is NaN take ser3's value,
# otherwise keep ser2's. np.where works on the raw values and ignores the
# index labels. combine_first applies the same rule, but only after
# aligning the two Series on their labels.
np.where(pd.isna(ser2),ser3,ser2)

array([6.8, 2.2, 1.5, 8.3, 4.8, 3.9])

In [72]:
ser2.combine_first(ser3)

a    2.2
b    1.5
c    7.6
d    8.3
e    4.1
f    4.8
dtype: float64

 Note: There is a difference in the values, because `combine_first` method first aligns both the Series by index.

In [73]:
df15 = pd.DataFrame({
  "a": [1.2,np.nan,0.7],
  "b": [np.nan,8.1,6.9],
  "c": [2.2,4.1,2.7]
})

df16 = pd.DataFrame({
  "a": [np.nan,1.8,0.4,3.2],
  "b": [1.7,5.4,np.nan,9.1]
})

In [76]:
df15.combine_first(df16)

Unnamed: 0,a,b,c
0,1.2,1.7,2.2
1,1.8,8.1,4.1
2,0.7,6.9,2.7
3,3.2,9.1,


 Note: Unlike the `np.where`, `combine_first` doesn't require the lengths of the two objects to be equal.

 The output of `combine_first` with DataFrame objects will have the union of all the column names.

 ## Reshaping & Pivoting

 The two primary methods for pivoting data in pandas are:
 * `stack`: rotates or pivots from columns into rows
 * `unstack`: pivots from rows into columns

In [77]:
df17 = pd.DataFrame(
  np.random.standard_normal((2,3)).round(2),
  index=pd.Index(["M&M","ITC"],name="stock"),
  columns=pd.Index(["one","two","three"],name="day")
)
df17

day,one,two,three
stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M&M,1.13,0.49,-1.21
ITC,0.54,0.52,0.69


In [78]:
# stack() moves the column labels ("day") into an inner row-index level,
# producing a Series with a two-level (stock, day) MultiIndex.
res = df17.stack()
res

stock  day  
M&M    one      1.13
       two      0.49
       three   -1.21
ITC    one      0.54
       two      0.52
       three    0.69
dtype: float64

In [79]:
res.unstack()

day,one,two,three
stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M&M,1.13,0.49,-1.21
ITC,0.54,0.52,0.69


 By default the innermost index level is unstacked; the same applies to `stack`.
 We can `unstack` different level like this:

In [80]:
res.unstack(level=0)

stock,M&M,ITC
day,Unnamed: 1_level_1,Unnamed: 2_level_1
one,1.13,0.54
two,0.49,0.52
three,-1.21,0.69


In [81]:
s4 = pd.Series(np.arange(4),index=list("abcd"),dtype="Int64")
s5 = pd.Series(np.arange(3,6),index=list("cde"),dtype="Int64")

s6 = pd.concat([s4,s5],keys=["one","two"])
s6

one  a    0
     b    1
     c    2
     d    3
two  c    3
     d    4
     e    5
dtype: Int64

In [82]:
s6.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2,3,
two,,,3,4,5.0


In [83]:
s6.unstack().stack(dropna=False)

one  a       0
     b       1
     c       2
     d       3
     e    <NA>
two  a    <NA>
     b    <NA>
     c       3
     d       4
     e       5
dtype: Int64

In [84]:
res

stock  day  
M&M    one      1.13
       two      0.49
       three   -1.21
ITC    one      0.54
       two      0.52
       three    0.69
dtype: float64

In [85]:
df19 = pd.DataFrame({
  "left": res,
  "right": res * 2
})
df19.columns.name = "side"
df19

Unnamed: 0_level_0,side,left,right
stock,day,Unnamed: 2_level_1,Unnamed: 3_level_1
M&M,one,1.13,2.26
M&M,two,0.49,0.98
M&M,three,-1.21,-2.42
ITC,one,0.54,1.08
ITC,two,0.52,1.04
ITC,three,0.69,1.38


In [86]:
df20 = df19.unstack(level="stock")
df20

side,left,left,right,right
stock,M&M,ITC,M&M,ITC
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1.13,0.54,2.26,1.08
two,0.49,0.52,0.98,1.04
three,-1.21,0.69,-2.42,1.38


In [87]:
df20.stack(level="side")

Unnamed: 0_level_0,stock,ITC,M&M
day,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,0.54,1.13
one,right,1.08,2.26
two,left,0.52,0.49
two,right,1.04,0.98
three,left,0.69,-1.21
three,right,1.38,-2.42


 ### Pivoting "Long" to "Wide" Format

In [88]:
mcdata = pd.read_csv("examples/macrodata.csv")
mcdata = mcdata.loc[:,["year","quarter","realgdp","infl","unemp"]]
mcdata.head()

Unnamed: 0,year,quarter,realgdp,infl,unemp
0,1959.0,1.0,2710.349,0.0,5.8
1,1959.0,2.0,2778.801,2.34,5.1
2,1959.0,3.0,2775.488,2.74,5.3
3,1959.0,4.0,2785.204,0.27,5.6
4,1960.0,1.0,2847.699,2.31,5.2


In [89]:
# Build a quarterly PeriodIndex from the year and quarter columns.
# The year=/quarter= keyword form of the PeriodIndex constructor is
# deprecated in recent pandas, so assemble "1959Q1"-style labels and let
# pandas parse them; this works on old and new versions alike.
# `pop` removes each column from mcdata while returning it.
years = mcdata.pop("year").astype(int).astype(str)
quarters = mcdata.pop("quarter").astype(int).astype(str)
periods = pd.PeriodIndex(years + "Q" + quarters, freq="Q-DEC", name="date")
periods[:5]

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1'], dtype='period[Q-DEC]', name='date')

 Note: Used `pop` method to remove the column data at the same time.

In [90]:
mcdata.index = periods.to_timestamp("D")
mcdata.columns.name = "eco_metrics"
mcdata.head()

eco_metrics,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,2710.349,0.0,5.8
1959-04-01,2778.801,2.34,5.1
1959-07-01,2775.488,2.74,5.3
1959-10-01,2785.204,0.27,5.6
1960-01-01,2847.699,2.31,5.2


In [91]:
mcdata.stack()

date        eco_metrics
1959-01-01  realgdp         2710.349
            infl               0.000
            unemp              5.800
1959-04-01  realgdp         2778.801
            infl               2.340
                             ...    
2009-04-01  infl               3.370
            unemp              9.200
2009-07-01  realgdp        12990.341
            infl               3.560
            unemp              9.600
Length: 609, dtype: float64

In [92]:
# Name the stacked values "value" before resetting the index, so the
# value column comes out with that name directly.
long_data = mcdata.stack().rename("value").reset_index()
long_data.head()

Unnamed: 0,date,eco_metrics,value
0,1959-01-01,realgdp,2710.349
1,1959-01-01,infl,0.0
2,1959-01-01,unemp,5.8
3,1959-04-01,realgdp,2778.801
4,1959-04-01,infl,2.34


 Note: In the long format, each row represents a single observation.

In [93]:
long_data.pivot(index="date", columns="eco_metrics", values="value").head()
# Equivalent long-hand. The "value" column is selected before unstacking;
# unstacking the whole frame would nest the metric labels under "value".
# long_data.set_index(["date","eco_metrics"])["value"].unstack().head()

eco_metrics,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,0.0,2710.349,5.8
1959-04-01,2.34,2778.801,5.1
1959-07-01,2.74,2775.488,5.3
1959-10-01,0.27,2785.204,5.6
1960-01-01,2.31,2847.699,5.2


In [94]:
long_data["noise"] = np.random.standard_normal(len(long_data)).round(3)
long_data.head()

Unnamed: 0,date,eco_metrics,value,noise
0,1959-01-01,realgdp,2710.349,0.448
1,1959-01-01,infl,0.0,-1.263
2,1959-01-01,unemp,5.8,0.287
3,1959-04-01,realgdp,2778.801,1.096
4,1959-04-01,infl,2.34,1.594


In [95]:
long_data.pivot(index="date",columns="eco_metrics").head()

Unnamed: 0_level_0,value,value,value,noise,noise,noise
eco_metrics,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-01-01,0.0,2710.349,5.8,-1.263,0.448,0.287
1959-04-01,2.34,2778.801,5.1,1.594,1.096,0.365
1959-07-01,2.74,2775.488,5.3,0.296,-1.048,0.895
1959-10-01,0.27,2785.204,5.6,1.264,-0.003,0.246
1960-01-01,2.31,2847.699,5.2,-1.439,0.009,-0.039


 ### Pivoting "Wide" to "Long" Format

In [104]:
# A small wide table; the named "key" index is turned into an ordinary
# column straight away so it can act as the identifier column for melt.
df18 = pd.DataFrame(
  np.arange(9).reshape((3, 3)),
  columns=list("ABC"),
  index=pd.Index(["foo", "bar", "baz"], name="key"),
).reset_index()
df18

Unnamed: 0,key,A,B,C
0,foo,0,1,2
1,bar,3,4,5
2,baz,6,7,8


In [105]:
melted = pd.melt(df18,id_vars="key")
melted

Unnamed: 0,key,variable,value
0,foo,A,0
1,bar,A,3
2,baz,A,6
3,foo,B,1
4,bar,B,4
5,baz,B,7
6,foo,C,2
7,bar,C,5
8,baz,C,8


In [106]:
melted.pivot(index="key",columns="variable",values="value")

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,3,4,5
baz,6,7,8
foo,0,1,2


 Specify a subset of columns to use as value columns:

In [107]:
pd.melt(df18,id_vars="key",value_vars=["A","C"])

Unnamed: 0,key,variable,value
0,foo,A,0
1,bar,A,3
2,baz,A,6
3,foo,C,2
4,bar,C,5
5,baz,C,8


 `melt` can also be used without any identifier columns:

In [108]:
pd.melt(df18,value_vars=["key","B"])

Unnamed: 0,variable,value
0,key,foo
1,key,bar
2,key,baz
3,B,1
4,B,4
5,B,7


In [109]:
pd.melt(df18)

Unnamed: 0,variable,value
0,key,foo
1,key,bar
2,key,baz
3,A,0
4,A,3
5,A,6
6,B,1
7,B,4
8,B,7
9,C,2
