# stack과 melt로 넓은 데이터를 긴 포맷으로 리셰이핑
---
- 변수명 뒤쪽에 기간(월 또는 연도)을 나타내는 부분이 붙어 있을 수도 있고
- 비슷한 이름의 변수들이 여러개 있는 경우도 있음
---
- stack이나 melt 중 어느 것을 사용해도 되지만, melt가 좀 더 유연함
- stack을 사용하면 열 이름이 모두 인덱스로 옮겨짐
- melt를 사용하면 인덱스가 아닌 id 변수를 기준으로 열 이름과 값을 회전할 수 있음 → id_vars 매개변수를 사용, 어떤 변수를 녹일지는 value_vars 매개변수로 지정
---
### NLS 데이터 사용
- 근무 주 수 데이터의 열 이름에서 연도를 추출 -> 넓은 데이터를 긴 데이터로 변환해보자.

In [1]:
import pandas as pd
# Load the NLS data from the CSV file at data/nls97f.csv (path is relative
# to the working directory).
nls97 = pd.read_csv('data/nls97f.csv')

In [2]:
# Use originalid as the row index; rebinding the name replaces the
# in-place variant while leaving nls97 indexed the same way.
nls97 = nls97.set_index('originalid')

In [3]:
# Weeks-worked columns to inspect: one per two-digit suffix, 00 through 04.
weeksworkedcols = [f'weeksworked{suffix:02d}' for suffix in range(5)]

In [4]:
# Preview the first two rows of the selected weeks-worked columns.
nls97[weeksworkedcols].head(2)

Unnamed: 0_level_0,weeksworked00,weeksworked01,weeksworked02,weeksworked03,weeksworked04
originalid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8245,46.0,52.0,52.0,48.0,52.0
3962,5.0,49.0,52.0,52.0,52.0


In [5]:
# (rows, columns) of the full wide-format frame.
nls97.shape

(8984, 89)

### stack을 사용해, 넓은 데이터를 긴 데이터로 변환

In [6]:
# Wide -> long with stack(): the column names move into an inner index level.
# dropna=False keeps rows whose value is missing.
# After reset_index() the unnamed level becomes a column called "level_1"
# and the stacked values a column called 0, so both get descriptive names.
weeksworked = (
    nls97[weeksworkedcols]
    .stack(dropna=False)
    .reset_index()
    .rename(columns={'level_1': 'year', 0: 'weeksworked'})
)

In [7]:
# Show the first ten rows of the stacked result; `year` still holds the
# original column names at this point.
weeksworked.head(10)

Unnamed: 0,originalid,year,weeksworked
0,8245,weeksworked00,46.0
1,8245,weeksworked01,52.0
2,8245,weeksworked02,52.0
3,8245,weeksworked03,48.0
4,8245,weeksworked04,52.0
5,3962,weeksworked00,5.0
6,3962,weeksworked01,49.0
7,3962,weeksworked02,52.0
8,3962,weeksworked03,52.0
9,3962,weeksworked04,52.0


### year 값 수정
- 열 이름의 마지막 두 자리(연도 부분)를 정수로 변환한 뒤 2000을 더하기

In [8]:
# `year` holds the source column names; their last two characters are the
# year suffix. Convert that suffix to int and add 2000 for a 4-digit year.
weeksworked['year'] = weeksworked['year'].str.slice(start=-2).astype(int).add(2000)

In [9]:
# Show the first ten rows again to check the converted `year` values.
weeksworked.head(10)

Unnamed: 0,originalid,year,weeksworked
0,8245,2000,46.0
1,8245,2001,52.0
2,8245,2002,52.0
3,8245,2003,48.0
4,8245,2004,52.0
5,3962,2000,5.0
6,3962,2001,49.0
7,3962,2002,52.0
8,3962,2003,52.0
9,3962,2004,52.0


In [10]:
# (rows, columns) of the long-format frame.
weeksworked.shape

(44920, 3)

### melt(회전)를 사용해 넓은 데이터를 길게 변환

In [11]:
# Wide -> long with melt(): originalid must be a regular column to serve as
# the id variable, so it is moved out of the index first. Only the id column
# and the weeks-worked columns are passed to melt.
weeksworked = pd.melt(
    nls97.reset_index()[['originalid'] + weeksworkedcols],
    id_vars=['originalid'],
    value_vars=weeksworkedcols,
    var_name='year',
    value_name='weeksworked',
)

In [12]:
# Turn the melted column names into 4-digit years: take the trailing two
# characters, convert them to int, then add 2000.
weeksworked['year'] = weeksworked['year'].str.slice(start=-2).astype(int).add(2000)

In [13]:
# Index the long frame by originalid so rows can be selected by id with .loc.
weeksworked = weeksworked.set_index('originalid')

In [14]:
# Select all long-format rows for ids 8245 and 3962.
weeksworked.loc[[8245, 3962]]

Unnamed: 0_level_0,year,weeksworked
originalid,Unnamed: 1_level_1,Unnamed: 2_level_1
8245,2000,46.0
8245,2001,52.0
8245,2002,52.0
8245,2003,48.0
8245,2004,52.0
3962,2000,5.0
3962,2001,49.0
3962,2002,52.0
3962,2003,52.0
3962,2004,52.0


### 대학 등록 열을 녹여서(melt) 리셰이핑

In [15]:
# College-enrollment columns to melt: one per two-digit suffix, 00 through 04.
colenrcols = [f'colenroct{suffix:02d}' for suffix in range(5)]

In [16]:
# Melt the enrollment columns the same way as the weeks-worked columns:
# originalid becomes a regular column and acts as the id variable, the
# former column names land in `year`, and the values in `colenr`.
colenr = pd.melt(
    nls97.reset_index()[['originalid'] + colenrcols],
    id_vars=['originalid'],
    value_vars=colenrcols,
    var_name='year',
    value_name='colenr',
)

In [17]:
# Convert the trailing two characters of each melted column name to int
# and add 2000 to get a 4-digit year.
colenr['year'] = colenr['year'].str.slice(start=-2).astype(int).add(2000)

In [18]:
# Index the enrollment frame by originalid so rows can be selected with .loc.
colenr = colenr.set_index(['originalid'])

In [19]:
# Select all enrollment rows for ids 8542 and 3962.
# NOTE(review): the other lookups in this notebook use 8245, not 8542 —
# this may be a digit transposition; confirm which id is intended.
colenr.loc[[8542, 3962]]

Unnamed: 0_level_0,year,colenr
originalid,Unnamed: 1_level_1,Unnamed: 2_level_1
8542,2000,1. Not enrolled
8542,2001,1. Not enrolled
8542,2002,1. Not enrolled
8542,2003,1. Not enrolled
8542,2004,1. Not enrolled
3962,2000,1. Not enrolled
3962,2001,1. Not enrolled
3962,2002,1. Not enrolled
3962,2003,1. Not enrolled
3962,2004,1. Not enrolled


### 근무 주 수와 대학 등록 데이터 병합

In [20]:
# Inner-join the two long frames on (originalid, year). At this point
# originalid is the index of both frames and year is a column, so the join
# keys mix an index level with a column.
workschool = weeksworked.merge(colenr, how='inner', on=['originalid', 'year'])

In [21]:
# (rows, columns) of the merged frame.
workschool.shape

(44920, 3)

In [22]:
# Select the merged rows for ids 8245 and 3962 to check the join.
workschool.loc[[8245, 3962]]

Unnamed: 0_level_0,year,weeksworked,colenr
originalid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8245,2000,46.0,1. Not enrolled
8245,2001,52.0,1. Not enrolled
8245,2002,52.0,1. Not enrolled
8245,2003,48.0,1. Not enrolled
8245,2004,52.0,1. Not enrolled
3962,2000,5.0,1. Not enrolled
3962,2001,49.0,1. Not enrolled
3962,2002,52.0,1. Not enrolled
3962,2003,52.0,1. Not enrolled
3962,2004,52.0,1. Not enrolled
