In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from pandas.testing import assert_frame_equal
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
# Directory (relative to the current working directory) that holds the CSV files read below.
p1 = Path.cwd() / 'back_data'

In [2]:
# Tidy data: each variable forms a column, each observation forms a row,
# and each observational unit forms its own table.
# Tidying data means reshaping it so that it follows those principles.
# Useful tools: stack(), melt(), unstack(), pivot(), the str accessor,
# rename(), rename_axis(), reset_index(), set_index().
state_fruit = pd.read_csv(p1.joinpath('state_fruit.csv'), index_col=0)
state_fruit

Unnamed: 0,Apple,Orange,Banana
Texas,12,10,40
Arizona,9,7,12
Florida,0,14,190


In [3]:
# Use stack() so that every variable ends up in its own column.
# Columns that must not be reshaped have to be placed in the index beforehand
# (state_fruit was read with index_col=0, so that is already done).
(
    state_fruit
    .stack()
    # Give names to the index levels.
    .rename_axis(['state', 'fruit'])
    # Move the index levels back into columns; `name` labels the value column.
    .reset_index(name='weight')
)

Unnamed: 0,state,fruit,weight
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


In [4]:
# melt() is a more flexible alternative to stack().
state_fruit2 = pd.read_csv(p1.joinpath('state_fruit2.csv'))
state_fruit2

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [5]:
# melt() parameters used here:
#   id_vars    - column(s) kept as identifier keys
#   var_name   - name of the column that receives the former column labels
#   value_name - name of the column that receives the table values
# After melt() the index is replaced by a RangeIndex.
state_fruit2.melt(
    id_vars='State',
    var_name='Fruit',
    value_name='Weight',
)

Unnamed: 0,State,Fruit,Weight
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [6]:
# Reshape data into tidy form with pd.wide_to_long(); start by selecting the actor columns.
movie = pd.read_csv(p1.joinpath('movie.csv'))
actor = movie[[
    'movie_title',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
]]
actor.head()

Unnamed: 0,movie_title,actor_1_name,actor_2_name,actor_3_name,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [7]:
# Helper that rewrites column names so that they end in '_<n>'.
def change_col_name(col_name):
    """Return `col_name` rearranged so that its numeric part comes last.

    Every '_name' substring is removed first. If the result contains
    'facebook', the text between the first five characters and the
    underscore preceding 'facebook' is moved to the end of the name,
    e.g. 'actor_1_facebook_likes' -> 'actor_facebook_likes_1'.
    Names without 'facebook' are returned right after the '_name' removal.
    """
    renamed = col_name.replace('_name', '')
    if 'facebook' not in renamed:
        return renamed
    # Position of the underscore immediately before 'facebook'.
    cut = renamed.find('facebook') - 1
    head = renamed[:5]       # fixed-width prefix ('actor' for the columns used here)
    middle = renamed[5:cut]  # the part relocated to the end, e.g. '_1'
    tail = renamed[cut:]     # '_facebook...' onwards
    return head + tail + middle
# Apply the renaming helper to every column label.
actor2 = actor.rename(change_col_name, axis='columns')
actor2.head()

Unnamed: 0,movie_title,actor_1,actor_2,actor_3,actor_facebook_likes_1,actor_facebook_likes_2,actor_facebook_likes_3
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [11]:
stubnames = ['actor', 'actor_facebook_likes']
# pd.wide_to_long() parameters used here:
#   stubnames - the column-name stems that are kept
#   i         - column(s) fixed as the index
#   j         - name of the column holding what follows `sep`
#   sep       - separator between stem and suffix
#   suffix    - optional; its default is r'\d+'
# All columns that start with a stub name are stacked into a single column.
pd.wide_to_long(
    actor2,
    stubnames=stubnames,
    i=['movie_title'],
    j='actor_num',
    sep='_',
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,actor,actor_facebook_likes
movie_title,actor_num,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,1,CCH Pounder,1000.0
Pirates of the Caribbean: At World's End,1,Johnny Depp,40000.0
Spectre,1,Christoph Waltz,11000.0
The Dark Knight Rises,1,Tom Hardy,27000.0
Star Wars: Episode VII - The Force Awakens,1,Doug Walker,131.0


In [14]:
# Another pd.wide_to_long() exercise, this time making use of the suffix parameter.
stackme = pd.read_csv(p1.joinpath('stackme.csv'))
stackme2 = stackme.rename(columns={
    'a1': 'group1_a1',
    'b2': 'group1_b2',
    'd': 'group2_a1',
    'e': 'group2_b2',
})
stackme2

Unnamed: 0,State,Country,group1_a1,group1_b2,Test,group2_a1,group2_b2
0,TX,US,0.45,0.3,Test1,2,6
1,MA,US,0.03,1.2,Test2,9,7
2,ON,CAN,0.7,4.2,Test3,4,2


In [18]:
pd.wide_to_long(
    stackme2,
    stubnames=['group1', 'group2'],
    # The index columns (i) are passed together as a list.
    i=['State', 'Country', 'Test'],
    j='Group_num',
    sep='_',
    # Suffixes such as 'a1' / 'b2': one letter followed by digits.
    suffix=r'[a-zA-Z]\d+',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,group1,group2
State,Country,Test,Group_num,Unnamed: 4_level_1,Unnamed: 5_level_1
TX,US,Test1,a1,0.45,2
TX,US,Test1,b2,0.3,6
MA,US,Test2,a1,0.03,9
MA,US,Test2,b2,1.2,7
ON,CAN,Test3,a1,0.7,4
ON,CAN,Test3,b2,4.2,2


In [19]:
# stack() and melt() can be reverted with unstack() and pivot() respectively.
# melt() and pivot() offer the flexibility of choosing which columns to reshape.
# The columns to load can be chosen while reading by passing a function to `usecols`.
def usecol_func(name):
    """Return True when `name` is 'INSTNM' or contains 'UGDS_'."""
    if name == 'INSTNM':
        return True
    return 'UGDS_' in name
# A lambda expression could be used in place of the named function.
college = pd.read_csv(
    p1.joinpath('college.csv'),
    usecols=usecol_func,
    index_col='INSTNM',
)
college.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [22]:
# stack() drops missing values by default; passing dropna=False stacks without excluding them.
# NOTE(review): newer pandas releases deprecate `dropna` in favour of `future_stack=True`
# — confirm against the installed pandas version before re-running.
college_stacked = college.stack(dropna=False)
# Revert the reshaping with unstack().
college_stacked.unstack().head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [23]:
# No index column is set (default RangeIndex) so that melt() and pivot() can be used.
college2 = pd.read_csv(p1.joinpath('college.csv'), usecols=usecol_func)
college2.head()

Unnamed: 0,INSTNM,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
0,Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
1,University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
2,Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
3,University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
4,Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [31]:
college_melted = college2.melt(
    id_vars='INSTNM',
    var_name='RACE',
    value_name='PERCENTAGE',
)
# Reverting melt() with pivot() moves the institution names into the index and
# changes the ordering relative to the original; reindex() can restore it.
(
    college_melted
    .pivot(index='INSTNM', columns='RACE', values='PERCENTAGE')
    # Restore the original row order.
    .reindex(college2['INSTNM'])
    # Restore the original column order; 'INSTNM' is now the index,
    # so the labels are taken from the second column onwards.
    .reindex(columns=college2.columns[1:])
    .head()
)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [32]:
# A groupby operation places the grouping columns in the index by default;
# unstack() can be used to tidy the result.
employee = pd.read_csv(p1.joinpath('employee.csv'))
employee.head()

Unnamed: 0,UNIQUE_ID,POSITION_TITLE,DEPARTMENT,BASE_SALARY,RACE,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE
0,0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Hispanic/Latino,Full Time,Female,Active,2006-06-12,2012-10-13
1,1,LIBRARY ASSISTANT,Library,26125.0,Hispanic/Latino,Full Time,Female,Active,2000-07-19,2010-09-18
2,2,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Full Time,Male,Active,2015-02-03,2015-02-03
3,3,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,White,Full Time,Male,Active,1982-02-08,1991-05-25
4,4,ELECTRICIAN,General Services Department,56347.0,White,Full Time,Male,Active,1989-06-19,1994-10-22


In [33]:
# NOTE(review): the saved output under this cell is a Series indexed by RACE only,
# which does not match this chain (it ends in unstack()) — re-run the cell to refresh it.
(
    employee
    # Grouping by 'RACE' and 'GENDER' places both columns in the index.
    .groupby(['RACE', 'GENDER'])['BASE_SALARY']
    .mean()
    .astype(int)
    # Move the second index level ('GENDER') into the columns.
    .unstack('GENDER')
)

RACE
American Indian or Alaskan Native    60272
Asian/Pacific Islander               61660
Black or African American            50137
Hispanic/Latino                      52345
Others                               51278
White                                64419
Name: BASE_SALARY, dtype: int32