In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from pandas.testing import assert_frame_equal
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
# Directory that holds the CSV files read below, resolved against the current working directory.
p1 = Path.cwd().joinpath('back_data')

In [2]:
# Tidy data: each variable forms a column, each observation forms a row,
# and each observational unit forms its own table.
# Tidying data means reshaping or restructuring it so that it follows those principles.
# Useful tools: stack(), melt(), unstack(), pivot(), the .str accessor,
# rename(), rename_axis(), reset_index(), set_index(), ...
state_fruit = pd.read_csv(p1.joinpath('state_fruit.csv'), index_col=0)
state_fruit

Unnamed: 0,Apple,Orange,Banana
Texas,12,10,40
Arizona,9,7,12
Florida,0,14,190


In [3]:
(
    state_fruit
    # stack() moves the column labels into an inner index level so that each
    # variable ends up in one column. Every column that must NOT be reshaped has
    # to sit in the index beforehand (done at load time via index_col=0).
    .stack()
    # Give the two index levels meaningful names.
    .rename_axis(['state', 'fruit'])
    # Turn the index levels back into columns; `name` labels the values column.
    .reset_index(name='weight')
)

Unnamed: 0,state,fruit,weight
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


In [4]:
# melt() performs the same kind of reshaping as stack(), with more flexibility.
state_fruit2 = pd.read_csv(p1.joinpath('state_fruit2.csv'))
state_fruit2

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [5]:
# melt() parameters used here:
#   id_vars    - column(s) kept as identifier variables
#   var_name   - name of the column that receives the former column labels
#   value_name - name of the column that receives the table values
# The melted result gets a fresh RangeIndex.
pd.melt(
    state_fruit2,
    id_vars='State',
    var_name='Fruit',
    value_name='Weight',
)

Unnamed: 0,State,Fruit,Weight
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [6]:
# Input for the pd.wide_to_long() demonstration: the movie title plus the three
# actor-name columns and their facebook-like counts.
movie = pd.read_csv(p1.joinpath('movie.csv'))
actor = movie[[
    'movie_title',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
]]
actor.head()

Unnamed: 0,movie_title,actor_1_name,actor_2_name,actor_3_name,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [7]:
# Helper that rewrites column labels so the number ends up as a trailing '_<n>' suffix.
def change_col_name(col_name):
    """Normalise a column label so that its number becomes a trailing suffix.

    * Every ``'_name'`` substring is removed
      (``'actor_1_name'`` -> ``'actor_1'``).
    * In a ``'<stem>_<digits>_facebook...'`` label the digits are moved to the end
      (``'actor_1_facebook_likes'`` -> ``'actor_facebook_likes_1'``).

    Labels that have no purely numeric token directly before ``'_facebook'``
    (for example ``'director_facebook_likes'``) are returned unchanged apart
    from the ``'_name'`` removal.

    Parameters
    ----------
    col_name : str
        Original column label.

    Returns
    -------
    str
        The rewritten label.
    """
    col_name = col_name.replace('_name', '')
    head, marker, tail = col_name.partition('_facebook')
    if marker:
        stem, sep, number = head.rpartition('_')
        # Only reorder when a numeric token sits right before '_facebook'.
        # Slicing at a fixed offset would scramble labels whose stem is not
        # exactly five characters long.
        if sep and number.isdigit():
            col_name = stem + marker + tail + '_' + number
    return col_name
# Apply the renaming helper to every column label.
actor2 = actor.rename(change_col_name, axis='columns')
actor2.head()

Unnamed: 0,movie_title,actor_1,actor_2,actor_3,actor_facebook_likes_1,actor_facebook_likes_2,actor_facebook_likes_3
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [8]:
# pd.wide_to_long() parameters:
#   stubnames - column-name prefixes that become the long-format columns
#   i         - column(s) used as the identifying index
#   j         - name of the new index level holding whatever follows `sep`
#   sep       - separator between the stub and the suffix
#   suffix    - optional regex the suffix must match (default r'\d+')
# Every column that starts with one of the stub names is stacked into a single column.
stubnames = ['actor', 'actor_facebook_likes']
pd.wide_to_long(
    actor2,
    stubnames=stubnames,
    i=['movie_title'],
    j='actor_num',
    sep='_',
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,actor,actor_facebook_likes
movie_title,actor_num,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,1,CCH Pounder,1000.0
Pirates of the Caribbean: At World's End,1,Johnny Depp,40000.0
Spectre,1,Christoph Waltz,11000.0
The Dark Knight Rises,1,Tom Hardy,27000.0
Star Wars: Episode VII - The Force Awakens,1,Doug Walker,131.0


In [9]:
# Second pd.wide_to_long() exercise, this time relying on the `suffix` parameter.
stackme = pd.read_csv(p1.joinpath('stackme.csv'))
stackme2 = stackme.rename(
    columns={
        'a1': 'group1_a1',
        'b2': 'group1_b2',
        'd': 'group2_a1',
        'e': 'group2_b2',
    }
)
stackme2

Unnamed: 0,State,Country,group1_a1,group1_b2,Test,group2_a1,group2_b2
0,TX,US,0.45,0.3,Test1,2,6
1,MA,US,0.03,1.2,Test2,9,7
2,ON,CAN,0.7,4.2,Test3,4,2


In [10]:
pd.wide_to_long(
    stackme2,
    stubnames=['group1', 'group2'],
    # Several identifier columns are passed to `i` as a list.
    i=['State', 'Country', 'Test'],
    j='Group_num',
    sep='_',
    # The suffixes here are one letter followed by digits ("a1", "b2"),
    # so the default numeric-only pattern is overridden.
    suffix=r'[a-zA-Z]\d+',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,group1,group2
State,Country,Test,Group_num,Unnamed: 4_level_1,Unnamed: 5_level_1
TX,US,Test1,a1,0.45,2
TX,US,Test1,b2,0.3,6
MA,US,Test2,a1,0.03,9
MA,US,Test2,b2,1.2,7
ON,CAN,Test3,a1,0.7,4
ON,CAN,Test3,b2,4.2,2


In [11]:
# stack() and melt() can be undone with unstack() and pivot() respectively.
# melt() and pivot() additionally let you choose which columns get reshaped.
# read_csv(usecols=...) accepts a callable that decides which columns to load.
def usecol_func(name):
    """Return True for 'INSTNM' and for any column name containing 'UGDS_'."""
    return name == 'INSTNM' or 'UGDS_' in name
# An equivalent lambda could be passed to `usecols` instead of the named function.
college = pd.read_csv(
    p1.joinpath('college.csv'),
    usecols=usecol_func,
    index_col='INSTNM',
)
college.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [12]:
# stack() drops missing values by default; dropna=False keeps them while stacking.
# NOTE(review): newer pandas releases tie `dropna` to the legacy stack implementation
# (see the `future_stack` parameter) — confirm against the pandas version this notebook targets.
college_stacked = college.stack(dropna=False)
# unstack() reverses the stack and restores the wide layout.
college_stacked.unstack().head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [13]:
# For the melt()/pivot() round trip, load without an index column (default RangeIndex).
college2 = pd.read_csv(p1.joinpath('college.csv'), usecols=usecol_func)
college2.head()

Unnamed: 0,INSTNM,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
0,Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
1,University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
2,Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
3,University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
4,Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [14]:
college_melted = pd.melt(
    college2,
    id_vars='INSTNM',
    var_name='RACE',
    value_name='PERCENTAGE',
)
# Undoing melt() with pivot() moves the institution names into the index and
# changes the row/column order; reindex() puts both back.
(
    college_melted
    .pivot(index='INSTNM', columns='RACE', values='PERCENTAGE')
    # Restore the original row order.
    .reindex(college2['INSTNM'])
    # Restore the original column order. 'INSTNM' now lives in the index,
    # so the labels are taken from the second column onwards.
    .reindex(columns=college2.columns[1:])
    .head()
)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [15]:
# groupby places the grouping columns in the index by default;
# unstack() can be used afterwards to tidy the result.
employee = pd.read_csv(p1.joinpath('employee.csv'))
employee.head()

Unnamed: 0,UNIQUE_ID,POSITION_TITLE,DEPARTMENT,BASE_SALARY,RACE,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE
0,0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Hispanic/Latino,Full Time,Female,Active,2006-06-12,2012-10-13
1,1,LIBRARY ASSISTANT,Library,26125.0,Hispanic/Latino,Full Time,Female,Active,2000-07-19,2010-09-18
2,2,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Full Time,Male,Active,2015-02-03,2015-02-03
3,3,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,White,Full Time,Male,Active,1982-02-08,1991-05-25
4,4,ELECTRICIAN,General Services Department,56347.0,White,Full Time,Male,Active,1989-06-19,1994-10-22


In [16]:
(
    employee
    # Grouping by 'RACE' and 'GENDER' places both columns in the index.
    .groupby(['RACE', 'GENDER'])['BASE_SALARY']
    .mean()
    .astype(int)
    # unstack() takes the index level to pivot into columns; the default is -1
    # (the last level). Level 0 moves the first level instead.
    .unstack(level=0)
    # Transpose the result (same as the .T attribute).
    .transpose()
)

GENDER,Female,Male
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1
American Indian or Alaskan Native,60238,60305
Asian/Pacific Islander,63226,61033
Black or African American,48915,51082
Hispanic/Latino,46503,54782
Others,63785,38771
White,66793,63940


In [17]:
# pivot_table() is a convenient tool for summarising data.
flights = pd.read_csv(p1.joinpath('flights.csv'))
flights.head()

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0
3,1,1,4,AA,DFW,DCA,1555,7.0,126.0,1192,1935,-7.0,0,0
4,1,1,4,WN,LAX,MCI,1720,48.0,166.0,1363,2225,39.0,0,0


In [18]:
# Core pivot_table() arguments: index, columns, values, aggfunc.
# Optional here: fill_value (replacement for empty cells) and margins (adds totals).
fpt = flights.pivot_table(
    index='AIRLINE',
    columns='ORG_AIR',
    values='CANCELLED',
    aggfunc='sum',
    fill_value=0,
    margins=True,
)
fpt

ORG_AIR,ATL,DEN,DFW,IAH,LAS,LAX,MSP,ORD,PHX,SFO,All
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA,3,4,86,3,3,11,3,35,4,2,154
AS,0,0,0,0,0,0,0,0,0,0,0
B6,0,0,0,0,0,0,0,0,0,1,1
DL,28,1,0,0,1,1,4,0,1,2,38
EV,18,6,27,36,0,0,6,53,0,0,146
F9,0,2,1,0,1,1,1,4,0,0,10
HA,0,0,0,0,0,0,0,0,0,0,0
MQ,5,0,62,0,0,0,0,85,0,0,152
NK,1,1,6,0,1,1,3,10,2,0,25
OO,3,25,2,10,0,15,4,41,9,33,142


In [19]:
fpt2 = (
    flights
    .pivot_table(
        index=['AIRLINE', 'MONTH'],
        columns=['ORG_AIR', 'CANCELLED'],
        values=['DEP_DELAY', 'DIST'],
        aggfunc='mean',
        fill_value=0,
    )
    # swaplevel() exchanges two levels of the row or column index;
    # here column levels 0 and 2 trade places.
    .swaplevel(0, 2, axis='columns')
)
fpt2

Unnamed: 0_level_0,CANCELLED,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
Unnamed: 0_level_1,ORG_AIR,ATL,ATL,DEN,DEN,DFW,DFW,IAH,IAH,LAS,LAS,...,LAX,LAX,MSP,MSP,ORD,ORD,PHX,PHX,SFO,SFO
Unnamed: 0_level_2,Unnamed: 1_level_2,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,...,DIST,DIST,DIST,DIST,DIST,DIST,DIST,DIST,DIST,DIST
AIRLINE,MONTH,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
AA,1,-3.250000,0,7.062500,0,11.977591,-3.0,9.750000,0,32.375000,0,...,1678.037037,2475.000000,809.000000,0.0,1068.876033,0.000000,1167.666667,0.0,1860.166667,0.0
AA,2,-3.000000,0,5.461538,0,8.756579,0.0,1.000000,0,-3.055556,0,...,1745.892308,1818.000000,1008.000000,0.0,1193.782178,771.142857,1311.461538,868.0,1337.916667,2586.0
AA,3,-0.166667,0,7.666667,0,15.383784,0.0,10.900000,0,12.074074,0,...,1781.567568,1744.000000,964.733333,0.0,1058.933333,802.000000,1171.363636,0.0,1502.758621,0.0
AA,4,0.071429,0,20.266667,0,10.501493,0.0,6.933333,0,27.241379,0,...,1850.923913,0.000000,648.714286,0.0,1094.633094,943.600000,1266.214286,0.0,1646.903226,0.0
AA,5,5.777778,0,23.466667,0,16.798780,0.0,3.055556,0,2.818182,0,...,1820.478261,0.000000,787.250000,0.0,998.774775,999.500000,1240.444444,0.0,1436.892857,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WN,7,21.700000,0,13.143836,0,0.000000,0.0,0.000000,0,22.439024,0,...,912.453704,327.777778,647.266667,0.0,0.000000,0.000000,799.160256,369.0,636.210526,0.0
WN,8,16.207547,0,7.375000,0,0.000000,0.0,0.000000,0,16.158974,0,...,835.404040,346.000000,508.703704,0.0,0.000000,0.000000,891.569767,0.0,644.857143,392.0
WN,9,8.680672,0,4.378882,0,0.000000,0.0,0.000000,0,7.179487,0,...,830.210000,317.666667,644.416667,0.0,0.000000,0.000000,872.840000,0.0,731.578947,354.5
WN,11,5.932203,0,8.215569,0,0.000000,0.0,0.000000,0,7.522989,0,...,748.404040,459.333333,573.642857,0.0,0.000000,0.000000,823.258741,872.0,580.875000,392.0


In [20]:
# rename_axis() can rename row/column index levels.
# Reload the complete table for that demonstration; this rebinds `college`.
college = pd.read_csv(p1.joinpath('college.csv'))

In [21]:
(
    college
    .groupby(['STABBR', 'RELAFFIL'])[['UGDS', 'SATMTMID']]
    .agg(['size', 'min', 'max'])
    # Name the column levels (pass None to remove a level name).
    .rename_axis(['AGG_COLS', 'AGG_FUNCS'], axis='columns')
    # With named levels, stack()/unstack()/swaplevel() can refer to them by
    # name instead of by position.
    .stack('AGG_FUNCS')
    .swaplevel('AGG_FUNCS', 'STABBR', axis='index')
    # sort_index() takes the level name through the `level` keyword.
    .sort_index(level='STABBR', axis='index')
    # The same call on the column axis sorts the column order.
    .sort_index(level='AGG_COLS', axis='columns')
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,AGG_COLS,SATMTMID,UGDS
AGG_FUNCS,RELAFFIL,STABBR,Unnamed: 3_level_1,Unnamed: 4_level_1
max,0,AK,,12865.0
max,1,AK,503.0,275.0
min,0,AK,,109.0
min,1,AK,503.0,27.0
size,0,AK,7.0,7.0


In [22]:
# When column labels encode several variables at once, the .str accessor
# is needed to pull them apart.
weightlifting = pd.read_csv(p1.joinpath('weightlifting_men.csv'))
weightlifting

Unnamed: 0,Weight Category,M35 35-39,M40 40-44,M45 45-49,M50 50-54,M55 55-59,M60 60-64,M65 65-69,M70 70-74,M75 75-79,M80 80+
0,56,137,130,125,115,102,92,80,67,62,55
1,62,152,145,137,127,112,102,90,75,67,57
2,69,167,160,150,140,125,112,97,82,75,60
3,77,182,172,165,150,135,122,107,90,82,65
4,85,192,182,175,160,142,130,112,95,87,70
5,94,202,192,182,167,150,137,120,100,90,75
6,105,210,200,190,175,157,142,122,102,95,80
7,105+,217,207,197,182,165,150,127,107,100,85


In [23]:
# Start with melt() to get the basic tidy layout, then split the combined
# 'Sex Age' label into a sex column and an age-group column.
(
    weightlifting
    .melt(id_vars='Weight Category', var_name='Sex Age', value_name='Qual Total')
    .assign(
        # Split on the space with expand=True (yields a DataFrame) and take the
        # first piece; it looks like 'M35', so only its first character is kept.
        Sex=lambda frame: frame['Sex Age'].str.split(' ', expand=True).iloc[:, 0].str[0],
        # The second piece is the age range. 'Age Group' is not a valid Python
        # identifier, hence the dictionary unpacking. str.extract() with a
        # regular expression would be an alternative to str.split().
        **{'Age Group': lambda frame: frame['Sex Age'].str.split(' ', expand=True).iloc[:, 1]}
    )
    # Put the columns into the final order.
    .reindex(columns=['Weight Category', 'Sex', 'Age Group', 'Qual Total'])
)

Unnamed: 0,Weight Category,Sex,Age Group,Qual Total
0,56,M,35-39,137
1,62,M,35-39,152
2,69,M,35-39,167
3,77,M,35-39,182
4,85,M,35-39,192
...,...,...,...,...
75,77,M,80+,65
76,85,M,80+,70
77,94,M,80+,75
78,105,M,80+,80


In [26]:
# Load the restaurant inspections, parsing the 'Date' column as datetimes.
inspections = pd.read_csv(
    p1.joinpath('restaurant_inspections.csv'),
    parse_dates=['Date'],
)
inspections.head()

Unnamed: 0,Name,Date,Info,Value
0,E & E Grill House,2017-08-08,Borough,MANHATTAN
1,E & E Grill House,2017-08-08,Cuisine,American
2,E & E Grill House,2017-08-08,Description,Non-food contact surface improperly constructe...
3,E & E Grill House,2017-08-08,Grade,A
4,E & E Grill House,2017-08-08,Score,9.0


In [32]:
# pivot() accepts a list for `index`, which produces a multi-level row index.
(
    inspections
    .pivot(index=['Name', 'Date'], columns='Info', values='Value')
    # Remove the leftover name of the column axis.
    .rename_axis(None, axis='columns')
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Borough,Cuisine,Description,Grade,Score
Name,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3 STAR JUICE CENTER,2017-05-10,BROOKLYN,"Juice, Smoothies, Fruit Salads",Facility not vermin proof. Harborage or condit...,A,12.0
A & L PIZZA RESTAURANT,2017-08-22,BROOKLYN,Pizza,Facility not vermin proof. Harborage or condit...,A,9.0
AKSARAY TURKISH CAFE AND RESTAURANT,2017-07-25,BROOKLYN,Turkish,Plumbing not properly installed or maintained;...,A,13.0
ANTOJITOS DELI FOOD,2017-06-01,BROOKLYN,"Latin (Cuban, Dominican, Puerto Rican, South &...",Live roaches present in facility's food and/or...,A,10.0
BANGIA,2017-06-16,MANHATTAN,Korean,Covered garbage receptacle not provided or ina...,A,9.0


In [33]:
# The 'Geolocation' column packs several values into one string and has to be
# separated into individual columns using the .str accessor.
cities = pd.read_csv(p1.joinpath('texas_cities.csv'))
cities

Unnamed: 0,City,Geolocation
0,Houston,"29.7604° N, 95.3698° W"
1,Dallas,"32.7767° N, 96.7970° W"
2,Austin,"30.2672° N, 97.7431° W"


In [40]:
(
    cities['Geolocation']
    # str.split() accepts a regular expression through `pat`. Both alternatives
    # consume the trailing space ("° " and ", ") so that the direction letters
    # do not keep a leading blank. pat=r'. ' (any character followed by a
    # space) would split these strings the same way.
    .str.split(pat=r'° |, ', expand=True)
    # The pieces come back labelled 0, 1, 2, 3, so dict(enumerate(...)) builds
    # the position -> name mapping for rename().
    .rename(dict(enumerate(['latitude', 'latitude direction', 'longitude', 'longitude direction'])), axis=1)
    .assign(cities=cities['City'])
    # Convert only the coordinate columns to numbers. Naming them explicitly
    # avoids the deprecated errors='ignore' option of pd.to_numeric; a
    # non-numeric coordinate now raises instead of silently staying a string.
    .assign(
        latitude=lambda df: pd.to_numeric(df['latitude']),
        longitude=lambda df: pd.to_numeric(df['longitude']),
    )
)

Unnamed: 0,latitude,latitude direction,longitude,longitude direction,cities
0,29.7604,N,95.3698,W,Houston
1,32.7767,N,96.797,W,Dallas
2,30.2672,N,97.7431,W,Austin


In [43]:
(
    cities['Geolocation']
    # str.extract() returns one column per capture group. The pattern matches
    # the literal degree sign and comma between the groups; a '.' wildcard
    # could stand in for the degree sign.
    .str.extract(r'([0-9.]+)° (N|S), ([0-9.]+)° (E|W)', expand=True)
    # The groups come back labelled 0, 1, 2, 3, so dict(enumerate(...)) builds
    # the position -> name mapping for rename().
    .rename(dict(enumerate(['latitude', 'latitude direction', 'longitude', 'longitude direction'])), axis=1)
    .assign(cities=cities['City'])
    # Convert only the coordinate columns to numbers. Naming them explicitly
    # avoids the deprecated errors='ignore' option of pd.to_numeric.
    .assign(
        latitude=lambda df: pd.to_numeric(df['latitude']),
        longitude=lambda df: pd.to_numeric(df['longitude']),
    )
)

Unnamed: 0,latitude,latitude direction,longitude,longitude direction,cities
0,29.7604,N,95.3698,W,Houston
1,32.7767,N,96.797,W,Dallas
2,30.2672,N,97.7431,W,Austin


In [44]:
# Sensor readings with one column per year.
sensors = pd.read_csv(p1.joinpath('sensors.csv'))
sensors

Unnamed: 0,Group,Property,2012,2013,2014,2015,2016
0,A,Pressure,928,873,814,973,870
1,A,Temperature,1026,1038,1009,1036,1042
2,A,Flow,819,806,861,882,856
3,B,Pressure,817,877,914,806,942
4,B,Temperature,1008,1041,1009,1002,1013
5,B,Flow,887,899,837,824,873


In [48]:
# 'Group' is the only variable that is already placed correctly, so the table
# is reorganised around it.
(
    sensors
    # melt() unrolls the year columns vertically into 'Year' and 'value'.
    .melt(id_vars=['Group', 'Property'], var_name='Year', value_name='value')
    # pivot() spreads the 'Property' values out into separate columns.
    .pivot(index=['Group', 'Year'], columns='Property', values='value')
    # Remove the leftover name of the column axis.
    .rename_axis(None, axis='columns')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Flow,Pressure,Temperature
Group,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,2012,819,928,1026
A,2013,806,873,1038
A,2014,861,814,1009
A,2015,882,973,1036
A,2016,856,870,1042
B,2012,887,817,1008
B,2013,899,877,1041
B,2014,837,914,1009
B,2015,824,806,1002
B,2016,873,942,1013
