##  melt 메서드

* 1개의 열만 고정하고 나머지 열을 행으로 바꾸기

In [85]:
# 리서치 센터(Pew Research Center) : 미국의 소득과 종교
import pandas as pd
pew = pd.read_csv('../data/pew.csv')
print(pew.head())

             religion  <$10k  $10-20k  $20-30k  $30-40k  $40-50k  $50-75k  \
0            Agnostic     27       34       60       81       76      137   
1             Atheist     12       27       37       52       35       70   
2            Buddhist     27       21       30       34       33       58   
3            Catholic    418      617      732      670      638     1116   
4  Don’t know/refused     15       14       15       11       10       35   

   $75-100k  $100-150k  >150k  Don't know/refused  
0       122        109     84                  96  
1        73         59     74                  76  
2        62         39     53                  54  
3       949        792    633                1489  
4        21         17     18                 116  


* melt 메서드 인자
* id_vars : 위치를 그대로 유지할 열의 이름을 지정
* value_vars : 행으로 위치를 변경할 열의 이름을 지정
* var_name : value_vars로 위치를 변경할 열의 이름을 지정
* value_name : var_name으로 위치를 변경할 열의 데이터를 저장할 열의 이름을 지정

In [86]:
pew.iloc[:,0:6]

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k
0,Agnostic,27,34,60,81,76
1,Atheist,12,27,37,52,35
2,Buddhist,27,21,30,34,33
3,Catholic,418,617,732,670,638
4,Don’t know/refused,15,14,15,11,10
5,Evangelical Prot,575,869,1064,982,881
6,Hindu,1,9,7,9,11
7,Historically Black Prot,228,244,236,238,197
8,Jehovah's Witness,20,27,24,24,21
9,Jewish,19,19,25,25,30


In [87]:
#religion열을 고정으로 피벗, 위치를 그대로 유지할 열의 이름
pew_long = pd.melt(pew, id_vars = 'religion')
pew_long.head()

Unnamed: 0,religion,variable,value
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15


In [88]:
# var_name : 위치를 변경한 열 이름
pew_long = pd.melt(pew, id_vars = 'religion', var_name = 'income', value_name = 'count')
pew_long.head()

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15


## 2개 이상의 열을 고정하고 나머지 열을 행으로 바꾸기

In [89]:
billboard = pd.read_csv('../data/billboard.csv')

billboard.iloc[0:5, 0:16]

Unnamed: 0,year,artist,track,time,date.entered,wk1,wk2,wk3,wk4,wk5,wk6,wk7,wk8,wk9,wk10,wk11
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,87,82.0,72.0,77.0,87.0,94.0,99.0,,,,
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,91,87.0,92.0,,,,,,,,
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,81,70.0,68.0,67.0,66.0,57.0,54.0,53.0,51.0,51.0,51.0
3,2000,3 Doors Down,Loser,4:24,2000-10-21,76,76.0,72.0,69.0,67.0,65.0,55.0,59.0,62.0,61.0,61.0
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,57,34.0,25.0,17.0,17.0,31.0,36.0,49.0,53.0,57.0,64.0


In [90]:
# year,artist,track,time,date.entered 열을 고정하고 나머지열(w1, w2...)을 피벗
billboard_long = pd.melt(billboard, id_vars = ['year', 'artist', 'track', 'time',
                                              'date.entered'], var_name = 'week', value_name = 'rating')
billboard_long[billboard_long['artist'] == '2 Pac'].head()

Unnamed: 0,year,artist,track,time,date.entered,week,rating
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0
317,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk2,82.0
634,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk3,72.0
951,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk4,77.0
1268,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk5,87.0


## ebola 데이터 집합 살펴보기

In [91]:
ebola = pd.read_csv('../data/country_timeseries.csv')
ebola.columns

Index(['Date', 'Day', 'Cases_Guinea', 'Cases_Liberia', 'Cases_SierraLeone',
       'Cases_Nigeria', 'Cases_Senegal', 'Cases_UnitedStates', 'Cases_Spain',
       'Cases_Mali', 'Deaths_Guinea', 'Deaths_Liberia', 'Deaths_SierraLeone',
       'Deaths_Nigeria', 'Deaths_Senegal', 'Deaths_UnitedStates',
       'Deaths_Spain', 'Deaths_Mali'],
      dtype='object')

In [92]:
ebola.iloc[:5, [0,1,2,3,10,11]]

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Deaths_Guinea,Deaths_Liberia
0,1/5/2015,289,2776.0,,1786.0,
1,1/4/2015,288,2775.0,,1781.0,
2,1/3/2015,287,2769.0,8166.0,1767.0,3496.0
3,1/2/2015,286,,8157.0,,3496.0
4,12/31/2014,284,2730.0,8115.0,1739.0,3471.0


In [93]:
# Date, Day 기준으로 피벗 ( 각 나라별 사망자 수를 확인)
ebola_long = pd.melt(ebola, id_vars = ['Date', 'Day'])
ebola_long.head()

Unnamed: 0,Date,Day,variable,value
0,1/5/2015,289,Cases_Guinea,2776.0
1,1/4/2015,288,Cases_Guinea,2775.0
2,1/3/2015,287,Cases_Guinea,2769.0
3,1/2/2015,286,Cases_Guinea,
4,12/31/2014,284,Cases_Guinea,2730.0


In [94]:
variable_split = ebola_long.variable.str.split('_')
#variable 컬럼의 데이터를 '_' 기준으로 스플릿하기 
variable_split.head()

0    [Cases, Guinea]
1    [Cases, Guinea]
2    [Cases, Guinea]
3    [Cases, Guinea]
4    [Cases, Guinea]
Name: variable, dtype: object

In [95]:
status_values = variable_split.str.get(0)
country_values = variable_split.str.get(1)
#스플릿된 각각의 스트링을 저장. 
print(status_values[:5])

0    Cases
1    Cases
2    Cases
3    Cases
4    Cases
Name: variable, dtype: object


In [96]:
country_values[:5]

0    Guinea
1    Guinea
2    Guinea
3    Guinea
4    Guinea
Name: variable, dtype: object

In [97]:
ebola_long['status'] = status_values
ebola_long['country'] = country_values
ebola_long.head()

Unnamed: 0,Date,Day,variable,value,status,country
0,1/5/2015,289,Cases_Guinea,2776.0,Cases,Guinea
1,1/4/2015,288,Cases_Guinea,2775.0,Cases,Guinea
2,1/3/2015,287,Cases_Guinea,2769.0,Cases,Guinea
3,1/2/2015,286,Cases_Guinea,,Cases,Guinea
4,12/31/2014,284,Cases_Guinea,2730.0,Cases,Guinea


In [98]:
variable_split = ebola_long.variable.str.split('_', expand = True)
variable_split.columns = ['status', 'country']
ebola_parsed = pd.concat([ebola_long, variable_split], axis = 1)
ebola_parsed.head()

Unnamed: 0,Date,Day,variable,value,status,country,status.1,country.1
0,1/5/2015,289,Cases_Guinea,2776.0,Cases,Guinea,Cases,Guinea
1,1/4/2015,288,Cases_Guinea,2775.0,Cases,Guinea,Cases,Guinea
2,1/3/2015,287,Cases_Guinea,2769.0,Cases,Guinea,Cases,Guinea
3,1/2/2015,286,Cases_Guinea,,Cases,Guinea,Cases,Guinea
4,12/31/2014,284,Cases_Guinea,2730.0,Cases,Guinea,Cases,Guinea


## 기상 데이터의 여러 열을 하나로 정리하기 - melt, pivot_table 메서드

In [99]:
weather = pd.read_csv('../data/weather.csv')
print(weather.iloc[:5, :11])

        id  year  month element  d1    d2    d3  d4    d5  d6  d7
0  MX17004  2010      1    tmax NaN   NaN   NaN NaN   NaN NaN NaN
1  MX17004  2010      1    tmin NaN   NaN   NaN NaN   NaN NaN NaN
2  MX17004  2010      2    tmax NaN  27.3  24.1 NaN   NaN NaN NaN
3  MX17004  2010      2    tmin NaN  14.4  14.4 NaN   NaN NaN NaN
4  MX17004  2010      3    tmax NaN   NaN   NaN NaN  32.1 NaN NaN


In [100]:
weather_melt = pd.melt(weather, id_vars = ['id', 'year', 'month', 'element'],
                      var_name = 'day', value_name = 'temp')
print(weather_melt.head())

        id  year  month element day  temp
0  MX17004  2010      1    tmax  d1   NaN
1  MX17004  2010      1    tmin  d1   NaN
2  MX17004  2010      2    tmax  d1   NaN
3  MX17004  2010      2    tmin  d1   NaN
4  MX17004  2010      3    tmax  d1   NaN


In [101]:
# index : 위치를 그대로 유지할 열 이름을 지정
weather_tidy = weather_melt.pivot_table(
    index = ['id', 'year', 'month', 'day'],
    columns = 'element',
    values = 'temp')

print(weather_tidy)

element                 tmax  tmin
id      year month day            
MX17004 2010 1     d30  27.8  14.5
             2     d11  29.7  13.4
                   d2   27.3  14.4
                   d23  29.9  10.7
                   d3   24.1  14.4
             3     d10  34.5  16.8
                   d16  31.1  17.6
                   d5   32.1  14.2
             4     d27  36.3  16.7
             5     d27  33.2  18.2
             6     d17  28.0  17.5
                   d29  30.1  18.0
             7     d3   28.6  17.5
                   d14  29.9  16.5
             8     d23  26.4  15.0
                   d5   29.6  15.8
                   d29  28.0  15.3
                   d13  29.8  16.5
                   d25  29.7  15.6
                   d31  25.4  15.4
                   d8   29.0  17.3
             10    d5   27.0  14.0
                   d14  29.5  13.0
                   d15  28.7  10.5
                   d28  31.2  15.0
                   d7   28.1  12.9
             11    d

In [102]:
weather_tidy_flat = weather_tidy.reset_index()
print(weather_tidy_flat.head())

element       id  year  month  day  tmax  tmin
0        MX17004  2010      1  d30  27.8  14.5
1        MX17004  2010      2  d11  29.7  13.4
2        MX17004  2010      2   d2  27.3  14.4
3        MX17004  2010      2  d23  29.9  10.7
4        MX17004  2010      2   d3  24.1  14.4


## 빌보드 차트의 중복 데이터 처리하기

In [103]:
billboard = pd.read_csv('../data/billboard.csv')

billboard_long = pd.melt(billboard, id_vars = ['year', 'artist', 'track', 'time', 'date.entered'], var_name = 'week', value_name = 'rating')

billboard_long.head()

Unnamed: 0,year,artist,track,time,date.entered,week,rating
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,wk1,91.0
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,wk1,81.0
3,2000,3 Doors Down,Loser,4:24,2000-10-21,wk1,76.0
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,wk1,57.0


In [104]:
print(billboard_long[billboard_long.track == 'Loser'].head())

      year        artist  track  time date.entered week  rating
3     2000  3 Doors Down  Loser  4:24   2000-10-21  wk1    76.0
320   2000  3 Doors Down  Loser  4:24   2000-10-21  wk2    76.0
637   2000  3 Doors Down  Loser  4:24   2000-10-21  wk3    72.0
954   2000  3 Doors Down  Loser  4:24   2000-10-21  wk4    69.0
1271  2000  3 Doors Down  Loser  4:24   2000-10-21  wk5    67.0


In [105]:
billboard_songs = billboard_long[['year', 'artist', 'track', 'time']]
print(billboard_songs.shape)

(24092, 4)


In [106]:
billboard_songs = billboard_songs.drop_duplicates()
print(billboard_songs.shape)

(317, 4)


In [107]:
print(billboard_songs[billboard_songs.track == 'Loser'])

   year        artist  track  time
3  2000  3 Doors Down  Loser  4:24


In [108]:
billboard_ratings = billboard_long.merge(billboard_songs, on = ['year', 'artist', 'track', 'time'])
print(billboard_ratings.shape)

(24092, 7)


In [109]:
print(billboard_ratings.head())

   year artist                    track  time date.entered week  rating
0  2000  2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk1    87.0
1  2000  2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk2    82.0
2  2000  2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk3    72.0
3  2000  2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk4    77.0
4  2000  2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk5    87.0


## 뉴욕 택시 데이터

In [110]:
# fhv_tripdata_2015-01, 02,03,04,05
import glob
nyc_taxi_data = glob.glob('../data/fhv_*')
print(nyc_taxi_data)

['../data\\fhv_tripdata_2015-01.csv', '../data\\fhv_tripdata_2015-02.csv', '../data\\fhv_tripdata_2015-03.csv', '../data\\fhv_tripdata_2015-04.csv', '../data\\fhv_tripdata_2015-05.csv']


In [111]:
taxi1 = pd.read_csv(nyc_taxi_data[0])
taxi2 = pd.read_csv(nyc_taxi_data[1])
taxi3 = pd.read_csv(nyc_taxi_data[2])
taxi4 = pd.read_csv(nyc_taxi_data[3])
taxi5 = pd.read_csv(nyc_taxi_data[4])


In [112]:
print(taxi1.head(2))
print(taxi2.head(2))

  Dispatching_base_num          Pickup_date  locationID
0               B00013  2015-01-01 00:30:00         NaN
1               B00013  2015-01-01 01:22:00         NaN
  Dispatching_base_num          Pickup_date  locationID
0               B00013  2015-02-01 00:00:00         NaN
1               B00013  2015-02-01 00:01:00         NaN


In [113]:
print(type(taxi1))
print(taxi1.shape)
print(taxi2.shape)
print(taxi3.shape)
print(taxi4.shape)
print(taxi5.shape)


<class 'pandas.core.frame.DataFrame'>
(2746033, 3)
(3126401, 3)
(3281427, 3)
(3917789, 3)
(4296067, 3)


In [114]:
taxi = pd.concat([taxi1, taxi2, taxi3, taxi4, taxi5])
print(taxi.shape)

(17367717, 3)


In [115]:
list_taxi_df = []

for csv_filename in nyc_taxi_data:
    print(csv_filename)
    df = pd.read_csv(csv_filename)
    list_taxi_df.append(df)
    
print(len(list_taxi_df))    

../data\fhv_tripdata_2015-01.csv
../data\fhv_tripdata_2015-02.csv
../data\fhv_tripdata_2015-03.csv
../data\fhv_tripdata_2015-04.csv
../data\fhv_tripdata_2015-05.csv
5


In [116]:
taxi_loop_concat = pd.concat(list_taxi_df)
print(taxi_loop_concat.shape)

(17367717, 3)


In [117]:
print(taxi.equals(taxi_loop_concat))

True


In [118]:
import openpyxl
a = pd.read_excel('../data/02. sales-funnel.xlsx')

In [119]:
a.pivot_table(index = 'Name', values = ['Account', 'Price', 'Quantity'])

Unnamed: 0_level_0,Account,Price,Quantity
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Barton LLC,740150,35000,1.0
"Fritsch, Russel and Anderson",737550,35000,1.0
Herman LLC,141962,65000,2.0
Jerde-Hilpert,412290,5000,2.0
"Kassulke, Ondricka and Metz",307599,7000,3.0
Keeling LLC,688981,100000,5.0
Kiehn-Spinka,146832,65000,2.0
Koepp Ltd,729833,35000,2.0
Kulas Inc,218895,25000,1.5
Purdy-Kunde,163416,30000,1.0


In [120]:
a.pivot_table(index = ['Name','Rep', 'Manager'], values = ['Account', 'Price', 'Quantity'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Account,Price,Quantity
Name,Rep,Manager,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Barton LLC,John Smith,Debra Henley,740150,35000,1.0
"Fritsch, Russel and Anderson",Craig Booker,Debra Henley,737550,35000,1.0
Herman LLC,Cedric Moss,Fred Anderson,141962,65000,2.0
Jerde-Hilpert,John Smith,Debra Henley,412290,5000,2.0
"Kassulke, Ondricka and Metz",Wendy Yule,Fred Anderson,307599,7000,3.0
Keeling LLC,Wendy Yule,Fred Anderson,688981,100000,5.0
Kiehn-Spinka,Daniel Hilton,Debra Henley,146832,65000,2.0
Koepp Ltd,Wendy Yule,Fred Anderson,729833,35000,2.0
Kulas Inc,Daniel Hilton,Debra Henley,218895,25000,1.5
Purdy-Kunde,Cedric Moss,Fred Anderson,163416,30000,1.0


In [121]:
a.pivot_table(index = ['Name','Rep'], values = 'Price')

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Name,Rep,Unnamed: 2_level_1
Barton LLC,John Smith,35000
"Fritsch, Russel and Anderson",Craig Booker,35000
Herman LLC,Cedric Moss,65000
Jerde-Hilpert,John Smith,5000
"Kassulke, Ondricka and Metz",Wendy Yule,7000
Keeling LLC,Wendy Yule,100000
Kiehn-Spinka,Daniel Hilton,65000
Koepp Ltd,Wendy Yule,35000
Kulas Inc,Daniel Hilton,25000
Purdy-Kunde,Cedric Moss,30000


# 데이터 저장, 읽기

In [128]:
import pandas as pd 
practice = pd.DataFrame({'날짜':[],'운동':[], '양':[]})
practice

Unnamed: 0,날짜,운동,양


In [129]:
practice.loc[0] = ['19-3-1', '달리기', 1.]

In [130]:
practice.loc[1] = ['19-3-2', '걷기', 1.]
practice.loc[2] = ['19-3-2', '달리기', 1.]
practice.loc[3] = ['19-3-2', '계단오르기', 1.]
practice

Unnamed: 0,날짜,운동,양
0,19-3-1,달리기,1.0
1,19-3-2,걷기,1.0
2,19-3-2,달리기,1.0
3,19-3-2,계단오르기,1.0


In [131]:
practice.pivot_table(index = '운동', values=['날짜', '양'])

Unnamed: 0_level_0,양
운동,Unnamed: 1_level_1
걷기,1.0
계단오르기,1.0
달리기,1.0


In [134]:
practice.pivot('날짜', '운동', '양')

운동,걷기,계단오르기,달리기
날짜,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19-3-1,,,1.0
19-3-2,1.0,1.0,1.0


In [136]:
df = pd.read_excel('../data/02. sales-funnel.xlsx')
df.head()

Unnamed: 0,Account,Name,Rep,Manager,Product,Quantity,Price,Status
0,714466,Trantow-Barrows,Craig Booker,Debra Henley,CPU,1,30000,presented
1,714466,Trantow-Barrows,Craig Booker,Debra Henley,Software,1,10000,presented
2,714466,Trantow-Barrows,Craig Booker,Debra Henley,Maintenance,2,5000,pending
3,737550,"Fritsch, Russel and Anderson",Craig Booker,Debra Henley,CPU,1,35000,declined
4,146832,Kiehn-Spinka,Daniel Hilton,Debra Henley,CPU,2,65000,won


In [138]:
pd.pivot_table(df, index = ['Name', 'Rep', 'Manager'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Account,Price,Quantity
Name,Rep,Manager,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Barton LLC,John Smith,Debra Henley,740150,35000,1.0
"Fritsch, Russel and Anderson",Craig Booker,Debra Henley,737550,35000,1.0
Herman LLC,Cedric Moss,Fred Anderson,141962,65000,2.0
Jerde-Hilpert,John Smith,Debra Henley,412290,5000,2.0
"Kassulke, Ondricka and Metz",Wendy Yule,Fred Anderson,307599,7000,3.0
Keeling LLC,Wendy Yule,Fred Anderson,688981,100000,5.0
Kiehn-Spinka,Daniel Hilton,Debra Henley,146832,65000,2.0
Koepp Ltd,Wendy Yule,Fred Anderson,729833,35000,2.0
Kulas Inc,Daniel Hilton,Debra Henley,218895,25000,1.5
Purdy-Kunde,Cedric Moss,Fred Anderson,163416,30000,1.0


In [139]:
pd.pivot_table(df, index = ['Name'])
#인덱싱을 하고 난 다음, 열로 들어오는 컬럼들은 넘버링 데이터가 들어오게 된다. -> 'Rep', Manager같은 경우는 들어오지 않음

Unnamed: 0_level_0,Account,Price,Quantity
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Barton LLC,740150,35000,1.0
"Fritsch, Russel and Anderson",737550,35000,1.0
Herman LLC,141962,65000,2.0
Jerde-Hilpert,412290,5000,2.0
"Kassulke, Ondricka and Metz",307599,7000,3.0
Keeling LLC,688981,100000,5.0
Kiehn-Spinka,146832,65000,2.0
Koepp Ltd,729833,35000,2.0
Kulas Inc,218895,25000,1.5
Purdy-Kunde,163416,30000,1.0


# 여러가지 자료형을 문자열로 변환하기

## 자료형 자유자재로 변환하기 -astype 메서드

In [151]:
import seaborn as sns
tips = sns.load_dataset('tips')

In [152]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [153]:
tips['sex_str'] = tips['sex'].astype(str)
#'sex'데이터를 스트링 형으로 'sex_str'에 넣어준다.

print(tips.dtypes)
#오브젝트 형은 파이썬의 케릭터 형을 뜻한다. 
tips.head()

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_str
0,16.99,1.01,Female,No,Sun,Dinner,2,Female
1,10.34,1.66,Male,No,Sun,Dinner,3,Male
2,21.01,3.5,Male,No,Sun,Dinner,3,Male
3,23.68,3.31,Male,No,Sun,Dinner,2,Male
4,24.59,3.61,Female,No,Sun,Dinner,4,Female


In [154]:
tips['total_bill'] = tips['total_bill'].astype(str)
tips.dtypes
#'total_bill'의 데이터를 str형태로 변환

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [155]:
tips['total_bill'] = tips['total_bill'].astype(float)
tips.dtypes
#'total_bill'의 데이터를 float형태로 변환

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

## 잘못 입력한 문자열 처리하기 -to_numeric 메서드, error 옵션

In [156]:
tips_sub_miss = tips.head(10)
tips_sub_miss.loc[[1,3,5,7], 'total_bill'] = 'missing'
tips_sub_miss

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_str
0,16.99,1.01,Female,No,Sun,Dinner,2,Female
1,missing,1.66,Male,No,Sun,Dinner,3,Male
2,21.01,3.5,Male,No,Sun,Dinner,3,Male
3,missing,3.31,Male,No,Sun,Dinner,2,Male
4,24.59,3.61,Female,No,Sun,Dinner,4,Female
5,missing,4.71,Male,No,Sun,Dinner,4,Male
6,8.77,2.0,Male,No,Sun,Dinner,2,Male
7,missing,3.12,Male,No,Sun,Dinner,4,Male
8,15.04,1.96,Male,No,Sun,Dinner,2,Male
9,14.78,3.23,Male,No,Sun,Dinner,2,Male


In [158]:
tips_sub_miss.dtypes
#'total_bill'의 특정 행에 missing 데이터를 입력했기 때문에, 칼럼의 dtypes 이 str형태로 변경되었다.

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [159]:
tips_sub_miss['total_bill'].astype(float)
# 스트링 형테의 데이터가 있는 컬럼을 자료형변환 하게되면 에러코드가 발생한다.

ValueError: could not convert string to float: 'missing'

In [160]:
pd.to_numeric(tips_sub_miss['total_bill'])
#마찬가지로, pandas 모듈중 하나인 to_numeric으로, str형의 데이터가 있는 컬럼의 dtype을 변경하려 해도 에러가 발생한드아

ValueError: Unable to parse string "missing" at position 1

In [162]:
tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], errors='ignore')
# 오류를 무시하는 옵션인 error = 'ignore'을 넣어주면 에러코드 발생은 안되지만 자료형변환은 이뤄지지 않는다.
print(tips_sub_miss.dtypes)


total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [164]:
tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce')
# 강제로 형변환을 하기 위해선, errors='coerce'옵션을 넣어준다. 자료 형변환은 이뤄지며, 변환될 타입에 맞지않은 자리에는 NaN이 들어간다. 
print(tips_sub_miss.dtypes)
tips_sub_miss.head()


total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_str
0,16.99,1.01,Female,No,Sun,Dinner,2,Female
1,,1.66,Male,No,Sun,Dinner,3,Male
2,21.01,3.5,Male,No,Sun,Dinner,3,Male
3,,3.31,Male,No,Sun,Dinner,2,Male
4,24.59,3.61,Female,No,Sun,Dinner,4,Female


In [166]:
tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce', downcast='float')
# downcast='float' 옵션은 float형을 더 작게 형변환을 하겠단 의미. 64 -> 32
print(tips_sub_miss.dtypes)


total_bill     float32
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
