# Pandas

In [9]:
import pandas as pd
import numpy as np
# With `from pandas import Series`, you can write Series instead of pd.Series.

### 1. data loading

In [2]:
# Source of the UCI housing data file: whitespace-delimited text with no header row.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
# sep is a regex matching one or more whitespace characters. It is written as a
# raw string so that "\s" is not parsed as an (invalid) Python escape sequence.
# header=None: the file carries no column names, so pandas numbers the columns.
df = pd.read_csv(data_url, sep=r"\s+", header=None)

In [3]:
df.head() # show the first five rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [4]:
# Name the 14 positional columns (the file was loaded with header=None).
df.columns= ['CRIM','ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO' ,'B', 'LSTAT', 'MEDV'] 
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
type(df.values) # .values returns the DataFrame's data as a NumPy array

numpy.ndarray

### 3. pandas series

In [15]:
# Build a Series from a plain list; no index is given, so pandas labels the
# entries 0..n-1.
list_data = list(range(1, 6))
ex = pd.Series(data=list_data)
ex

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [17]:
# Same values as before, now labelled with explicit string index entries.
list_name = list("abcde")
ex = pd.Series(list_data, index=list_name)
ex

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [18]:
ex.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [19]:
ex.values

array([1, 2, 3, 4, 5], dtype=int64)

In [21]:
type(ex.values) # the values of a Series are held in a NumPy array

numpy.ndarray

In [23]:
# A dict becomes a Series whose index is the dict's keys. dtype and name are
# set explicitly here instead of being inferred.
dict_data = dict(a=1, b=2, c=3, d=4, e=5)
ex = pd.Series(data=dict_data, name="example_data", dtype=np.float32)
ex

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [24]:
ex["a"] # look up a value by its index label

1.0

In [25]:
ex["a"] = 2.2
ex

a    2.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [26]:
ex[ex > 2]

a    2.2
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [27]:
np.exp(ex) # NumPy functions apply element-wise to a Series; np.abs() and np.log() work the same way

a      9.025014
b      7.389056
c     20.085537
d     54.598148
e    148.413162
Name: example_data, dtype: float32

In [29]:
"b" in ex

True

In [30]:
ex.to_dict() # convert the Series back into a dictionary

{'a': 2.200000047683716, 'b': 2.0, 'c': 3.0, 'd': 4.0, 'e': 5.0}

In [37]:
# The requested index is longer than the dict: labels with no matching key
# ("f", "g", "h") are filled with NaN.
dict_data_1 = dict(a=1, b=2, c=3, d=4, e=5)
indexes = list("abcdefgh")
series_obj_1 = pd.Series(data=dict_data_1, index=indexes)
series_obj_1

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
dtype: float64

### 4. pandas dataframe

In [40]:
from pandas import Series, DataFrame

In [41]:
# Column-oriented sample records: each key becomes a column and each list
# supplies that column's values, one per row.
raw_data = dict(
    first_name=["Jason", "Molly", "Tina", "Jake", "Amy"],
    last_name=["Miller", "Jacobson", "Ali", "Milner", "Cooze"],
    age=[42, 52, 36, 24, 73],
    city=["San Francisco", "Baltimore", "Miami", "Douglas", "Boston"],
)
df = DataFrame(data=raw_data)
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [42]:
DataFrame(raw_data, columns = ["age", "city"])

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Miami
3,24,Douglas
4,73,Boston


In [63]:
# Listing a column that raw_data does not contain ('debt') adds it as an
# empty (all-missing) column.
df=DataFrame(raw_data, columns = ["age",'city', 'debt'])
df

Unnamed: 0,age,city,debt
0,42,San Francisco,
1,52,Baltimore,
2,36,Miami,
3,24,Douglas,
4,73,Boston,


In [45]:
df.age # access a column by its name using attribute syntax

0    42
1    52
2    36
3    24
4    73
Name: age, dtype: int64

In [56]:
df["age"]

0    42
1    52
2    36
3    24
4    73
Name: age, dtype: int64

In [59]:
df["age"].iloc[1:]

1    52
2    36
3    24
4    73
Name: age, dtype: int64

In [52]:
# Integer labels that deliberately differ from the positions 0..9, so that
# label-based and position-based lookups give different answers.
index_labels = [*range(49, 44, -1), *range(1, 6)]
s = pd.Series(range(10), index=index_labels)
s

49    0
48    1
47    2
46    3
45    4
1     5
2     6
3     7
4     8
5     9
dtype: int64

In [53]:
s.loc[2] # loc: select by index label (the entry labelled 2, not the third entry)

6

In [54]:
s.iloc[2] # iloc: select by integer position (the third entry)

2

In [55]:
s.iloc[:3]

49    0
48    1
47    2
dtype: int64

In [60]:
df[df.age>40]

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
4,Amy,Cooze,73,Boston


In [64]:
# Flag rows whose age exceeds 40. Bracket assignment is used because
# attribute-style assignment (df.debt = ...) only updates a column that already
# exists; otherwise it sets a plain Python attribute and creates no column.
df["debt"] = df["age"] > 40
df

Unnamed: 0,age,city,debt
0,42,San Francisco,True
1,52,Baltimore,True
2,36,Miami,False
3,24,Douglas,False
4,73,Boston,True


In [67]:
# A small Series keyed by row labels 0, 1 and 3 (labels 2 and 4 are left out
# on purpose).
values = Series({0: 'M', 1: 'F', 3: 'F'})
values

0    M
1    F
3    F
dtype: object

In [69]:
df["sex"] = values # aligned on the index; rows with no matching label in `values` become NaN
df

Unnamed: 0,age,city,debt,sex
0,42,San Francisco,True,M
1,52,Baltimore,True,F
2,36,Miami,False,
3,24,Douglas,False,F
4,73,Boston,True,


In [70]:
df.T

Unnamed: 0,0,1,2,3,4
age,42,52,36,24,73
city,San Francisco,Baltimore,Miami,Douglas,Boston
debt,True,True,False,False,True
sex,M,F,,F,


In [71]:
df.values

array([[42, 'San Francisco', True, 'M'],
       [52, 'Baltimore', True, 'F'],
       [36, 'Miami', False, nan],
       [24, 'Douglas', False, 'F'],
       [73, 'Boston', True, nan]], dtype=object)

In [72]:
df.to_csv()

',age,city,debt,sex\n0,42,San Francisco,True,M\n1,52,Baltimore,True,F\n2,36,Miami,False,\n3,24,Douglas,False,F\n4,73,Boston,True,\n'

In [73]:
del df["debt"] # removes the column from df itself, immediately

In [74]:
df

Unnamed: 0,age,city,sex
0,42,San Francisco,M
1,52,Baltimore,F
2,36,Miami,
3,24,Douglas,F
4,73,Boston,


In [77]:
# Dict of dicts: the outer keys become columns and the inner keys become the
# row index. A year missing for one state (2000 for Nevada) shows up as NaN.
pop = {'Nevada' : {2001 : 2.4, 2002 : 2.9}, 
       'Ohio' : {2000 : 1.5, 2001 : 1.7, 2002 : 3.6}}

DataFrame(pop)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


### 5. data selection

In [80]:
# Load the sample workbook; the path is relative to the notebook's working directory.
df = pd.read_excel("./excel-comp-data.xlsx")
df.head()

Unnamed: 0,account,name,street,city,state,postal-code,Jan,Feb,Mar
0,211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,109996,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,121213,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000


In [82]:
df["account"].head(2) # first two rows of one column

0    211829
1    320563
Name: account, dtype: int64

In [83]:
df[["account",'street','state']].head(3)

Unnamed: 0,account,street,state
0,211829,34456 Sean Highway,Texas
1,320563,1311 Alvis Tunnel,NorthCarolina
2,648336,62184 Schamberger Underpass Apt. 231,Iowa


In [84]:
df[:10]

Unnamed: 0,account,name,street,city,state,postal-code,Jan,Feb,Mar
0,211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,109996,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,121213,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000
5,132971,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Jeremieburgh,Arkansas,62785,150000,120000,35000
6,145068,Casper LLC,340 Consuela Bridge Apt. 400,Lake Gabriellaton,Mississipi,18008,62000,120000,70000
7,205217,Kovacek-Johnston,91971 Cronin Vista Suite 601,Deronville,RhodeIsland,53461,145000,95000,35000
8,209744,Champlin-Morar,26739 Grant Lock,Lake Juliannton,Pennsylvania,64415,70000,95000,35000
9,212303,Gerhold-Maggio,366 Maggio Grove Apt. 998,North Ras,Idaho,46308,70000,120000,35000


In [85]:
df['name'][:3]

0    Kerluke, Koepp and Hilpert
1                Walter-Trantow
2    Bashirian, Kunde and Price
Name: name, dtype: object

In [86]:
account_s = df["account"]
account_s[:3]

0    211829
1    320563
2    648336
Name: account, dtype: int64

In [88]:
account_s[[1,5,2]]

1    320563
5    132971
2    648336
Name: account, dtype: int64

In [89]:
account_s[account_s<250000]

0     211829
3     109996
4     121213
5     132971
6     145068
7     205217
8     209744
9     212303
10    214098
11    231907
12    242368
Name: account, dtype: int64

In [91]:
df.index = df["account"] # use the account column as the row index; the column itself is still present
df.head()

Unnamed: 0_level_0,account,name,street,city,state,postal-code,Jan,Feb,Mar
account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
211829,211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
320563,320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
648336,648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
109996,109996,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
121213,121213,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000


In [92]:
# Drop the now-redundant column; the account numbers remain available as the index.
del df["account"]
df

Unnamed: 0_level_0,name,street,city,state,postal-code,Jan,Feb,Mar
account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
109996,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
121213,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000
132971,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Jeremieburgh,Arkansas,62785,150000,120000,35000
145068,Casper LLC,340 Consuela Bridge Apt. 400,Lake Gabriellaton,Mississipi,18008,62000,120000,70000
205217,Kovacek-Johnston,91971 Cronin Vista Suite 601,Deronville,RhodeIsland,53461,145000,95000,35000
209744,Champlin-Morar,26739 Grant Lock,Lake Juliannton,Pennsylvania,64415,70000,95000,35000
212303,Gerhold-Maggio,366 Maggio Grove Apt. 998,North Ras,Idaho,46308,70000,120000,35000


In [103]:
df.loc[211829]

name           Kerluke, Koepp and Hilpert
street                 34456 Sean Highway
city                           New Jaycob
state                               Texas
postal-code                         28752
Jan                                 10000
Feb                                 62000
Mar                                 35000
Name: 211829, dtype: object

In [104]:
df[df.index == 211829]

Unnamed: 0_level_0,name,street,city,state,postal-code,Jan,Feb,Mar
account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000


In [105]:
df[['name','street']][:2]

Unnamed: 0_level_0,name,street
account,Unnamed: 1_level_1,Unnamed: 2_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway
320563,Walter-Trantow,1311 Alvis Tunnel


In [107]:
df.loc[[211829,320563],['name','street']]

Unnamed: 0_level_0,name,street
account,Unnamed: 1_level_1,Unnamed: 2_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway
320563,Walter-Trantow,1311 Alvis Tunnel


In [108]:
df[['name','street']].iloc[:3]

Unnamed: 0_level_0,name,street
account,Unnamed: 1_level_1,Unnamed: 2_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway
320563,Walter-Trantow,1311 Alvis Tunnel
648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231


In [115]:
# Replace the account-number index with positional labels 0..n-1. The length
# is taken from the frame itself rather than a hard-coded 15, so the cell does
# not fail with a length mismatch if the workbook's row count changes.
df.index = pd.RangeIndex(len(df))
df.head()

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
0,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000


In [116]:
df.drop(1)

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
0,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000
5,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Jeremieburgh,Arkansas,62785,150000,120000,35000
6,Casper LLC,340 Consuela Bridge Apt. 400,Lake Gabriellaton,Mississipi,18008,62000,120000,70000
7,Kovacek-Johnston,91971 Cronin Vista Suite 601,Deronville,RhodeIsland,53461,145000,95000,35000
8,Champlin-Morar,26739 Grant Lock,Lake Juliannton,Pennsylvania,64415,70000,95000,35000
9,Gerhold-Maggio,366 Maggio Grove Apt. 998,North Ras,Idaho,46308,70000,120000,35000
10,"Goodwin, Homenick and Jerde",649 Cierra Forks Apt. 078,Rosaberg,Tenessee,47743,45000,120000,55000


In [117]:
df.drop([0,1,2,3])

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000
5,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Jeremieburgh,Arkansas,62785,150000,120000,35000
6,Casper LLC,340 Consuela Bridge Apt. 400,Lake Gabriellaton,Mississipi,18008,62000,120000,70000
7,Kovacek-Johnston,91971 Cronin Vista Suite 601,Deronville,RhodeIsland,53461,145000,95000,35000
8,Champlin-Morar,26739 Grant Lock,Lake Juliannton,Pennsylvania,64415,70000,95000,35000
9,Gerhold-Maggio,366 Maggio Grove Apt. 998,North Ras,Idaho,46308,70000,120000,35000
10,"Goodwin, Homenick and Jerde",649 Cierra Forks Apt. 078,Rosaberg,Tenessee,47743,45000,120000,55000
11,Hahn-Moore,18115 Olivine Throughway,Norbertomouth,NorthDakota,31415,150000,10000,162000
12,"Frami, Anderson and Donnelly",182 Bertie Road,East Davian,Iowa,72686,162000,120000,35000
13,Walsh-Haley,2624 Beatty Parkways,Goodwinmouth,RhodeIsland,31919,55000,120000,35000


In [119]:
df.drop("city", axis=1).head()

Unnamed: 0,name,street,state,postal-code,Jan,Feb,Mar
0,"Kerluke, Koepp and Hilpert",34456 Sean Highway,Texas,28752,10000,62000,35000
1,Walter-Trantow,1311 Alvis Tunnel,NorthCarolina,38365,95000,45000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,California,49681,162000,120000,35000


In [120]:
df # the df.drop(...) calls above returned new frames; df itself is unchanged

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
0,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000
5,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Jeremieburgh,Arkansas,62785,150000,120000,35000
6,Casper LLC,340 Consuela Bridge Apt. 400,Lake Gabriellaton,Mississipi,18008,62000,120000,70000
7,Kovacek-Johnston,91971 Cronin Vista Suite 601,Deronville,RhodeIsland,53461,145000,95000,35000
8,Champlin-Morar,26739 Grant Lock,Lake Juliannton,Pennsylvania,64415,70000,95000,35000
9,Gerhold-Maggio,366 Maggio Grove Apt. 998,North Ras,Idaho,46308,70000,120000,35000


In [121]:
# inplace=True removes the row from df itself instead of returning a new frame.
# NOTE(review): this makes the cell non-idempotent; re-running it presumably
# fails once label 0 is gone. Confirm before relying on Run All twice.
df.drop(0, inplace=True)
df.head()

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
1,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000
5,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Jeremieburgh,Arkansas,62785,150000,120000,35000


### 6. map, apply, lambda

#### lambda
- 한 줄로 함수를 표현하는 익명 함수 기법.

In [122]:
# A two-argument anonymous function, bound to a name so it can be called.
f = lambda x,y : x+y
f(1,4)

5

In [123]:
# Rebinding f: a one-argument lambda that adds 5 to its input.
f = lambda x : x+5
f(3)

8

In [124]:
(lambda x :x+1)(5) # a lambda can also be called immediately, without naming it

6

#### map & replace
- map : 함수와 sequence형 데이터를 인자로 받아 각 element마다 입력받은 함수를 적용한다. Python 3에서는 결과가 list가 아니라 iterator(map 객체)로 반환되므로, 값을 확인하려면 `list()`로 감싸야 한다.

In [127]:
ex = [1,2,3,4,5]
f = lambda x: x ** 2
list(map(f,ex)) # map() returns a lazy iterator, so wrap it in list() to see the values

[1, 4, 9, 16, 25]

In [128]:
# With two iterables, map passes f one element from each, pairwise.
f = lambda x,y :x+y
list(map(f,ex,ex))

[2, 4, 6, 8, 10]

In [129]:
# Ten consecutive integers (0..9) used below to demonstrate Series.map.
s1 = Series(np.arange(10))
s1.head()

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [130]:
s1.map(lambda x: x**2).head() # Series.map applies the function to each element and returns a new Series

0     0
1     1
2     4
3     9
4    16
dtype: int64

In [131]:
# Mapping with a dict: each value is looked up as a key, and values that are
# not keys of the dict become NaN.
z = {1 : 'A', 5 : 'B', 9 : 'C'}
s1.map(z)

0    NaN
1      A
2    NaN
3    NaN
4    NaN
5      B
6    NaN
7    NaN
8    NaN
9      C
dtype: object

In [133]:
s1 # s1 itself is not modified by map

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [135]:
# Load the wages sample data; the path is relative to the notebook's working directory.
df = pd.read_csv("./wages.csv")
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [136]:
df.sex.unique()

array(['male', 'female'], dtype=object)

In [140]:
# Encode the two observed categories as integers in a new column.
sex_to_code = {"male": 0, "female": 1}
df["sex_code"] = df["sex"].map(sex_to_code)
df.head()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,male,white,16,49,0
1,96396.988643,66.23,female,white,16,62,1
2,48710.666947,63.77,female,white,16,33,1
3,80478.096153,63.22,female,other,16,95,1
4,82089.345498,63.08,female,white,17,43,1


#### apply & applymap

In [152]:
# Reload the file so this section starts from the original columns
# (the sex_code column added earlier is gone again).
df = pd.read_csv("wages.csv")
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [153]:
# Keep only the three numeric columns used in the apply examples.
numeric_columns = ["earn", "height", "age"]
df_info = df[numeric_columns]
df_info.head()

Unnamed: 0,earn,height,age
0,79571.299011,73.89,49
1,96396.988643,66.23,62
2,48710.666947,63.77,33
3,80478.096153,63.22,95
4,82089.345498,63.08,43


In [154]:
# DataFrame.apply hands each column to f as a Series; f returns that column's
# range (max minus min), giving one value per column.
f = lambda x : x.max() - x.min()
df_info.apply(f)

earn      318047.708444
height        19.870000
age           73.000000
dtype: float64

In [156]:
df_info.apply(sum) # the built-in sum applied per column; gives the same totals as df_info.sum()

earn      4.474344e+07
height    9.183125e+04
age       6.250800e+04
dtype: float64

In [158]:
df_info.sum()

earn      4.474344e+07
height    9.183125e+04
age       6.250800e+04
dtype: float64