## DataFrame Indexing

In [2]:
import pandas as pd

# Raw records keyed by column name; every list holds one value per row.
data = dict(
    names=["Kim", "Kim", "Kim", "Park", "Park"],
    year=[2014, 2015, 2016, 2015, 2016],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
)
# `columns` fixes the column order; "penalty" is not a key of `data`, so that
# column is created empty (all NaN, as the output below shows).
df = pd.DataFrame(
    data,
    index=["one", "two", "three", "four", "five"],
    columns=["year", "names", "points", "penalty"],
)
df

Unnamed: 0,year,names,points,penalty
one,2014,Kim,1.5,
two,2015,Kim,1.7,
three,2016,Kim,3.6,
four,2015,Park,2.4,
five,2016,Park,2.9,


### DataFrame에서 열을 선택하고 조작하기

In [4]:
# Bracket indexing with a single column label selects that column.
df['points']

one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
Name: points, dtype: float64

In [6]:
# Another way to do the same kind of selection: attribute-style access
# to a column.
df.names

one       Kim
two       Kim
three     Kim
four     Park
five     Park
Name: names, dtype: object

In [12]:
# Passing a list of labels selects several columns at once.
df[['year','names']]

Unnamed: 0,year,names
one,2014,Kim
two,2015,Kim
three,2016,Kim
four,2015,Park
five,2016,Park


In [13]:
# A column selected this way can also be assigned to; here the scalar 0.5
# is written to every row of 'penalty'.
df['penalty']=0.5
df

Unnamed: 0,year,names,points,penalty
one,2014,Kim,1.5,0.5
two,2015,Kim,1.7,0.5
three,2016,Kim,3.6,0.5
four,2015,Park,2.4,0.5
five,2016,Park,2.9,0.5


In [22]:
# A Python list or a NumPy array can be assigned as well.
df['penalty'] = [0.1, 0.2, 0.3, 0.4, 0.5]
# df['penalty'] = [0.1, 0.2, 0.5]  # error: 3 values for a 5-row frame
df

Unnamed: 0,year,names,points,penalty
one,2014,Kim,1.5,0.1
two,2015,Kim,1.7,0.2
three,2016,Kim,3.6,0.3
four,2015,Park,2.4,0.4
five,2016,Park,2.9,0.5


In [1]:
import numpy as np

# Add a new column by assigning to a label that does not exist yet.
# NOTE(review): this cell relies on the `df` built in an earlier cell; the
# recorded NameError below presumably comes from running it on a kernel where
# `df` had not been defined yet.
df['zeros']=np.arange(5)

df

NameError: name 'df' is not defined

In [18]:
# A Series can be assigned as a column too. Its values are matched to rows by
# index label, so rows that are absent from `val` ('one', 'three') get NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two','four','five'])
# The recorded output below has both a 'debt' and a 'dfg' column, and a later
# cell runs `del df['debt']`. Create 'debt' here as well so that cell does not
# raise KeyError on a fresh top-to-bottom run.
df['debt'] = val
df['dfg'] = val
df

Unnamed: 0,year,names,points,penalty,zeros,debt,dfg
one,2014,Kim,1.5,0.5,0,,
two,2015,Kim,1.7,0.5,1,-1.2,-1.2
three,2016,Kim,3.6,0.5,2,,
four,2015,Park,2.4,0.5,3,-1.5,-1.5
five,2016,Park,2.9,0.5,4,-1.7,-1.7


* 하지만 Series로 넣을 때는 `val`과 같이, 넣으려는 data의 index에 맞춰서 데이터가 들어간다.
* 이 점이 Python list나 NumPy array로 데이터를 넣을 때와 가장 큰 차이점이다.

In [22]:
# Add new columns computed from existing ones.
# 'net_points' is the element-wise difference of two columns; 'high_points'
# is the boolean result of comparing 'net_points' with 2.3.
df['net_points'] = df['points'] - df['penalty']
df['high_points'] = df.net_points > 2.3
df

Unnamed: 0,year,names,points,penalty,zeros,debt,dfg,net_points,high_points
one,2014,Kim,1.5,0.5,0,,,1.0,False
two,2015,Kim,1.7,0.5,1,-1.2,-1.2,1.2,False
three,2016,Kim,3.6,0.5,2,,,3.1,True
four,2015,Park,2.4,0.5,3,-1.5,-1.5,1.9,False
five,2016,Park,2.9,0.5,4,-1.7,-1.7,2.4,True


In [23]:
# Delete a column in place. This requires that `df` already has a 'debt'
# column when the cell runs.
del df['debt']


df

Unnamed: 0,year,names,points,penalty,zeros,dfg,net_points,high_points
one,2014,Kim,1.5,0.5,0,,1.0,False
two,2015,Kim,1.7,0.5,1,-1.2,1.2,False
three,2016,Kim,3.6,0.5,2,,3.1,True
four,2015,Park,2.4,0.5,3,-1.5,1.9,False
five,2016,Park,2.9,0.5,4,-1.7,2.4,True


In [27]:
# Name the row index and the column index; the names appear as the
# 'Order' / 'Info' header labels when the frame is displayed.
# (A bare `df.columns` line used to sit here. Only the last expression of a
# cell is displayed, so it had no effect and was removed.)
df.index.name = 'Order'
df.columns.name = 'Info'
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014,Kim,1.5,0.1,
two,2015,Kim,1.7,0.2,-1.2
three,2016,Kim,3.6,0.3,
four,2015,Park,2.4,0.4,-1.5
five,2016,Park,2.9,0.5,-1.7


## DataFrame에서 행을 선택하고 조작하기
* pandas에서는 DataFrame에서 행을 인덱싱하는 방법이 무수히 많다.
* 물론 위에서 소개했던 열을 선택하는 방법도 수많은 방법 중 하나에 불과하다.

In [4]:
# Positional slice on rows: takes rows 0 through 2 (3 - 1).
# The row at the stop position is excluded.
df[0:3]

Unnamed: 0,year,names,points,penalty
one,2014,Kim,1.5,
two,2015,Kim,1.7,
three,2016,Kim,3.6,


In [29]:
# Label slice on rows: from the row labelled 'two' through the row labelled 'four'.
# Unlike a positional slice, the row at the stop label is included.
df['two':'four'] # works, but not recommended!

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2015,Kim,1.7,0.2,-1.2
three,2016,Kim,3.6,0.3,
four,2015,Park,2.4,0.4,-1.5


### df.loc 사용

In [5]:
# The approach below is the recommended one:
# use the .loc or .iloc indexers.
df.loc['two'] # the result is a Series

year       2015
names       Kim
points      1.7
penalty     NaN
Name: two, dtype: object

In [8]:
# A slice inside [] selects rows by position ...
df[0:1] #ok
# ... but a bare integer inside [] is looked up as a column label. This frame
# has no column named 0, so the line raises KeyError. It is kept commented out
# so the cell (and Run All) completes and displays the slice above.
# df[0] #error

Unnamed: 0,year,names,points,penalty
one,2014,Kim,1.5,


In [12]:
# .loc takes (rows, columns): the label slice 'two'..'four' (both ends
# included) of the 'points' column.
df.loc['two':'four', 'points']

two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [9]:
# .iloc slices by integer position; the stop position (3) is excluded.
df.iloc[1:3]

Unnamed: 0,year,names,points,penalty
two,2015,Kim,1.7,
three,2016,Kim,3.6,


In [34]:
# This cell held the placeholder `df.loc[....]`, which is not valid Python and
# stops a top-to-bottom run. It is filled in with the label-based selection
# that reproduces the recorded output: the 'points' values for rows 'two'
# through 'four'.
df.loc['two':'four', 'points']

Order
two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [13]:
# A bare ':' keeps every row; this selects the whole 'year' column.
df.loc[:,'year'] # == df['year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [14]:
# Every row, with several columns given as a list of labels.
df.loc[:,['year','names']]

Unnamed: 0,year,names
one,2014,Kim
two,2015,Kim
three,2016,Kim
four,2015,Park
five,2016,Park


In [37]:
# Label slices on both axes; both end labels ('five' and 'penalty') are included.
df.loc['three':'five','year':'penalty']

Info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
three,2016,Kim,3.6,0.3
four,2015,Park,2.4,0.4
five,2016,Park,2.9,0.5


In [16]:
# Display the current state of the frame.
df

Unnamed: 0,year,names,points,penalty
one,2014,Kim,1.5,
two,2015,Kim,1.7,
three,2016,Kim,3.6,
four,2015,Park,2.4,
five,2016,Park,2.9,


In [17]:
# Insert a new row: assigning through .loc to a label that is not yet in the
# index appends a row with that label.
# The values are keyed by column name instead of being a positional list, so
# the assignment does not depend on how many columns earlier cells have added
# (a plain 4-item list raises ValueError once `df` has more than 4 columns).
# Any column not named here is left as NaN in the new row.
df.loc['six'] = pd.Series({'year': 2013, 'names': 'Jun', 'points': 4.0, 'penalty': 0.1})
df

Unnamed: 0,year,names,points,penalty
one,2014,Kim,1.5,
two,2015,Kim,1.7,
three,2016,Kim,3.6,
four,2015,Park,2.4,
five,2016,Park,2.9,
six,2013,Jun,4.0,0.1


### df.iloc 사용

In [18]:
# .iloc selects by integer position instead of by label.
df.iloc[3] # the row at position 3 (0-based, i.e. the fourth row, 'four')

year       2015
names      Park
points      2.4
penalty     NaN
Name: four, dtype: object

In [25]:
# Positional slices on both axes: rows 3-4 and columns 0-1 (stop positions excluded).
df.iloc[3:5, 0:2]

Unnamed: 0,year,names
four,2015,Park
five,2016,Park


In [26]:
# Lists of positions pick arbitrary rows (0, 1, 3) and columns (1, 3).
df.iloc[[0,1,3],[1,3]]

Unnamed: 0,names,penalty
one,Kim,
two,Kim,
four,Park,


In [43]:
# This cell held the placeholder `df.iloc[.....]`, which is not valid Python
# and stops a top-to-bottom run. It is filled in with the positional selection
# that reproduces the recorded output: every row, with the columns at
# positions 1 through 3 (names, points, penalty in that output).
df.iloc[:, 1:4]

Info,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,Kim,1.5,0.1
two,Kim,1.7,0.2
three,Kim,3.6,0.3
four,Park,2.4,0.4
five,Park,2.9,0.5
six,Jun,4.0,0.1


In [None]:
# A (row position, column position) pair addresses a single cell of the frame.
df.iloc[3,1]

## DataFrame에서의 Boolean Indexing

In [46]:
import pandas as pd

# Rebuild the example frame from scratch for the boolean-indexing section.
data = dict(
    names=["Kim", "Kim", "Kim", "Park", "Park"],
    year=[2014, 2015, 2016, 2015, 2016],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
)
df = pd.DataFrame(
    data,
    index=["one", "two", "three", "four", "five"],
    columns=["year", "names", "points", "penalty"],
)

# 'penalty' is not a key of `data`; fill it from a plain list (one value per row).
df['penalty'] = [0.1, 0.2, 0.3, 0.4, 0.5]

# 'debt' comes from a Series, so values are placed by index label and the rows
# missing from `val` ('one', 'three') end up as NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df['debt'] = val
df

Unnamed: 0,year,names,points,penalty,debt
one,2014,Kim,1.5,0.1,
two,2015,Kim,1.7,0.2,-1.2
three,2016,Kim,3.6,0.3,
four,2015,Park,2.4,0.4,-1.5
five,2016,Park,2.9,0.5,-1.7


In [47]:
# Boolean Series: True on the rows where 'year' is greater than 2014.
df['year'] > 2014

one      False
two       True
three     True
four      True
five      True
Name: year, dtype: bool

In [28]:
# Passing that boolean mask to .loc keeps every row whose 'year' is greater than 2014.
df.loc[df['year'] > 2014]

Unnamed: 0,year,names,points,penalty
two,2015,Kim,1.7,
three,2016,Kim,3.6,
four,2015,Park,2.4,
five,2016,Park,2.9,


In [30]:
# Row mask plus a column list: 'names' and 'points' for the rows where names == 'Kim'.
df.loc[df['names']=='Kim',['names','points']]

Unnamed: 0,names,points
one,Kim,1.5
two,Kim,1.7
three,Kim,3.6


In [50]:
# Masks can be combined with element-wise logical operators, as in NumPy.
# Each comparison is parenthesised because & binds more tightly than > and <.
df.loc[(df['points']>2)&(df['points']<3),:]

Unnamed: 0,year,names,points,penalty,debt
four,2015,Park,2.4,0.4,-1.5
five,2016,Park,2.9,0.5,-1.7


In [52]:
# A boolean selection can also be assigned to: set 'penalty' to 0 on the
# rows where 'points' is greater than 3.
df.loc[df['points'] > 3, 'penalty'] = 0
df

Unnamed: 0,year,names,points,penalty,debt
one,2014,Kim,1.5,0.1,
two,2015,Kim,1.7,0.2,-1.2
three,2016,Kim,3.6,0.0,
four,2015,Park,2.4,0.4,-1.5
five,2016,Park,2.9,0.5,-1.7
