# Pandas :
인덱스를 가진 자료형인 데이터프레임을 파이썬으로 구현한 라이브러리
- series : 시계열
- Dataframe : 복수필드 시계열 및 테이블 데이터
- index : label and name

# Series:
명시적인 index를 가지지 않는 series

In [10]:
import pandas as pd
import numpy as np

In [2]:
# Build a Series from a plain list. No `index` argument is given, so pandas
# assigns the default integer index 0..n-1.
s = pd.Series([4, 7, -5, 3])
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the `print s` statement is a SyntaxError on Python 3.
print(s)

0    4
1    7
2   -5
3    3
dtype: int64


In [5]:
# Inspect the two parts of the Series: the underlying data array and the
# index object, together with their types.
# print(...) with one argument works on both Python 2 and Python 3.
print(s.values)
print(type(s.values))
print(s.index)
print(type(s.index))

[ 4  7 -5  3]
<type 'numpy.ndarray'>
RangeIndex(start=0, stop=4, step=1)
<class 'pandas.indexes.range.RangeIndex'>


- Vectorized Operation

In [11]:
# Vectorized operations: arithmetic and NumPy ufuncs are applied element by
# element and the result keeps the index of `s`.
# print(...) with one argument works on both Python 2 and Python 3.
print(s * 2)
print("=" * 50)
print(np.exp(s))

0     8
1    14
2   -10
3     6
dtype: int64
0      54.598150
1    1096.633158
2       0.006738
3      20.085537
dtype: float64


# 명시적인 Index를 가지는 Series
- Index 인수로 인덱스 설정
- Index 원소는 데이터들에 대한 key 역할을 하는 Label이다
- 딕셔너리(dict)처럼 Label을 key로 사용하여 값에 접근할 수 있다

In [12]:
# Series with an explicit index: each label in `index` becomes the key of the
# value at the same position.
s2 = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])
# print(...) with one argument works on both Python 2 and Python 3.
print(s2)

d    4
b    7
a   -5
c    3
dtype: int64


In [13]:
s2.index

Index([u'd', u'b', u'a', u'c'], dtype='object')

# Series Indexing 1 : Label Indexing
- Single Label
- Label Slicing (마지막 원소포함)
- Label을 원소로 가지는 List (List Fancy Indexing) - 주어진 순서대로 재배열

In [17]:
# Label-based indexing on s2.
# print(...) with one argument works on both Python 2 and Python 3.

# Single label -> scalar value.
print(s2['a'])
print("=" * 50)

# Label slice: unlike integer slices, the end label ("c") is included.
print(s2["b":"c"])
print("=" * 50)

# List of labels: rows come back in the order the labels are listed.
print(s2[["a", "b"]])

-5
b    7
a   -5
c    3
dtype: int64
a   -5
b    7
dtype: int64


# Series Indexing 2 : Integer Indexing
- single integer
- Integer Slicing (마지막 원소 포함 안함)
- Integer list indexing (list fancy indexing)
- Boolean fancy indexing

In [21]:
# Position-based (integer) indexing on the label-indexed Series s2.
# `.iloc` is used for the positional lookups: plain `s2[2]` / `s2[[2, 1]]`
# rely on a positional fallback that newer pandas versions deprecate, while
# `.iloc` states the intent explicitly and returns the same values.
# print(...) with one argument works on both Python 2 and Python 3.

# Single position -> scalar value.
print(s2.iloc[2])
print("=" * 50)

# Integer slice: the end position (4) is excluded.
print(s2.iloc[1:4])
print("=" * 50)

# List of positions: rows come back in the listed order.
print(s2.iloc[[2, 1]])
print("=" * 50)

# Boolean mask: keeps only the rows where the condition is True.
print(s2[s2 > 0])

-5
b    7
a   -5
c    3
dtype: int64
a   -5
b    7
dtype: int64
d    4
b    7
c    3
dtype: int64


# dict 연산

In [22]:
"a" in s2, "e" in s2

(True, False)

In [24]:
# Iterate over (label, value) pairs, like dict.items().
# Series.items() replaces Series.iteritems(), which was deprecated and then
# removed from pandas.
for k, v in s2.items():
    # Print the pair as one tuple so the output is a tuple on both
    # Python 2 and Python 3.
    print((k, v))

('d', 4)
('b', 7)
('a', -5)
('c', 3)


In [26]:
# Label slice from "d" through "c"; the end label is included, so this covers
# every row of s2. print(...) works on both Python 2 and Python 3.
print(s2["d":"c"])

d    4
b    7
a   -5
c    3
dtype: int64


# dict 데이터를 이용한 Series 생성

In [28]:
# Build a Series from a dict: the keys become the index labels and the dict
# values become the data.
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
s3 = pd.Series(sdata)
# print(...) with one argument works on both Python 2 and Python 3.
print(s3)

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64


In [31]:
# Build a Series from the dict but with an explicit list of labels.
# 'California' is not a key of sdata, so its value is NaN. 'Utah' is not in
# `states`, so it is left out of s4.
states = ['California', 'Ohio', 'Oregon', 'Texas']
s4 = pd.Series(sdata, index=states)
# print(...) with one argument works on both Python 2 and Python 3.
print(s4)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [33]:
# Missing-value masks for s4, via the top-level functions and via the
# equivalent Series methods.
# print(...) with one argument works on both Python 2 and Python 3.
print(pd.isnull(s4))
print(pd.notnull(s4))
print(s4.isnull())
print(s4.notnull())

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool


# Index 기준 연산

In [37]:
# Show both raw NumPy arrays as one tuple. The double parentheses keep the
# tuple output on Python 3 as well as Python 2.
print((s3.values, s4.values))
# Adding the raw arrays is a plain positional sum: element i is added to
# element i and the index labels are ignored. Compare with `s3 + s4`, which
# aligns on the labels.
print(s3.values + s4.values)

(array([35000, 16000, 71000,  5000], dtype=int64), array([    nan,  35000.,  16000.,  71000.]))
[    nan  51000.  87000.  76000.]


In [36]:
s3+s4 

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

# Index 이름

In [39]:
# Give the Series itself a name; it appears as "Name: population" in the
# printed footer.
s4.name = "population"
# print(...) with one argument works on both Python 2 and Python 3.
print(s4)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


In [40]:
# Name the index; the name is printed as a header line above the labels.
s4.index.name = "state"
# print(...) with one argument works on both Python 2 and Python 3.
print(s4)

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


# Index 변경

In [44]:
s.index  # index of the Series `s` built in the earlier exercise


RangeIndex(start=0, stop=4, step=1)

In [47]:
# Replace the whole index in place by assigning a list of new labels, one per
# existing row.
s.index = ["Bob", "Steve", "Jeff", "Ryan"]
# print(...) with one argument works on both Python 2 and Python 3.
print(s.index)
print(s)

Index([u'Bob', u'Steve', u'Jeff', u'Ryan'], dtype='object')
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64


- 연습 문제 1
   + 1. 명시적 Index가 없는 Series
   + 2. 명시적 Index가 있는 Series

In [55]:
# Exercise 1: a Series without an explicit index (default 0..n-1 labels).
practice = pd.Series([2, 10, 5, 3])
# print(...) with one argument works on both Python 2 and Python 3.
print(practice)

print("=" * 50)

# Exercise 2: a Series with an explicit label index.
practice1 = pd.Series([1, 2, 3, 4], index=["you", "are", "so", "beautiful"])
print(practice1)

0     2
1    10
2     5
3     3
dtype: int64
you          1
are          2
so           3
beautiful    4
dtype: int64


# DataFrame
- Multi-Series
- 2차원 행렬
- Numpy Array와의 차이점

In [60]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a
# column and the rows get the default integer index.
data = {
    'state': ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada"],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
df = pd.DataFrame(data)
# print(...) with one argument works on both Python 2 and Python 3.
print(df)

   pop   state  year
0  1.5    Ohio  2000
1  1.7    Ohio  2001
2  3.6    Ohio  2002
3  2.4  Nevada  2001
4  2.9  Nevada  2002


In [65]:
pd.DataFrame(data, columns=["year","state","pop"])  # shown as a table, with the columns in the order listed in `columns`


Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9

In [66]:
df.dtypes

pop      float64
state     object
year       int64
dtype: object

# 명시적인 Column/Row Index를 가지는 DataFrame

In [74]:
# `columns` sets the column order and `index` sets the row labels.
# 'debt' is not a key of `data`, so that column is created empty (NaN).
df2 = pd.DataFrame(data,
                 columns=['year','state','pop','debt'],
                 index=['one','two','three','four','five'])
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


# Single Column Access

In [75]:
df["state"]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

In [69]:
# Selecting a single column returns a Series; attribute access (`df.state`)
# selects the same column as `df["state"]`.
# print(...) with one argument works on both Python 2 and Python 3.
print(type(df["state"]))
print(df.state)

<class 'pandas.core.series.Series'>
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object


# Column Data Update

In [76]:
df2['debt'] = 16.5
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [77]:
# Assigning a Series aligns on the index labels: rows 'two', 'four' and
# 'five' receive the values, and rows whose label is missing from the Series
# become NaN.
df2['debt'] = pd.Series([-1.2, -1.5, -1.7],index=['two','four','five'])
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


# Add Column

In [83]:
df2['eastern'] = df2.state == "Ohio"
df2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


# Delete Column 

In [84]:
del df2['eastern']
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


# inplace option
- 함수/메소드는 두 가지 종류
    - inplace=True -> 출력 None, 객체 자체 변경
    - inplace=False -> 변형된 새로운 객체 출력, 객체 자체 보존

In [91]:
# Plain-Python illustration of the two styles behind pandas' `inplace` option.
x = [3, 6, 1, 4]
# sorted() returns a new sorted list and leaves x untouched (like
# inplace=False).
print(sorted(x))
# list.sort() reorders x itself and returns None, so this prints None (like
# inplace=True).
print(x.sort())

[1, 3, 4, 6]
None


# Drop 메소드를 이용한 Row/Column 삭제
- del 문(statement)
   - inplace 연산
- drop 메소드
   - 삭제가 반영된 새 시리즈/데이터프레임 반환 (원본은 보존)
   - 시리즈는 행 삭제
   - 데이터프레임은 axis 인수로 행/열 선택 (0: 행 / 1: 열)

In [92]:
# Five consecutive floats 0.0..4.0, labelled 'a' through 'e'.
row_values = np.arange(5, dtype=float)
s = pd.Series(row_values, index=list("abcde"))
del row_values  # keep the notebook namespace the same as before
s

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [93]:
s2 = s.drop('c')
s2

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [95]:
s  # s itself is unchanged: drop() returned a new Series instead of modifying s

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [96]:
s.drop(["b",'c'])

a    0.0
d    3.0
e    4.0
dtype: float64

In [99]:
# 4x4 frame holding the integers 0..15 row by row, with state names as row
# labels and 'one'..'four' as column labels.
df = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [101]:
df.drop(['Colorado','Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [103]:
df.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [104]:
df.drop(['two','four'],axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


# Nested dict를 사용한 DataFrame 생성

In [106]:
# Nested mapping: the outer keys are state names and each inner dict maps a
# year to that state's value. 'Nevada' has no entry for 2000.
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)

In [108]:
# Outer dict keys become the columns and inner keys become the row index.
# Nevada has no entry for 2000, so that cell is NaN.
df3 = pd.DataFrame(pop)
df3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


# Series dict를 사용한 DataFrame 생성

In [113]:
pdata = {
    'Ohio':df3["Ohio"][:-1],
    'Nevada':df3['Nevada'][:2]
}
pd.DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


# NumPy array로 변환


In [114]:
df3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [115]:
df2.values

array([[2000L, 'Ohio', 1.5, nan],
       [2001L, 'Ohio', 1.7, -1.2],
       [2002L, 'Ohio', 3.6, nan],
       [2001L, 'Nevada', 2.4, -1.5],
       [2002L, 'Nevada', 2.9, -1.7]], dtype=object)

# DataFrame의 Column Indexing
- Single Label(작은 범주들의 변수명 이름) key
- Single Label Attribute
- Label List Fancy Indexing

In [117]:
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [121]:
# Extract only the 'year' column from df2, first by key and then by attribute
# access; both select the same column.
# print(...) with one argument works on both Python 2 and Python 3.
print(df2["year"])
print("=" * 50)

print(df2.year)

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64


In [123]:
df2[["state","debt","year"]]

Unnamed: 0,state,debt,year
one,Ohio,,2000
two,Ohio,-1.2,2001
three,Ohio,,2002
four,Nevada,-1.5,2001
five,Nevada,-1.7,2002
