In [10]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# 데이터 로딩, 저장, 파일 형식

In [478]:
data = pd.read_csv('pydata/ch06/ex1.csv')
data

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [483]:
pd.read_csv('pydata/ch06/ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2.0,3,4,hello
1,5,,7,8,world
2,9,10.0,11,12,foo


In [481]:
pd.read_csv('pydata/ch06/ex2.csv', 
           names=['a','b','c','d','message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [484]:
pd.read_csv('pydata/ch06/ex2.csv', 
           names=['a','b','c','d','message'],
           index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2.0,3,4
world,5,,7,8
foo,9,10.0,11,12


계층 색인

In [486]:
pd.read_csv('pydata/ch06/csv_mindex.csv',
           index_col=['key1','key2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


구분자가 잘 설정되지 않은 파일 읽기

In [491]:
# The separator is a regular expression (one or more whitespace characters).
# It is written as a raw string so that `\s` reaches pandas untouched; in a
# normal string literal `\s` is an invalid escape sequence that newer Python
# versions warn about.
pd.read_csv('pydata/ch06/ex3.txt', sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


특정 행 제외하고 읽어오기

In [500]:
def get_skip_rows(filename, pattern):
    """Find the lines of a text file that start with a given prefix.

    Parameters
        filename : path of the file to scan.
        pattern : prefix to look for; forwarded to ``str.startswith``.

    Returns: list of 0-based line numbers whose text starts with
        ``pattern``, in file order (empty list when nothing matches).
    """
    # Iterate over the file object directly so lines are streamed one at a
    # time instead of loading the whole file into memory with readlines().
    with open(filename) as f:
        return [lineno for lineno, line in enumerate(f)
                if line.startswith(pattern)]

get_skip_rows('pydata/ch06/ex4.csv', '#')

[0, 2, 3]

In [502]:
# Collect the 0-based numbers of the lines that start with '#' and hand them
# to read_csv through `skiprows`.
filename = 'pydata/ch06/ex4.csv'
skiprows = get_skip_rows(filename, '#')
pd.read_csv(filename, skiprows=skiprows)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


누락된 데이터나 이상한 문자

In [503]:
df = pd.read_csv('pydata/ch06/ex5.csv')
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [504]:
df.isnull()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [505]:
df = pd.read_csv('pydata/ch06/ex5.csv', na_values=['NULL'])
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [508]:
# na_values given as a dict: a separate list of missing-value markers per
# column ('foo' / 'NA' for `message`, 'two' for `something`).
pd.read_csv('pydata/ch06/ex5.csv',
              na_values={'message': ['foo', 'NA'],
                        'something': ['two']})

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


텍스트 파일 일부 읽기

In [509]:
pd.read_csv('pydata/ch06/ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [512]:
# Read the file in pieces of 1000 rows instead of loading it all at once.
chunks = pd.read_csv('pydata/ch06/ex6.csv', chunksize=1000)

# Running total of how often each `key` value occurs. The dtype is stated
# explicitly so the accumulator does not depend on whatever dtype pandas
# picks for an empty Series.
tot = Series([], dtype='float64')
for piece in chunks:
    # fill_value=0 treats a key missing on one side as a count of zero
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

# Series.order() no longer exists in pandas; sort_values() is its replacement.
tot.sort_values(ascending=False)[:10]

E    368
X    364
L    346
O    343
Q    340
M    338
J    337
F    335
K    334
H    330
dtype: float64

## 데이터 텍스트 출력

In [513]:
data = pd.read_csv('pydata/ch06/ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [514]:
data.to_csv('out.csv')

In [516]:
data.to_csv('out.csv', sep=':')

In [517]:
data.to_csv('out.csv', na_rep='NULL')

In [519]:
# Append to the existing file instead of overwriting it
data.to_csv('out.csv', mode='a')

In [521]:
import sys
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [524]:
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


In [528]:
# Seven consecutive daily timestamps starting at 2015-07-16.
date = pd.date_range('2015-07-16', periods=7)
# header=False is passed explicitly: the default for Series.to_csv differs
# between pandas versions, and the cells that read this file back expect
# plain data rows with no header line.
Series(np.arange(7), index=date).to_csv('tseries.csv', header=False)

In [541]:
# Read the file back without a header row, use column 0 as the index, then
# rebuild a Series from the values of column 1.
df = pd.read_csv('tseries.csv',header=None).set_index([0])
Series(df[1].values, index=df.index)

0
2015-07-16    0
2015-07-17    1
2015-07-18    2
2015-07-19    3
2015-07-20    4
2015-07-21    5
2015-07-22    6
dtype: int64

In [555]:
# Series.from_csv has been removed from pandas. The equivalent is read_csv
# with column 0 as a parsed date index, then selecting the single remaining
# data column as a Series.
ts = pd.read_csv('tseries.csv', header=None, index_col=0,
                 parse_dates=True).iloc[:, 0]
# Drop the auto-generated labels so the result is an unnamed Series with an
# unnamed index, as from_csv used to return.
ts.index.name = ts.name = None
ts

2015-07-16    0
2015-07-17    1
2015-07-18    2
2015-07-19    3
2015-07-20    4
2015-07-21    5
2015-07-22    6
dtype: int64

In [556]:
ts.index

<class 'pandas.tseries.index.DatetimeIndex'>
[2015-07-16, ..., 2015-07-22]
Length: 7, Freq: None, Timezone: None

In [551]:
Series.from_csv?

## JSON

In [563]:
json_str = """
{"names": "Wes",
"places_lived": ["USA", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
        {"name": "Katie", "age": 33, "pet": "Cisco"}]
}"""

In [564]:
import json
# Convert the JSON string into a Python dict object
json_dict = json.loads(json_str)
json_dict

{u'names': u'Wes',
 u'pet': None,
 u'places_lived': [u'USA', u'Spain', u'Germany'],
 u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'},
  {u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}

In [565]:
json.dumps(json_dict)

'{"pet": null, "siblings": [{"pet": "Zuko", "age": 25, "name": "Scott"}, {"pet": "Cisco", "age": 33, "name": "Katie"}], "names": "Wes", "places_lived": ["USA", "Spain", "Germany"]}'

# 데이터 합치기

In [260]:
df1 = DataFrame({'key': ['b','b','a','c','a','a','b'],
                'data1': range(7)})
df2 = DataFrame({'key':['a','b','d'],
                'data2': range(3)})

In [261]:
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


In [262]:
pd.merge(df1, df2)

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [263]:
pd.merge(df2, df1)

Unnamed: 0,data2,key,data1
0,0,a,2
1,0,a,4
2,0,a,5
3,1,b,0
4,1,b,1
5,1,b,6


In [265]:
df3 = DataFrame({'lkey': ['b','b','a','c','a','a','b'],
                'data1': range(7)})
df4 = DataFrame({'rkey':['a','b','d'],
                'data2': range(3)})

In [266]:
df3

Unnamed: 0,data1,lkey
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [267]:
df4

Unnamed: 0,data2,rkey
0,0,a
1,1,b
2,2,d


In [268]:
# df3 and df4 share no column name, so merge has no key to join on and raises
# MergeError. The error is the point of this cell: catch it and print the
# message so the notebook still runs from top to bottom.
try:
    pd.merge(df3, df4)
except pd.errors.MergeError as err:
    print('MergeError: {}'.format(err))

MergeError: No common columns to perform merge on

In [269]:
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,data1,lkey,data2,rkey
0,0,b,1,b
1,1,b,1,b
2,6,b,1,b
3,2,a,0,a
4,4,a,0,a
5,5,a,0,a


## 축 따라 이어붙이기

In [270]:
s1 = Series([0,1], index=['a','b'])
s2 = Series([2,3,4], index=['c','d', 'e'])
s3 = Series([5,6], index=['f','g'])

In [271]:
s1

a    0
b    1
dtype: int64

In [272]:
s2

c    2
d    3
e    4
dtype: int64

In [273]:
s3

f    5
g    6
dtype: int64

In [274]:
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [275]:
df1 = DataFrame(np.arange(6).reshape(3,2),
               index=['a','b','c'], columns=['one', 'two'])
df2 = DataFrame(np.arange(4).reshape(2,2),
               index=['a','c'], columns=['three', 'four'])

In [276]:
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [277]:
df2

Unnamed: 0,three,four
a,0,1
c,2,3


In [280]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [281]:
# Two float Series over the same labels 'f'..'a':
# `a` contains gaps (NaN), `b` holds 0.0 .. 5.0.
a = Series(
    [np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
    index=list('fedcba'),
)
b = Series(
    np.arange(len(a), dtype=np.float64),
    index=list('fedcba'),
)

In [282]:
a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [286]:
b

f     0
e     1
d     2
c     3
b     4
a   NaN
dtype: float64

In [285]:
# Blank out the last element by position. .iloc states the positional intent
# explicitly; plain b[-1] on a label-indexed Series depends on a deprecated
# fallback from label lookup to position lookup.
b.iloc[-1] = np.nan

In [289]:
# At each position: where a is missing, take b's value at that position; otherwise keep a's value
np.where(pd.isnull(a), b, a)

array([ 0. ,  2.5,  2. ,  3.5,  4.5,  nan])

In [288]:
pd.isnull(a)

f     True
e    False
d     True
c    False
b    False
a     True
dtype: bool

# 데이터베이스 입출력