# Getting Started with pandas

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))

In [4]:
class display(object):
    """Show several objects together, each labelled with its expression.

    Every positional argument is a *string* holding a Python expression
    (typically a variable name such as ``'df'``).  The expression is
    evaluated when the object is rendered, and the result is shown under
    the expression text.

    NOTE: the expressions are passed to ``eval`` against this module's
    globals, so only pass trusted, hand-written strings.
    NOTE(review): this class name shadows IPython's own ``display`` helper
    in the notebook namespace -- confirm that is intended.
    """
    # HTML wrapper for one entry: {0} is the expression label, {1} its HTML.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Only the expression strings are stored; evaluation happens lazily
        # each time a representation is requested.
        self.args = args

    def _render_html(self, expr):
        """Return the HTML fragment for a single expression string."""
        from html import escape

        # Evaluate against the module globals only, so that this method's
        # own local names can never shadow a user variable of the same name.
        obj = eval(expr, globals())
        to_html = getattr(obj, '_repr_html_', None)
        body = to_html() if callable(to_html) else None
        if body is None:
            # No rich HTML representation: fall back to the escaped repr.
            body = '<pre>{0}</pre>'.format(escape(repr(obj)))
        # Escape the label so expressions containing '<' or '&' stay valid HTML.
        return self.template.format(escape(expr, quote=False), body)

    def _repr_html_(self):
        """Return the HTML fragments of all expressions, newline-separated."""
        return '\n'.join(self._render_html(a) for a in self.args)

    def __repr__(self):
        """Return 'expression' + newline + repr(value) for each expression."""
        return '\n\n'.join(a + '\n' + repr(eval(a, globals()))
                           for a in self.args)

## Introduction to pandas Data Structures

### Series

In [None]:
# Series 생성
obj = pd.Series([4, 7, -5, 3], index=['a','b','c','d'])
obj

In [None]:
# Series values 와 index 조회
obj.values, obj.index

In [None]:
# Null 조회
obj.isnull(), obj.notnull()

In [None]:
# Series의 name 과 index name 설정
obj.name = 'test name'
obj.index.name = 'ind'
obj

### DataFrame

In [None]:
# Dict를이용한 DataFrame 생성
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
df = pd.DataFrame(data)
df

In [None]:
df.head(), df.tail()

In [None]:
# Dict data를 이용하여 DataFrame을 만들고 컬럼의 순서를 재조정
pd.DataFrame(data, columns=['year', 'state', 'pop'])

In [None]:
# Dict 및 List를 이용한 DataFrame 생성
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

df2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
df2

In [None]:
# Deleting a DataFrame column
# Be deliberate about df.copy(): df2_1 is an independent copy here,
# so the `del` below removes 'eastern' from df2_1 only.
df2['eastern'] = df2.state == 'Ohio'
df2_1 = df2.copy()                   # the result differs depending on whether copy() is used
del df2_1['eastern']
display('df2', 'df2_1')


In [None]:
df2

In [None]:
# Dict를 이용해 컬럼 및 인덱스를 포함하여 DataFrame 생성
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
df3 = pd.DataFrame(pop)
df3

In [None]:
# DataFrame의 index와 columns 조회
df2.index, df2.columns

In [None]:
# DataFrame의 index 이름과 columns 이름 설정
df3.index.name = 'year'
df3.columns.name = 'state'
df3

In [None]:
# DataFrame 행과 열 전환
display("df2", "df2.T")

In [None]:
# Accessing DataFrame data - four ways:
# column by key, column by attribute, row by label (loc), row by position (iloc)
df2['state'], df2.year, df2.loc['three'], df2.iloc[0]

### Index Object

In [None]:
# Series index 처리
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index
index[1:]

In [None]:
# pd.Index()를 이용한 인덱스 Object 생성
labels = pd.Index(np.arange(3))
print(labels)
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
print(obj2)
obj2.index is labels

In [None]:
# index, columns 활용
print('Ohio' in df3.columns)
print(2003 in df3.index)
df3

In [None]:
# index의 중복 허용
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

### ReIndexing

In [None]:
# Series ReIndexing - index 순서 재조정, 없는경우 NaN
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

In [None]:
# pd.reindex() - method='ffill'
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
obj3.reindex(range(6), method='ffill')

In [None]:
# DataFrame reindex() - index 재조정
df = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
df2 = df.reindex(['a', 'b', 'c', 'd'])
display('df', 'df2')

In [None]:
# DataFrame reindex() - columns 재조정
states = ['Texas', 'Utah', 'California']
df.reindex(columns=states)

### Dropping Entries from an Axis

In [None]:
# Series Data Drop
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
new_obj.drop(['e', 'd'])

In [None]:
# DataFrame 행(row, index) 삭제
df = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

df1 = df.drop(['Colorado', 'Ohio'])
display('df', 'df1')

In [None]:
# DataFrame 열(columns) 삭제
df1 = df.drop('two', axis=1)
display('df', 'df1')

In [None]:
df1 = df.drop(['two', 'four'], axis='columns')
display('df', 'df1')

In [None]:
# Modifying the original DataFrame directly with drop(..., inplace=True)
df1 = df.copy()
df.drop('one', axis=1)                  # result is discarded, so df keeps column 'one'
df1.drop('one', axis=1, inplace=True)   # removes column 'one' from df1 itself
display('df', 'df1')

### Indexing, Selection, and Filtering

In [None]:
# Selecting from a Series
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj['b'])                 # single label
# Positions are selected through .iloc: plain obj[1] on a label index relies
# on a positional fallback that pandas has deprecated.
print(obj.iloc[1])              # single position
print(obj[2:4])                 # positional slice
print(obj[['b', 'a', 'd']])     # list of labels
print(obj.iloc[[1, 3]])         # list of positions
print(obj[obj < 2])             # boolean mask

In [None]:
# DataFrame 조회
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
print(data)
print('~'*40)
print(data['two'])
print('~'*40)
print(data[['three', 'one']])
print('~'*40)
print(data[0:2])
print('~'*40)
print(data[data['three'] > 5])

### Selection with loc and iloc

In [None]:
print(data)
print('~'*40)
print(data.loc['Colorado', ['two', 'three']])
print('~'*40)
print(data.iloc[2, [3, 0, 1]])
print('~'*40)
print(data.iloc[2])
print('~'*40)
print(data.iloc[[1, 2], [3, 0, 1]])


In [None]:
print(data.loc[:'Utah', 'two'])
print('~'*40)
print(data.iloc[:, :3][data.three > 5])

### Integer Indexes

In [None]:
# When labels and integer positions coincide, [] is designed to look the
# value up as a label first (original author's note).
ser = pd.Series(np.arange(3.))
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])  # created here but not used in this cell

print(ser[:1])
print('~'*40)
print(ser.loc[:1])   # label-based indexing
print('~'*40)
print(ser.iloc[:1])  # integer (position-based) indexing

### Arithmetic and Data Alignment(산술연산과 데이터 정렬)

In [None]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
print(s1)
print('~'*40)
print(s2)
print('~'*40)
print(s1+s2)


In [None]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df3 = df1 + df2
display('df1', 'df2', 'df3')

In [None]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df3 = df1 - df2
display('df1','df2','df3')

### Arithmetic methods with fill values : 산술 연산 메서드에 채워 넣을 값 지정하기

In [None]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
display('df1','df2')

In [None]:
df1 + df2

In [None]:
df1.add(df2, fill_value=0)

In [None]:
1/df1

In [None]:
print(df1)
df1.rdiv(1)

In [None]:
print(df1)
print('~'*40)
print(df1.sub(1))
print('~'*40)
print(df1.rsub(1))

In [None]:
df1.reindex(columns=df2.columns, fill_value=0)

### Operations between DataFrame and Series

In [None]:
# Index가 동일한 경우 산술 연산
df = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = df.iloc[0]
df - series

In [None]:
df

In [None]:
series

In [None]:
# Index가 다른 경우 산술 연산
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
df + series2

In [None]:
series3 = df['d']
series3
df.sub(series3, axis='index')

In [None]:
df

In [None]:
series3

In [None]:

df.sub(series3)

In [None]:
# The axis argument is easy to get confused by: axis='index' matches series3
# against the row labels of df.
df.sub(series3, axis='index')

### Function Application and Mapping

In [None]:
df = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df

In [None]:
# np.abs : 절대값
np.abs(df)

In [None]:
f = lambda x: x.max() - x.min()
df.apply(f)

In [None]:
df.apply(f, axis='columns')

In [None]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

df_1 = df.apply(f)
df_2 = df.apply(f, axis=1)
display('df','df_1','df_2')

In [None]:
df

In [None]:
# Applying a function element by element
# applymap() is used on a DataFrame, map() is used on a Series
# NOTE(review): the name `format` shadows the built-in format(); a later cell
# reuses it (df['e'].map(format)), so renaming it must be done in both places.
format = lambda x: '%.2f' % x
df_1 = df.applymap(format)
display('df','df_1')

In [None]:
df['e'].map(format)

### Sorting and Ranking

In [None]:
# sort_index() - index를 기준으로 sort
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

In [None]:
df = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
df_1 = df.sort_index()
df_2 = df.sort_index(axis=1)
df_3 = df.sort_index(axis=1, ascending=False)
display('df','df_1','df_2','df_3')

In [None]:
# Series - sort_values()
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

In [None]:
df = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
df_1 = df.sort_values(by='b')
df_2 = df.sort_values(by=['a', 'b'])
display('df','df_1','df_2')



### Ranking
**rank()** - ***특정변수를 기준으로 순위를 구함***

#### 동점 처리 방법(tie-breaking methods)
1. 평균(method='average'), default
2. 최소값(method='min')
3. 최대값(method='max')
4. 첫번째값(method='first')
5. 조밀하게(method='dense')

In [None]:
# rank() - 특정변수를 기준으로 순위를 구함
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

In [None]:
obj.rank(method='min')

In [None]:
# 높은 값일 수록 1순위 - rank()-ascending
obj.rank(ascending=False, method='min')

In [None]:
df = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
df_1 = df.rank(method='first')
df_2 = df.rank(method='first', axis='columns')
display('df','df_1','df_2')

### Axis Indexes with Duplicate Labels

In [None]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj.index.is_unique)
print('~'*40)
print(obj['a'])
print('~'*40)
print(obj['c'])

In [None]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df.loc['b']

## Summarizing and Computing Descriptive Statistics

In [None]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
print(df)
print('~'*40)
print(df.sum())
print('~'*40)
print(df.sum(axis='columns'))
print('~'*40)

# 최대값을 가지는 index 출력
# Series는 index, DataFrame은 index, columns
print(df.idxmax())
print('~'*40)
print(df.cumsum())

In [None]:
## Correlation and Covariance

In [None]:
price = pd.read_pickle('examples/yahoo_price.pkl')
volume = pd.read_pickle('examples/yahoo_volume.pkl')

In [None]:
# Compute returns
# pct_change() gives (current price - previous price) / previous price,
# i.e. a fraction; multiply by 100 to express it as a percentage.
returns = price.pct_change()

In [None]:
# Series.corr() - correlation between two series
# method = 'pearson'
#        = 'kendall'
#        = 'spearman'
returns['MSFT'].corr(returns['IBM'])

In [None]:
returns['MSFT'].cov(returns['IBM'])

In [None]:
import seaborn as sns 
returns.corr()

In [None]:
# 공분산(Covariance)
# 2개의 확률변수의 상관정도를 나타내는 값
returns['MSFT'].cov(returns['IBM'])


In [None]:
# 지정한 변수와 모든 변수간 상관계수
returns.corrwith(returns.IBM)

In [None]:
returns.corrwith(volume)

In [None]:
returns

In [None]:
volume

### Unique Values, Value Counts, and Membership

In [None]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj.unique())
print(obj.value_counts())

In [None]:
obj
mask = obj.isin(['b', 'c'])
mask
obj[mask]

In [None]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

In [None]:
# value_counts() on a DataFrame
df = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
# Count the values of every column with Series.value_counts (the top-level
# pd.value_counts helper is deprecated); values missing from a column come
# back as NaN and are replaced by 0.
df_1 = df.apply(pd.Series.value_counts).fillna(0)
display('df', 'df_1')

# Data Loading, Storage and File Formats

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

## Reading and Writing Data in Text Format

In [10]:
# Read CSV file to Dataframe
df = pd.read_csv('examples/ex1.csv')
df_1 = pd.read_table('examples/ex1.csv', sep=',')
display('df','df_1')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [12]:
df_1 = pd.read_csv('examples/ex2.csv', header=None)
df_2 = pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd','message'])
display('df_1','df_2')

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [13]:
# 컬럼명을 지정하고 한개 컬럼을 index로 지정
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('examples/ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [15]:
# Use two columns (key1, key2) as the index
df = pd.read_csv('examples/csv_mindex.csv',
                index_col=['key1', 'key2'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [20]:
# Read the raw lines inside a context manager so the file handle is closed
# as soon as the lines have been collected.
with open('examples/ex3.txt') as fh:
    raw_lines = list(fh)
raw_lines

['            A         B         C\n',
 '\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 '\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 '\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 '\n',
 'ddd -0.871858 -0.348382  1.100491\n']

In [21]:
# When the file is not comma-separated, pass the field delimiter via `sep`.
# For a variable amount of whitespace use the regular expression \s+
# (written as a raw string so the backslash is not an invalid string escape).

pd.read_table('examples/ex3.txt', sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [25]:
# 자료 파일중에 건너뛰어야 할 행이 있다면 skiprows 인수 사용
pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [54]:
result = pd.read_csv('examples/ex5.csv')
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [55]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [56]:
# Specifying which values are read as NaN
# Values pandas recognises as NaN by default (per the original note): NaN, NULL, NA
result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [57]:
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


### Reading Text in Pieces

In [58]:
# pandas options - 출력 max 갯수
pd.options.display.max_rows = 10

In [6]:
df = pd.read_csv('examples/ex6.csv')
df_1 = pd.read_csv('examples/ex6.csv', nrows=5)  # limit the number of rows loaded
# Expression strings are also used as labels, so keep them free of stray spaces.
display('df', 'df_1')

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
...,...,...,...,...,...
9995,2.311896,-0.417070,-1.409599,-0.515821,L
9996,-0.479893,-0.650419,0.745152,-0.646038,E
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [8]:
# Iterate over the file in pieces of 1000 rows
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)

# Running total of the 'key' counts. The dtype is given explicitly because an
# empty Series without one warns on older pandas and is object-typed on newer.
tot = pd.Series([], dtype='float64')
for piece in chunker:
    # fill_value=0 keeps keys that are missing from one side of the addition
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)
tot[:10]

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

### Writing Data to Text Format

In [9]:
data = pd.read_csv('examples/ex5.csv')
data.to_csv('examples/out.csv')

In [15]:
import sys
data.to_csv(sys.stdout, sep='|')
print('~'*40)
data.to_csv(sys.stdout, na_rep='NULL')
print('~'*40)
data.to_csv(sys.stdout, index=False, header=False, na_rep='NaN')
print('~'*40)
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])



|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
one,1,2,3.0,4,NaN
two,5,6,NaN,8,world
three,9,10,11.0,12,foo
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
a,b,c
1,2,3.0
5,6,
9,10,11.0


In [17]:
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('examples/tseries.csv')

### Working with Delimited Formats

In [19]:
import csv

# Open the file in a `with` block so it is closed once every row is printed.
with open('examples/ex7.csv') as f:
    reader = csv.reader(f)

    for line in reader:
        print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [20]:
# Parse the whole file into a list of rows
with open('examples/ex7.csv') as f:
    lines = list(csv.reader(f))

# The first row holds the column names, the remaining rows the data
header = lines[0]
values = lines[1:]

# zip(*values) turns the rows into columns; pair each column with its name
data_dict = dict(zip(header, zip(*values)))
data_dict


{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

### JSON Data

In [25]:
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

In [28]:
import json
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [35]:
asjson = json.dumps(result)
asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}'

In [36]:
siblings = pd.DataFrame(result['siblings'], columns=['name','age'])
siblings

Unnamed: 0,name,age
0,Scott,30
1,Katie,38


In [37]:
data = pd.read_json('examples/example.json')
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [38]:
print(data.to_json())
print(data.to_json(orient='records'))

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


### XML and HTML: Web Scraping

In [39]:
tables = pd.read_html('examples/fdic_failed_bank_list.html')
len(tables)
failures = tables[0]
failures.head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


In [42]:
close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()

2010    157
2009    140
2011     92
2012     51
2008     25
2013     24
2014     18
2002     11
2015      8
2016      5
2001      4
2004      4
2003      3
2007      3
2000      2
Name: Closing Date, dtype: int64

In [None]:
close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()

### Parsing XML with lxml.objectify

In [43]:
from lxml import objectify

path = 'datasets/mta_perf/Performance_MNR.xml'
# Parse inside a `with` block so the file handle is released after parsing.
with open(path) as f:
    parsed = objectify.parse(f)
root = parsed.getroot()

In [45]:
data = []

# Tags that are left out of the collected records
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']

for elt in root.INDICATOR:
    # Build one {tag: value} record per element. iterchildren() is used
    # instead of getchildren(), which lxml marks as deprecated.
    el_data = {child.tag: child.pyval
               for child in elt.iterchildren()
               if child.tag not in skip_fields}
    data.append(el_data)

In [46]:
perf = pd.DataFrame(data)
perf.head()

Unnamed: 0,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,INDICATOR_UNIT,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,1,Service Indicators,M,%,95.0,96.9,95.0,96.9
1,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,2,Service Indicators,M,%,95.0,96.0,95.0,95.0
2,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,3,Service Indicators,M,%,95.0,96.3,95.0,96.9
3,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,4,Service Indicators,M,%,95.0,96.8,95.0,98.3
4,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,5,Service Indicators,M,%,95.0,96.6,95.0,95.8


In [48]:
from io import StringIO
tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()

In [49]:
root
root.get('href')
root.text

'Google'

### Binary Data Formats

In [51]:
frame = pd.read_csv('examples/ex1.csv')
frame
frame.to_pickle('examples/frame_pickle')

pd.read_pickle('examples/frame_pickle')


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### Using HDF5 format

In [52]:
frame = pd.DataFrame({'a': np.random.randn(100)})
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

In [55]:
store['obj1']

Unnamed: 0,a
0,-0.204708
1,0.478943
2,-0.519439
3,-0.555730
4,1.965781
...,...
95,0.795253
96,0.118110
97,-0.748532
98,0.584970


In [56]:
store.put('obj2', frame, format='table')
store.select('obj2', where=['index >= 10 and index <= 15'])
store.close()

In [57]:
# Pass the key by keyword: positional arguments after the path are deprecated
# in recent pandas releases.
frame.to_hdf('mydata.h5', key='obj3', format='table')
pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])

Unnamed: 0,a
0,-0.204708
1,0.478943
2,-0.519439
3,-0.55573
4,1.965781


### Reading Microsoft Excel Files

In [60]:
# Use ExcelFile as a context manager so the workbook is closed after reading.
with pd.ExcelFile('examples/ex1.xlsx') as xlsx:
    sheet1 = pd.read_excel(xlsx, 'Sheet1')
sheet1

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [61]:
df = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
df

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [62]:
# ExcelWriter as a context manager writes and closes the file on exit;
# the explicit writer.save() call no longer exists in pandas 2.x.
with pd.ExcelWriter('examples/ex2.xlsx') as writer:
    frame.to_excel(writer, sheet_name='Sheet1')

In [63]:
df.to_excel('examples/ex2.xlsx')


### Interacting with Web APIs

In [66]:
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
# Always pass a timeout: without one the request can block indefinitely.
resp = requests.get(url, timeout=10)
# Fail here with a clear HTTP error (e.g. rate limiting) instead of a
# confusing failure when the JSON payload is indexed in a later cell.
resp.raise_for_status()
resp

<Response [200]>

In [67]:
data = resp.json()
data[0]['title']

'ENH: Add support for more placeholders in `guess_datetime_format`'

In [None]:
issues = pd.DataFrame(data, columns=['number', 'title',
                                     'labels', 'state'])
issues

### Interacting with Databases

In [69]:
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""
con = sqlite3.connect('mydata.sqlite')
con.execute(query)
con.commit()

In [70]:
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

In [71]:
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [72]:
cursor.description
pd.DataFrame(rows, columns=[x[0] for x in cursor.description])

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [73]:
import sqlalchemy as sqla
db = sqla.create_engine('sqlite:///mydata.sqlite')
pd.read_sql('select * from test', db)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [74]:
!rm mydata.sqlite

rm: cannot remove 'mydata.sqlite': Device or resource busy
