## Task
Explore I/O in pandas

## Notebook summary
* CSV - `read_csv`, `to_csv`, `from_csv`
* JSON - `loads`, `dumps`, `read_json`
* HDF5 - read/write using PyTables
* Binary - pickle files
* DB - import data from DB; requires SQL modules

## References
* *Python for Data Analysis*, Wes McKinney, O'Reilly, 2012
* *Numerical Python*, Robert Johansson, APress, 2015
* *Python Data Science Handbook*, Jake VanderPlas, O'Reilly, 2016


In [32]:
# display output from all cmds just like Python shell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import platform
print 'python.version = ', platform.python_version()
import IPython
print 'ipython.version =', IPython.version_info

import pandas as pd
print 'pandas.version = ', pd.__version__
from pandas import Series, DataFrame

import sys
import json
import tables
import pandas.io.sql as sql
import sqlite3


python.version =  2.7.10
ipython.version = (5, 1, 0, '')
pandas.version =  0.19.2


In [15]:
# read_csv - import data from CSV file

print '\n----- Data in input file'
!cat sample_data.csv

print '\n----- read_csv with default options'
pd.read_csv('sample_data.csv')

print '\n----- read_csv with header=None'
pd.read_csv('sample_data.csv', header=None)

print '\n----- read_csv with custom header names'
pd.read_csv('sample_data.csv', names=['H1','H2','H3','H4'])

print '\n----- read_csv with row names in col 0'
pd.read_csv('sample_data.csv', index_col=0)

print '\n----- read_csv with row names in col 1'
pd.read_csv('sample_data.csv', index_col=1)

print '\n----- read_csv with row names in col 0, skip first 2 rows'
pd.read_csv('sample_data.csv', index_col=0, skiprows=[2])

print '\n----- read in chunks'
pd.read_csv('sample_data.csv', index_col=0, nrows=2)
part = pd.read_csv('sample_data.csv', index_col=0, chunksize=2)
print 'part = ', part

for i, p in enumerate(part):
    print 'Part ', i
    p
    
# Note: 
# Both Series and DataFrame have a from_csv() function that reads data from CSV file into the Series or DataFrame.
# It's use is discouraged in favor of read_csv()



----- Data in input file
,Col1,Col2,Col3
Row1,Val11,Val12,Val13
Row2,Val21,Val22,Val23
Row3,Val31,Val32,Val33

----- read_csv with default options


Unnamed: 0.1,Unnamed: 0,Col1,Col2,Col3
0,Row1,Val11,Val12,Val13
1,Row2,Val21,Val22,Val23
2,Row3,Val31,Val32,Val33



----- read_csv with header=None


Unnamed: 0,0,1,2,3
0,,Col1,Col2,Col3
1,Row1,Val11,Val12,Val13
2,Row2,Val21,Val22,Val23
3,Row3,Val31,Val32,Val33



----- read_csv with custom header names


Unnamed: 0,H1,H2,H3,H4
0,,Col1,Col2,Col3
1,Row1,Val11,Val12,Val13
2,Row2,Val21,Val22,Val23
3,Row3,Val31,Val32,Val33



----- read_csv with row names in col 0


Unnamed: 0,Col1,Col2,Col3
Row1,Val11,Val12,Val13
Row2,Val21,Val22,Val23
Row3,Val31,Val32,Val33



----- read_csv with row names in col 1


Unnamed: 0_level_0,Unnamed: 0,Col2,Col3
Col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Val11,Row1,Val12,Val13
Val21,Row2,Val22,Val23
Val31,Row3,Val32,Val33



----- read_csv with row names in col 0, skip first 2 rows


Unnamed: 0,Col1,Col2,Col3
Row1,Val11,Val12,Val13
Row3,Val31,Val32,Val33



----- read in chunks


Unnamed: 0,Col1,Col2,Col3
Row1,Val11,Val12,Val13
Row2,Val21,Val22,Val23


part =  <pandas.io.parsers.TextFileReader object at 0x10ee3a310>
Part  0


Unnamed: 0,Col1,Col2,Col3
Row1,Val11,Val12,Val13
Row2,Val21,Val22,Val23


Part  1


Unnamed: 0,Col1,Col2,Col3
Row3,Val31,Val32,Val33


In [14]:
# to_csv

print '\n----- Write df to CSV file'
pd.read_csv('sample_data.csv').to_csv('out.csv')
! cat out.csv
print '---'

print '\n----- Write df to stdout'
pd.read_csv('sample_data.csv').to_csv(sys.stdout)
print '---'

print '\n----- Write df to stdout w/o row names & header'
pd.read_csv('sample_data.csv').to_csv(sys.stdout, index=False, header=False)
print '---' 

print '\n----- Write only Col1 to stdout w/o row names'
pd.read_csv('sample_data.csv').to_csv(sys.stdout, index=False, columns=['Col1'])


# Save to binary format
pd.read_csv('sample_data.csv').to_pickle('pickled.out')



----- Write df to CSV file
,Unnamed: 0,Col1,Col2,Col3
0,Row1,Val11,Val12,Val13
1,Row2,Val21,Val22,Val23
2,Row3,Val31,Val32,Val33
---

----- Write df to stdout
,Unnamed: 0,Col1,Col2,Col3
0,Row1,Val11,Val12,Val13
1,Row2,Val21,Val22,Val23
2,Row3,Val31,Val32,Val33
---

----- Write df to stdout w/o row names & header
Row1,Val11,Val12,Val13
Row2,Val21,Val22,Val23
Row3,Val31,Val32,Val33
---

----- Write only Col1 to stdout w/o row names
Col1
Val11
Val21
Val31


In [59]:
# loads, read_json, to_json

myjson = """
[{
"name": "MyName",
"age": 99,
"city": "MyCity",
"country": "MyCountry"
},
{
"name": "YourName",
"age": 100,
"city": "YourTown",
"country": "YourRepublic"
}
]
"""

print '----- import JSON using DataFrame(json.loads)'
json.loads(myjson)
DataFrame(json.loads(myjson), index=['Me', 'You']) 

print '\n----- Import JSON using pd.read_json'
df = pd.read_json(myjson, typ='frame')
df

print '\n----- convert DataFrame to JSON'
df.to_json()

print '\n----- Import JSON time series into pd.Series object'

myjson_ts="""
[
{"2016-01-01": 1.0},
{"2016-01-02": 2.1},
{"2016-01-03": 3.2},
{"2016-01-04": 4.3},
{"2016-01-05": 5.4}
]"""

# using json.loads
Series(json.loads(myjson_ts))
    
# using read_json
pd.read_json(myjson_ts, typ='series')
# ToDo: convert JSON keys into Series index


----- import JSON using DataFrame(json.loads)


[{u'age': 99,
  u'city': u'MyCity',
  u'country': u'MyCountry',
  u'name': u'MyName'},
 {u'age': 100,
  u'city': u'YourTown',
  u'country': u'YourRepublic',
  u'name': u'YourName'}]

Unnamed: 0,age,city,country,name
Me,99,MyCity,MyCountry,MyName
You,100,YourTown,YourRepublic,YourName



----- Import JSON using pd.read_json


Unnamed: 0,age,city,country,name
0,99,MyCity,MyCountry,MyName
1,100,YourTown,YourRepublic,YourName



----- convert DataFrame to JSON


'{"age":{"0":99,"1":100},"city":{"0":"MyCity","1":"YourTown"},"country":{"0":"MyCountry","1":"YourRepublic"},"name":{"0":"MyName","1":"YourName"}}'


----- Import JSON time series into pd.Series object


0    {u'2016-01-01': 1.0}
1    {u'2016-01-02': 2.1}
2    {u'2016-01-03': 3.2}
3    {u'2016-01-04': 4.3}
4    {u'2016-01-05': 5.4}
dtype: object

0    {u'2016-01-01': 1.0}
1    {u'2016-01-02': 2.1}
2    {u'2016-01-03': 3.2}
3    {u'2016-01-04': 4.3}
4    {u'2016-01-05': 5.4}
dtype: object

In [70]:
# HDF5 - pandas uses PyTables module to read/write HDF5

print '----- Empty HDF5 file'
myHDF5Store = pd.HDFStore('MyData.h5')
myHDF5Store

print '\n----- HDF5 file with items'
myHDF5Store['s'] = Series(range(5))
myHDF5Store['df'] = DataFrame(json.loads(myjson), index=['Me', 'You'])

myHDF5Store

myHDF5Store['s']
myHDF5Store['df']

del myHDF5Store['s']
myHDF5Store

myHDF5Store.close()


----- Empty HDF5 file


<class 'pandas.io.pytables.HDFStore'>
File path: MyData.h5
/df                frame        (shape->[2,2])
/item1             frame        (shape->[2,2])
/mylist            series       (shape->[5])  


----- HDF5 file with items


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->axis0] [items->None]

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block0_items] [items->None]

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block1_items] [items->None]



<class 'pandas.io.pytables.HDFStore'>
File path: MyData.h5
/df                frame        (shape->[2,2])
/item1             frame        (shape->[2,2])
/mylist            series       (shape->[5])  
/s                 series       (shape->[5])  

0    0
1    1
2    2
3    3
4    4
dtype: int64

Unnamed: 0,age,city,country,name
Me,99,MyCity,MyCountry,MyName
You,100,YourTown,YourRepublic,YourName


<class 'pandas.io.pytables.HDFStore'>
File path: MyData.h5
/df                frame        (shape->[2,2])
/item1             frame        (shape->[2,2])
/mylist            series       (shape->[5])  

In [75]:
# read_hdf, to_hdf

df = pd.read_hdf('MyData.h5', 'df')
df

df.to_hdf('MyData.h5', 'df/again')

pd.HDFStore('MyData.h5')


Unnamed: 0,age,city,country,name
Me,99,MyCity,MyCountry,MyName
You,100,YourTown,YourRepublic,YourName


<class 'pandas.io.pytables.HDFStore'>
File path: MyData.h5
/df                  frame        (shape->[2,2])
/df/again            frame        (shape->[2,2])
/item1               frame        (shape->[2,2])
/mylist              series       (shape->[5])  

In [80]:
# pickle

s = Series(range(6), index=['A','B','C','D','E','F'])
s
s.to_pickle('MySeries.pkl')

s2 = pd.read_pickle('MySeries.pkl')
s2


A    0
B    1
C    2
D    3
E    4
F    5
dtype: int64

A    0
B    1
C    2
D    3
E    4
F    5
dtype: int64

In [90]:
# SQL 

# Create DB with SQLite
query = """
CREATE TABLE MyTable (
Col1 INT,
Col2 VARCHAR(50),
Col3 FLOAT
);
"""

conn = sqlite3.connect(":memory:")
conn.execute(query)
conn.commit


# Load data to DB
data = [
    (1, 'This is Row 1', 3.14),
    (2, 'This is Row 2', 4.15),
    (3, 'This is Row 3', 5.16),
    (4, 'This is Row 4', 6.17)
]
statement = "INSERT INTO MyTable VALUES(?,?,?)"
conn.executemany(statement, data)
conn.commit()


# Get data from this DB
cursor = conn.execute('Select * from MyTable')
print 'cursor description:'
cursor.description

colnames = zip(*cursor.description)[0]
colnames

rows = cursor.fetchall()
rows

DataFrame(rows, columns=colnames)

print '\n----- Alternately, get DataFrame from SQL using read_sql_query()'
# Using pandas function requires only a single statement
df = sql.read_sql_query('SELECT * from MyTable', conn)
type(df)
df


<sqlite3.Cursor at 0x111659ea0>

<function commit>

<sqlite3.Cursor at 0x11169f2d0>

cursor description:


(('Col1', None, None, None, None, None, None),
 ('Col2', None, None, None, None, None, None),
 ('Col3', None, None, None, None, None, None))

('Col1', 'Col2', 'Col3')

[(1, u'This is Row 1', 3.14),
 (2, u'This is Row 2', 4.15),
 (3, u'This is Row 3', 5.16),
 (4, u'This is Row 4', 6.17)]

Unnamed: 0,Col1,Col2,Col3
0,1,This is Row 1,3.14
1,2,This is Row 2,4.15
2,3,This is Row 3,5.16
3,4,This is Row 4,6.17



----- Alternately, get DataFrame from SQL using read_sql_query()


pandas.core.frame.DataFrame

Unnamed: 0,Col1,Col2,Col3
0,1,This is Row 1,3.14
1,2,This is Row 2,4.15
2,3,This is Row 3,5.16
3,4,This is Row 4,6.17
