## Task
More exploration of pandas basics

## Notebook summary
* Reading/writing text files (CSV, JSON)
* Reading/writing binary files
* Getting data from DB

## References
* *Python for Data Analysis*, Wes McKinney, O'Reilly, 2012
* *Numerical Python*, Robert Johansson, APress, 2015


In [1]:
# display output from all cmds just like Python shell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import platform
print 'python.version = ', platform.python_version()
import IPython
print 'ipython.version =', IPython.version_info

import pandas as pd
print 'pandas.version = ', pd.__version__
from pandas import Series, DataFrame

import sys
import json
import tables
import pandas.io.sql as sql
import sqlite3


python.version =  2.7.10
ipython.version = (5, 1, 0, '')
pandas.version =  0.19.1


In [22]:
# Import data from CSV file
!cat sample_data.csv

pd.read_csv('sample_data.csv')
pd.read_csv('sample_data.csv', header=None)
pd.read_csv('sample_data.csv', names=['H1','H2','H3','H4'])
pd.read_csv('sample_data.csv', index_col=0)
pd.read_csv('sample_data.csv', index_col=1)
pd.read_csv('sample_data.csv', index_col=0, skiprows=[2])


pd.read_csv('sample_data.csv', index_col=0, nrows=2)
part = pd.read_csv('sample_data.csv', index_col=0, chunksize=2)
part

for i, p in enumerate(part):
    print 'Part ', i
    p
    
# See also from_csv to read Series data from file


,Col1,Col2,Col3
Row1,Val11,Val12,Val13
Row2,Val21,Val22,Val23
Row3,Val31,Val32,Val33


Unnamed: 0.1,Unnamed: 0,Col1,Col2,Col3
0,Row1,Val11,Val12,Val13
1,Row2,Val21,Val22,Val23
2,Row3,Val31,Val32,Val33


Unnamed: 0,0,1,2,3
0,,Col1,Col2,Col3
1,Row1,Val11,Val12,Val13
2,Row2,Val21,Val22,Val23
3,Row3,Val31,Val32,Val33


Unnamed: 0,H1,H2,H3,H4
0,,Col1,Col2,Col3
1,Row1,Val11,Val12,Val13
2,Row2,Val21,Val22,Val23
3,Row3,Val31,Val32,Val33


Unnamed: 0,Col1,Col2,Col3
Row1,Val11,Val12,Val13
Row2,Val21,Val22,Val23
Row3,Val31,Val32,Val33


Unnamed: 0_level_0,Unnamed: 0,Col2,Col3
Col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Val11,Row1,Val12,Val13
Val21,Row2,Val22,Val23
Val31,Row3,Val32,Val33


Unnamed: 0,Col1,Col2,Col3
Row1,Val11,Val12,Val13
Row3,Val31,Val32,Val33


Unnamed: 0,Col1,Col2,Col3
Row1,Val11,Val12,Val13
Row2,Val21,Val22,Val23


<pandas.io.parsers.TextFileReader at 0x10fab8d10>

Part  0


Unnamed: 0,Col1,Col2,Col3
Row1,Val11,Val12,Val13
Row2,Val21,Val22,Val23


Part  1


Unnamed: 0,Col1,Col2,Col3
Row3,Val31,Val32,Val33


In [21]:
pd.read_csv('sample_data.csv').to_csv('out.csv')
! cat out.csv
print '---'

pd.read_csv('sample_data.csv').to_csv(sys.stdout)
print '---'

pd.read_csv('sample_data.csv').to_csv(sys.stdout, index=False, header=False)
print '---' 


pd.read_csv('sample_data.csv').to_csv(sys.stdout, index=False, columns=['Col1'])


# Save to binary format
pd.read_csv('sample_data.csv').to_pickle('pickled.out')


,Unnamed: 0,Col1,Col2,Col3
0,Row1,Val11,Val12,Val13
1,Row2,Val21,Val22,Val23
2,Row3,Val31,Val32,Val33
---
,Unnamed: 0,Col1,Col2,Col3
0,Row1,Val11,Val12,Val13
1,Row2,Val21,Val22,Val23
2,Row3,Val31,Val32,Val33
---
Row1,Val11,Val12,Val13
Row2,Val21,Val22,Val23
Row3,Val31,Val32,Val33
---
Col1
Val11
Val21
Val31


In [19]:
# JSON

myjson = """
{
"name": "MyName",
"age": 99,
"city": "MyCity",
"country": "MyCountry"
}
"""

json.loads(myjson)
json.dumps(json.loads(myjson))

DataFrame(json.loads(myjson), index=['Me'])


{u'age': 99, u'city': u'MyCity', u'country': u'MyCountry', u'name': u'MyName'}

'{"city": "MyCity", "age": 99, "name": "MyName", "country": "MyCountry"}'

Unnamed: 0,age,city,country,name
Me,99,MyCity,MyCountry,MyName


In [29]:
# HDF5

myHDF5Store = pd.HDFStore('MyData.h5')
myHDF5Store['item1'] = DataFrame(json.loads(myjson), index=['Me'])
myHDF5Store

myHDF5Store['item1']


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->axis0] [items->None]

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block0_items] [items->None]

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block1_items] [items->None]



<class 'pandas.io.pytables.HDFStore'>
File path: MyData.h5
/item1            frame        (shape->[1,2])

Unnamed: 0,age,city,country,name
Me,99,MyCity,MyCountry,MyName


In [67]:
# SQL 

# Create DB with SQLite

query = """
CREATE TABLE MyTable (
Col1 INT,
Col2 VARCHAR(50),
Col3 FLOAT
);
"""

conn = sqlite3.connect(":memory:")
conn.execute(query)
conn.commit

data = [
    (1, 'This is Row 1', 3.14),
    (2, 'This is Row 2', 4.15),
    (3, 'This is Row 3', 5.16),
    (4, 'This is Row 4', 6.17)
]
statement = "INSERT INTO MyTable VALUES(?,?,?)"
conn.executemany(statement, data)
conn.commit()


# Get data from this DB

cursor = conn.execute('Select * from MyTable')
cursor.description
cursor.description
colnames = zip(*cursor.description)[0]
colnames

rows = cursor.fetchall()
rows

DataFrame(rows, columns=colnames)

print '---'
# Using pandas function requires only a single statement
sql.read_sql_query('SELECT * from MyTable', conn)


<sqlite3.Cursor at 0x109585960>

<function commit>

<sqlite3.Cursor at 0x109585810>

(('Col1', None, None, None, None, None, None),
 ('Col2', None, None, None, None, None, None),
 ('Col3', None, None, None, None, None, None))

(('Col1', None, None, None, None, None, None),
 ('Col2', None, None, None, None, None, None),
 ('Col3', None, None, None, None, None, None))

('Col1', 'Col2', 'Col3')

[(1, u'This is Row 1', 3.14),
 (2, u'This is Row 2', 4.15),
 (3, u'This is Row 3', 5.16),
 (4, u'This is Row 4', 6.17)]

Unnamed: 0,Col1,Col2,Col3
0,1,This is Row 1,3.14
1,2,This is Row 2,4.15
2,3,This is Row 3,5.16
3,4,This is Row 4,6.17


---


Unnamed: 0,Col1,Col2,Col3
0,1,This is Row 1,3.14
1,2,This is Row 2,4.15
2,3,This is Row 3,5.16
3,4,This is Row 4,6.17
