## Data Retrieval
 - Install the following Python modules
 - html5lib xlrd openpyxl sqlalchemy pymysql pymongo lxml

### CSV and Text files
 - read_csv, read_table, to_csv

In [1]:
import numpy  as np
import pandas as pd

In [2]:
%cat read_write_data/file_01.csv

white,red,blue,green,animal
1,5,2,3,cat
2,7,8,5,dog
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse


In [3]:
df = pd.read_csv('read_write_data/file_01.csv')
df

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [4]:
df.shape

(5, 5)

In [5]:
pd.read_table('read_write_data/file_01.csv')

Unnamed: 0,"white,red,blue,green,animal"
0,"1,5,2,3,cat"
1,"2,7,8,5,dog"
2,"3,3,6,7,horse"
3,"2,2,8,3,duck"
4,"4,4,2,1,mouse"


In [6]:
pd.read_table('read_write_data/file_01.csv', sep=',')

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [7]:
# header

%cat read_write_data/file_02.csv

1,5,2,3,cat
2,7,8,5,dog
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse


In [8]:
pd.read_csv('read_write_data/file_02.csv')

Unnamed: 0,1,5,2,3,cat
0,2,7,8,5,dog
1,3,3,6,7,horse
2,2,2,8,3,duck
3,4,4,2,1,mouse


In [11]:
pd.read_csv('read_write_data/file_02.csv', skiprows=2, header=None)

Unnamed: 0,0,1,2,3,4
0,3,3,6,7,horse
1,2,2,8,3,duck
2,4,4,2,1,mouse


In [12]:
pd.read_csv('read_write_data/file_02.csv', skiprows=2, nrows=1, 
            header=None)

Unnamed: 0,0,1,2,3,4
0,3,3,6,7,horse


In [10]:
pd.read_csv('read_write_data/file_02.csv', 
            names=['white','red','blue','green','animal'])

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [11]:
# Hierarchical structure

%cat read_write_data/file_03.csv

color,status,item1,item2,item3
black,up,3,4,6
black,down,2,6,7
white,up,5,5,5
white,down,3,3,2
white,left,1,2,1
red,up,2,2,2
red,down,1,1,4


In [12]:
df = pd.read_csv('read_write_data/file_03.csv')
df

Unnamed: 0,color,status,item1,item2,item3
0,black,up,3,4,6
1,black,down,2,6,7
2,white,up,5,5,5
3,white,down,3,3,2
4,white,left,1,2,1
5,red,up,2,2,2
6,red,down,1,1,4


In [13]:
df = pd.read_csv('read_write_data/file_03.csv', 
                 index_col=['color','status'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,item1,item2,item3
color,status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
black,up,3,4,6
black,down,2,6,7
white,up,5,5,5
white,down,3,3,2
white,left,1,2,1
red,up,2,2,2
red,down,1,1,4


In [14]:
df.shape

(7, 3)

In [15]:
df.loc['black']

Unnamed: 0_level_0,item1,item2,item3
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
up,3,4,6
down,2,6,7


In [16]:
df.loc['black', 'down']

item1    2
item2    6
item3    7
Name: (black, down), dtype: int64

In [17]:
# spaces or tabs in random order

%cat read_write_data/file_04.txt

white red   blue	green
1 5		2 3
2 7	8  5
  3  3  6   7

In [18]:
pd.read_table('read_write_data/file_04.txt')

Unnamed: 0,white red blue,green
1 5,,2 3
2 7,8 5,
3 3 6 7,,


In [19]:
# Use regular expression for separator

# Raw string: '\s' is not a valid Python string escape, so the regex is
# written as r'\s+' to avoid the invalid-escape-sequence warning.
pd.read_table('read_write_data/file_04.txt', sep=r'\s+',
              engine='python')

Unnamed: 0,white,red,blue,green
0,1,5,2,3
1,2,7,8,5
2,3,3,6,7


In [20]:
%cat read_write_data/file_05.txt

000END123AAA122
001END124BBB321
002END125CCC333

In [21]:
# Extract numerical parts

# Raw string: '\D' is not a valid Python string escape, so the regex is
# written as r'\D+' (one or more non-digit characters act as the separator).
pd.read_table('read_write_data/file_05.txt', sep=r'\D+',
              header=None, engine='python')

Unnamed: 0,0,1,2
0,0,123,122
1,1,124,321
2,2,125,333


In [22]:
%cat read_write_data/file_06.txt

########### LOG FILE ############
This file has been generated by automatic system
white,red,blue,green,animal
12-Feb-2015: Counting of animals inside the house
1,5,2,3,cat
2,7,8,5,dog
13-Feb-2015: Counting of animals outside the house
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse


In [23]:
# Skip lines

pd.read_table('read_write_data/file_06.txt', sep=',',
              skiprows=[0,1,3,6])

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [24]:
pd.read_csv('read_write_data/file_06.txt',
              skiprows=[0,1,3,6])

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [17]:
%cat read_write_data/file_02.csv

1,5,2,3,cat
2,7,8,5,dog
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse


In [16]:
# read only a portion of the file

pd.read_csv('read_write_data/file_02.csv',
              skiprows=2, header=None)


Unnamed: 0,0,1,2,3,4
0,3,3,6,7,horse
1,2,2,8,3,duck
2,4,4,2,1,mouse


In [15]:
pd.read_csv('read_write_data/file_02.csv',
              skiprows=2, nrows=1, header=None)



Unnamed: 0,0,1,2,3,4
0,3,3,6,7,horse


In [14]:
%cat read_write_data/file_01.csv

white,red,blue,green,animal
1,5,2,3,cat
2,7,8,5,dog
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse


In [13]:
# read in chunks

out = []

pieces = pd.read_csv('read_write_data/file_01.csv',chunksize=2)

for piece in pieces:
    print(piece, '\n')
    out.append(piece['red'].sum())
    
print(out)

   white  red  blue  green animal
0      1    5     2      3    cat
1      2    7     8      5    dog 

   white  red  blue  green animal
2      3    3     6      7  horse
3      2    2     8      3   duck 

   white  red  blue  green animal
4      4    4     2      1  mouse 

[12, 5, 4]


In [20]:
# Using List Comprehension

pieces = pd.read_csv('read_write_data/file_01.csv',chunksize=2)

[piece['red'].sum() for piece in pieces]

[12, 5, 4]

### Writing Data in CSV format

In [21]:
frame = pd.DataFrame(
    np.arange(16).reshape((4,4)),
    index=['red', 'blue', 'yellow', 'white'],
    columns=['ball','pen','pencil','paper'])

frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [22]:
frame.to_csv('read_write_data/file_07.csv')

%cat read_write_data/file_07.csv

,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [23]:
frame.to_csv('read_write_data/file_07b.csv',
            header=True, index=False)

%cat read_write_data/file_07b.csv

ball,pen,pencil,paper
0,1,2,3
4,5,6,7
8,9,10,11
12,13,14,15


In [24]:
frame.to_csv('read_write_data/file_07c.csv',
            header=False, index=False)

%cat read_write_data/file_07c.csv

0,1,2,3
4,5,6,7
8,9,10,11
12,13,14,15


In [25]:
frame3 = pd.DataFrame(    # Missing values
    [
        [6,np.nan,np.nan,6,np.nan],
        [8,np.nan,np.nan,np.nan,np.nan],
        [10,np.nan,np.nan,np.nan,np.nan],
        [20,np.nan,np.nan,20.0,np.nan],
        [22,np.nan,np.nan,19.0,np.nan]
    ],
    index=['blue','green','red','white','yellow'],
    columns=['ball','mug','paper','pen','pencil'])

frame3

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6,,,6.0,
green,8,,,,
red,10,,,,
white,20,,,20.0,
yellow,22,,,19.0,


In [26]:
frame3.to_csv('read_write_data/file_08.csv',
            header=True, index=True)

%cat read_write_data/file_08.csv

,ball,mug,paper,pen,pencil
blue,6,,,6.0,
green,8,,,,
red,10,,,,
white,20,,,20.0,
yellow,22,,,19.0,


In [27]:
df = pd.read_csv('read_write_data/file_08.csv', index_col=0)
df

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6,,,6.0,
green,8,,,,
red,10,,,,
white,20,,,20.0,
yellow,22,,,19.0,


In [28]:
df.dropna(axis=1) # 0=rows, 1=columns

Unnamed: 0,ball
blue,6
green,8
red,10
white,20
yellow,22


In [29]:
frame3.to_csv('read_write_data/file_08b.csv', na_rep='Nan',
            header=True, index=True)

%cat read_write_data/file_08b.csv

,ball,mug,paper,pen,pencil
blue,6,Nan,Nan,6.0,Nan
green,8,Nan,Nan,Nan,Nan
red,10,Nan,Nan,Nan,Nan
white,20,Nan,Nan,20.0,Nan
yellow,22,Nan,Nan,19.0,Nan


In [30]:
df = pd.read_csv('read_write_data/file_08b.csv', index_col=0)
df

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6,Nan,Nan,6.0,Nan
green,8,Nan,Nan,Nan,Nan
red,10,Nan,Nan,Nan,Nan
white,20,Nan,Nan,20.0,Nan
yellow,22,Nan,Nan,19.0,Nan


In [31]:
df.dropna(axis=1)

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6,Nan,Nan,6.0,Nan
green,8,Nan,Nan,Nan,Nan
red,10,Nan,Nan,Nan,Nan
white,20,Nan,Nan,20.0,Nan
yellow,22,Nan,Nan,19.0,Nan


In [32]:
df = pd.read_csv('read_write_data/file_08b.csv', index_col=0, na_values='Nan')
df

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6,,,6.0,
green,8,,,,
red,10,,,,
white,20,,,20.0,
yellow,22,,,19.0,


In [33]:
df.dropna(axis=1)

Unnamed: 0,ball
blue,6
green,8
red,10
white,20
yellow,22


### Writing Data to HTML

In [34]:
frame = pd.DataFrame(np.arange(10,14).reshape(2,2))
frame

Unnamed: 0,0,1
0,10,11
1,12,13


In [35]:
print(frame.to_html()) # HTML table representation of data

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
      <th>1</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>10</td>
      <td>11</td>
    </tr>
    <tr>
      <th>1</th>
      <td>12</td>
      <td>13</td>
    </tr>
  </tbody>
</table>


In [37]:
frame = pd.DataFrame( np.random.random((4,4)),
                    index = ['white','black','red','blue'],
                    columns = ['up','down','right','left'])
frame

Unnamed: 0,up,down,right,left
white,0.680023,0.737935,0.812954,0.954666
black,0.089082,0.555954,0.128188,0.079348
red,0.219172,0.687071,0.474565,0.608383
blue,0.588378,0.570997,0.905281,0.393619


In [38]:
# Page fragments in document order; the DataFrame's HTML table is the body.
s = [
    '<HTML>',
    '<HEAD><TITLE>My DataFrame</TITLE></HEAD>',
    '<BODY>',
    frame.to_html(),
    '</BODY></HTML>',
]
# Concatenate the fragments without any separator.
html = ''.join(s)

In [40]:
# The context manager closes the file even if write() raises.
with open('read_write_data/myFrame.html', 'w') as html_file:
    html_file.write(html)

%cat read_write_data/myFrame.html

<HTML><HEAD><TITLE>My DataFrame</TITLE></HEAD><BODY><table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>up</th>
      <th>down</th>
      <th>right</th>
      <th>left</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>white</th>
      <td>0.680023</td>
      <td>0.737935</td>
      <td>0.812954</td>
      <td>0.954666</td>
    </tr>
    <tr>
      <th>black</th>
      <td>0.089082</td>
      <td>0.555954</td>
      <td>0.128188</td>
      <td>0.079348</td>
    </tr>
    <tr>
      <th>red</th>
      <td>0.219172</td>
      <td>0.687071</td>
      <td>0.474565</td>
      <td>0.608383</td>
    </tr>
    <tr>
      <th>blue</th>
      <td>0.588378</td>
      <td>0.570997</td>
      <td>0.905281</td>
      <td>0.393619</td>
    </tr>
  </tbody>
</table></BODY></HTML>

### Reading from HTML

In [2]:
import pandas as pd
frames = pd.read_html('read_write_data/myFrame.html',
              index_col=0,
              flavor='html5lib')
frames[0]

Unnamed: 0,up,down,right,left
white,0.680023,0.737935,0.812954,0.954666
black,0.089082,0.555954,0.128188,0.079348
red,0.219172,0.687071,0.474565,0.608383
blue,0.588378,0.570997,0.905281,0.393619


In [3]:
frames = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html',
                      flavor='html5lib')
len(frames)

1

In [4]:
frames[0]

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Resolute Bank,Maumee,OH,58317,Buckeye State Bank,"October 25, 2019","October 25, 2019"
1,Louisa Community Bank,Louisa,KY,58112,Kentucky Farmers Bank Corporation,"October 25, 2019","October 28, 2019"
2,The Enloe State Bank,Cooper,TX,10716,"Legend Bank, N. A.","May 31, 2019","August 22, 2019"
3,Washington Federal Bank for Savings,Chicago,IL,30570,Royal Savings Bank,"December 15, 2017","July 24, 2019"
4,The Farmers and Merchants State Bank of Argonia,Argonia,KS,17719,Conway Bank,"October 13, 2017","August 12, 2019"
...,...,...,...,...,...,...,...
553,"Superior Bank, FSB",Hinsdale,IL,32646,"Superior Federal, FSB","July 27, 2001","August 19, 2014"
554,Malta National Bank,Malta,OH,6629,North Valley Bank,"May 3, 2001","November 18, 2002"
555,First Alliance Bank & Trust Co.,Manchester,NH,34264,Southern New Hampshire Bank & Trust,"February 2, 2001","February 18, 2003"
556,National State Bank of Metropolis,Metropolis,IL,3815,Banterra Bank of Marion,"December 14, 2000","March 17, 2005"


In [5]:
frames = pd.read_html('https://projects.fivethirtyeight.com/global-club-soccer-rankings/?ex_cid=irpromo',
                     flavor='html5lib')
len(frames)
    

1

In [7]:
frames[0]

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Team rating,Team rating,Team rating
Unnamed: 0_level_1,Rank,1-week change,team,League,League country,off.,def.,spi
0,1,,Man. City,Premier League,England,3.4,0.2,95.4
1,2,1.0,Liverpool,Premier League,England,2.9,0.3,92.9
2,3,-1.0,Bayern Munich,Bundesliga,Germany,3.2,0.4,92.5
3,4,,PSG,Ligue 1,France,2.9,0.4,91.0
4,5,,Barcelona,La Liga,Spain,2.9,0.5,89.5
...,...,...,...,...,...,...,...,...
624,625,3.0,Port Vale,League Two,England,0.2,2.2,7.0
625,626,1.0,Walsall,League Two,England,0.2,2.3,6.4
626,627,-2.0,Macclesfield,League Two,England,0.2,2.3,6.4
627,628,-2.0,Morecambe,League Two,England,0.2,2.5,5.7


In [8]:
frames[0]['Team rating']

Unnamed: 0,off.,def.,spi
0,3.4,0.2,95.4
1,2.9,0.3,92.9
2,3.2,0.4,92.5
3,2.9,0.4,91.0
4,2.9,0.5,89.5
...,...,...,...
624,0.2,2.2,7.0
625,0.2,2.3,6.4
626,0.2,2.3,6.4
627,0.2,2.5,5.7


In [9]:
frames[0].xs("team", level=1, axis=1)

Unnamed: 0,Unnamed: 2_level_0
0,Man. City
1,Liverpool
2,Bayern Munich
3,PSG
4,Barcelona
...,...
624,Port Vale
625,Walsall
626,Macclesfield
627,Morecambe


In [10]:
idx = pd.IndexSlice
idx

frames[0].loc[:, idx[:, 'team']]

Unnamed: 0_level_0,Unnamed: 2_level_0
Unnamed: 0_level_1,team
0,Man. City
1,Liverpool
2,Bayern Munich
3,PSG
4,Barcelona
...,...
624,Port Vale
625,Walsall
626,Macclesfield
627,Morecambe


### JSON Data

In [12]:
import numpy as np
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                    index=['white','black','red','blue'],
                    columns=['up','down','right','left'])
frame

Unnamed: 0,up,down,right,left
white,0,1,2,3
black,4,5,6,7
red,8,9,10,11
blue,12,13,14,15


In [13]:
frame.to_json('read_write_data/frame1.json', orient='columns')
%cat read_write_data/frame1.json

{"up":{"white":0,"black":4,"red":8,"blue":12},"down":{"white":1,"black":5,"red":9,"blue":13},"right":{"white":2,"black":6,"red":10,"blue":14},"left":{"white":3,"black":7,"red":11,"blue":15}}

In [14]:
pd.read_json('read_write_data/frame1.json', orient='columns')

Unnamed: 0,up,down,right,left
white,0,1,2,3
black,4,5,6,7
red,8,9,10,11
blue,12,13,14,15


In [15]:
frame.to_json('read_write_data/frame2.json', orient="index")
%cat read_write_data/frame2.json

{"white":{"up":0,"down":1,"right":2,"left":3},"black":{"up":4,"down":5,"right":6,"left":7},"red":{"up":8,"down":9,"right":10,"left":11},"blue":{"up":12,"down":13,"right":14,"left":15}}

In [16]:
pd.read_json('read_write_data/frame2.json', orient="index")

Unnamed: 0,down,left,right,up
black,5,7,6,4
blue,13,15,14,12
red,9,11,10,8
white,1,3,2,0


In [17]:
frame.to_json('read_write_data/frame3.json', orient="records")
%cat read_write_data/frame3.json

[{"up":0,"down":1,"right":2,"left":3},{"up":4,"down":5,"right":6,"left":7},{"up":8,"down":9,"right":10,"left":11},{"up":12,"down":13,"right":14,"left":15}]

In [18]:
frame.to_json('read_write_data/frame4.json', orient="values")
%cat read_write_data/frame4.json

[[0,1,2,3],[4,5,6,7],[8,9,10,11],[12,13,14,15]]

In [19]:
frame.to_json('read_write_data/frame5.json', orient="split")
%cat read_write_data/frame5.json

{"columns":["up","down","right","left"],"index":["white","black","red","blue"],"data":[[0,1,2,3],[4,5,6,7],[8,9,10,11],[12,13,14,15]]}

In [20]:
# more general json data  -- cool

%cat read_write_data/books.json

[
  {"writer": "Mark Ross",
    "nationality": "USA",
    "books": [
         {"title": "XML Cookbook", "price": 23.56},
         {"title": "Python Fundamentals", "price": 50.70},
         {"title": "The NumPy library", "price": 12.30}
        ] 
   }, 

  {"writer": "Barbara Bracket",
    "nationality": "UK",
    "books": [
         {"title": "Java Enterprise", "price": 28.60},
         {"title": "HTML5", "price": 31.35},
         {"title": "Python for Dummies", "price": 28.00}
        ] 
   }
 ] 


In [21]:
# json_normalize is public at the pandas top level; the pandas.io.json
# location is deprecated. JSON text is parsed with the standard library.
from json import loads
from pandas import json_normalize

In [22]:
# Read and parse the JSON document; the context manager closes the file
# handle, which the manual open() never did.
with open('read_write_data/books.json', 'r') as file:
    text = loads(file.read())
print(text)

[{'writer': 'Mark Ross', 'nationality': 'USA', 'books': [{'title': 'XML Cookbook', 'price': 23.56}, {'title': 'Python Fundamentals', 'price': 50.7}, {'title': 'The NumPy library', 'price': 12.3}]}, {'writer': 'Barbara Bracket', 'nationality': 'UK', 'books': [{'title': 'Java Enterprise', 'price': 28.6}, {'title': 'HTML5', 'price': 31.35}, {'title': 'Python for Dummies', 'price': 28.0}]}]


In [23]:
json_normalize(text, 'books')

Unnamed: 0,title,price
0,XML Cookbook,23.56
1,Python Fundamentals,50.7
2,The NumPy library,12.3
3,Java Enterprise,28.6
4,HTML5,31.35
5,Python for Dummies,28.0


In [24]:
json_normalize(text, 'books', 'writer')

Unnamed: 0,title,price,writer
0,XML Cookbook,23.56,Mark Ross
1,Python Fundamentals,50.7,Mark Ross
2,The NumPy library,12.3,Mark Ross
3,Java Enterprise,28.6,Barbara Bracket
4,HTML5,31.35,Barbara Bracket
5,Python for Dummies,28.0,Barbara Bracket


In [25]:
frame = json_normalize(text, 'books', ['nationality', 'writer'])
frame

Unnamed: 0,title,price,nationality,writer
0,XML Cookbook,23.56,USA,Mark Ross
1,Python Fundamentals,50.7,USA,Mark Ross
2,The NumPy library,12.3,USA,Mark Ross
3,Java Enterprise,28.6,UK,Barbara Bracket
4,HTML5,31.35,UK,Barbara Bracket
5,Python for Dummies,28.0,UK,Barbara Bracket


In [26]:
frame.set_index(['writer', 'nationality'])

Unnamed: 0_level_0,Unnamed: 1_level_0,title,price
writer,nationality,Unnamed: 2_level_1,Unnamed: 3_level_1
Mark Ross,USA,XML Cookbook,23.56
Mark Ross,USA,Python Fundamentals,50.7
Mark Ross,USA,The NumPy library,12.3
Barbara Bracket,UK,Java Enterprise,28.6
Barbara Bracket,UK,HTML5,31.35
Barbara Bracket,UK,Python for Dummies,28.0


### XML Data

In [27]:
%cat read_write_data/books.xml

<?xml version="1.0"?>
<Catalog>
   <Book id="ISBN9872122367564">
      <Author>Ross, Mark</Author>
      <Title>XML Cookbook</Title>
      <Genre>Computer</Genre>
      <Price>23.56</Price>
      <PublishDate>2014-22-01</PublishDate>
   </Book>
   <Book id="ISBN9872122367564">
      <Author>Bracket, Barbara</Author>
      <Title>XML for Dummies</Title>
      <Genre>Computer</Genre>
      <Price>35.95</Price>
      <PublishDate>2014-12-16</PublishDate>
   </Book>
</Catalog>


In [28]:
from lxml import objectify

In [29]:
xml = objectify.parse('read_write_data/books.xml')
xml

<lxml.etree._ElementTree at 0x11a188088>

In [30]:
root = xml.getroot()
root

<Element Catalog at 0x11b0a5708>

In [31]:
root.Book

<Element Book at 0x108f4d488>

In [32]:
root.Book.Author

'Ross, Mark'

In [33]:
root.getchildren()

[<Element Book at 0x108f4d488>, <Element Book at 0x11b0e9d88>]

In [34]:
[book.Author for book in root.getchildren()]

['Ross, Mark', 'Bracket, Barbara']

In [35]:
[child.tag for child in root.Book.getchildren()]  # tags are XML tags (properties of the book)

['Author', 'Title', 'Genre', 'Price', 'PublishDate']

In [36]:
[child.text for child in root.Book.getchildren()]

['Ross, Mark', 'XML Cookbook', 'Computer', '23.56', '2014-22-01']

In [37]:
root.Book.attrib.keys()

['id']

In [38]:
# Column layout is taken from the first child; an empty root gives an empty frame

def etree2df(root):
    """Convert the direct children of an XML element into a DataFrame.

    Each child of ``root`` becomes one row. The columns are the attribute
    names of the first child followed by the tags of the first child's own
    children. For every row, the attribute values and the sub-element texts
    are paired with those column names by position.

    Parameters
    ----------
    root : element object exposing ``getchildren()`` and ``attrib``
        (for example the root returned by ``lxml.objectify``).

    Returns
    -------
    pandas.DataFrame
        One row per child, indexed 0..n-1. An empty DataFrame is returned
        when ``root`` has no children.
    """
    # Fetch the children once instead of on every loop iteration.
    records = root.getchildren()
    if not records:
        return pd.DataFrame()

    first = records[0]
    # list(...) so the result can be extended whatever keys() returns.
    column_names = list(first.attrib.keys())
    column_names += [child.tag for child in first.getchildren()]

    rows = []
    for obj in records:
        texts = list(obj.attrib.values())
        texts += [child.text for child in obj.getchildren()]
        rows.append(dict(zip(column_names, texts)))

    # Build the frame in one step: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame for every added row.
    return pd.DataFrame(rows, columns=column_names)

In [39]:
etree2df(root)  # XML to dataframe

Unnamed: 0,id,Author,Title,Genre,Price,PublishDate
0,ISBN9872122367564,"Ross, Mark",XML Cookbook,Computer,23.56,2014-22-01
1,ISBN9872122367564,"Bracket, Barbara",XML for Dummies,Computer,35.95,2014-12-16


### Excel Data

In [40]:
pd.read_excel('read_write_data/file01_data.xlsx', index_col=0)

Unnamed: 0,white,red,green,black
a,12,23,17,18
b,22,16,19,18
c,14,23,22,21


In [41]:
pd.read_excel('read_write_data/file01_data.xlsx', 'Sheet2', index_col=0) # Sheet2 by name

Unnamed: 0,yellow,purple,blue,orange
A,11,16,44,22
B,20,22,23,44
C,30,31,37,32


In [42]:
# use index - 0, 1, ...

pd.read_excel('read_write_data/file01_data.xlsx', 1, index_col=0) # Sheet2 by index

Unnamed: 0,yellow,purple,blue,orange
A,11,16,44,22
B,20,22,23,44
C,30,31,37,32


In [43]:
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                    index=['white','black','red','blue'],
                    columns=['up','down','right','left'])
frame

Unnamed: 0,up,down,right,left
white,0,1,2,3
black,4,5,6,7
red,8,9,10,11
blue,12,13,14,15


In [44]:
frame.to_excel('read_write_data/file02_data.xlsx')

In [45]:
pd.read_excel('read_write_data/file02_data.xlsx', index_col=0)

Unnamed: 0,up,down,right,left
white,0,1,2,3
black,4,5,6,7
red,8,9,10,11
blue,12,13,14,15


### Pickle - Python Object Serialization

In [46]:
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                    index=['up','down','left','right'])
frame

Unnamed: 0,0,1,2,3
up,0,1,2,3
down,4,5,6,7
left,8,9,10,11
right,12,13,14,15


In [47]:
frame.to_pickle('read_write_data/frame.pkl')

In [48]:
pd.read_pickle('read_write_data/frame.pkl')

Unnamed: 0,0,1,2,3
up,0,1,2,3
down,4,5,6,7
left,8,9,10,11
right,12,13,14,15


### Databases

In [49]:
from sqlalchemy import create_engine
from pandas.io import sql

In [50]:
engine = create_engine('sqlite:///foo.db')

In [51]:
frame = pd.DataFrame(
    np.arange(20).reshape(4,5),
    columns=['white','red','blue','black','green'])

frame

Unnamed: 0,white,red,blue,black,green
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [52]:
# Replace any existing 'colors' table and write the frame in one call;
# if_exists='replace' drops the old table first, so no separate DROP TABLE
# statement is needed.
frame.to_sql('colors', engine, index=False, if_exists='replace')

In [53]:
pd.read_sql('colors', engine)

Unnamed: 0,white,red,blue,black,green
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [54]:
pd.read_sql_query('SELECT white, blue FROM colors', engine)

Unnamed: 0,white,blue
0,0,2
1,5,7
2,10,12
3,15,17


In [55]:
# SQL string literals take single quotes; double quotes denote identifiers
# and are only accepted as strings by a SQLite compatibility fallback.
pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", engine) # what tables exist

Unnamed: 0,name
0,colors


### MongoDB database

In [56]:
from pymongo import MongoClient

In [57]:
# NOTE(review): the user name and password are embedded in this connection
# string; consider loading them from configuration instead of source.
url = 'mongodb://cs602_user:cs602_secret@ds115768.mlab.com:15768/cs602db'

client = MongoClient(url)

In [58]:
db = client.cs602db
db

Database(MongoClient(host=['ds115768.mlab.com:15768'], document_class=dict, tz_aware=False, connect=True), 'cs602db')

In [59]:
collection = db['zipcodes']
collection

Collection(Database(MongoClient(host=['ds115768.mlab.com:15768'], document_class=dict, tz_aware=False, connect=True), 'cs602db'), 'zipcodes')

In [60]:
# Count on the server instead of downloading every document just to
# measure the length of the resulting list.
collection.count_documents({})

29353

In [61]:
# Ask the server for ten documents only, rather than fetching the whole
# collection and slicing it locally.
list(collection.find().limit(10))

[{'_id': '01012',
  'city': 'CHESTERFIELD',
  'loc': [-72.833309, 42.38167],
  'pop': 177,
  'state': 'MA'},
 {'_id': '01010',
  'city': 'BRIMFIELD',
  'loc': [-72.188455, 42.116543],
  'pop': 3706,
  'state': 'MA'},
 {'_id': '01020',
  'city': 'CHICOPEE',
  'loc': [-72.576142, 42.176443],
  'pop': 31495,
  'state': 'MA'},
 {'_id': '01013',
  'city': 'CHICOPEE',
  'loc': [-72.607962, 42.162046],
  'pop': 23396,
  'state': 'MA'},
 {'_id': '01007',
  'city': 'BELCHERTOWN',
  'loc': [-72.410953, 42.275103],
  'pop': 10579,
  'state': 'MA'},
 {'_id': '01011',
  'city': 'CHESTER',
  'loc': [-72.988761, 42.279421],
  'pop': 1688,
  'state': 'MA'},
 {'_id': '01026',
  'city': 'CUMMINGTON',
  'loc': [-72.905767, 42.435296],
  'pop': 1484,
  'state': 'MA'},
 {'_id': '01028',
  'city': 'EAST LONGMEADOW',
  'loc': [-72.505565, 42.067203],
  'pop': 13367,
  'state': 'MA'},
 {'_id': '01027',
  'city': 'MOUNT TOM',
  'loc': [-72.679921, 42.264319],
  'pop': 16864,
  'state': 'MA'},
 {'_id': '01022',

In [64]:
zipcodes = pd.DataFrame(list(collection.find()), columns=['state', 'city', '_id', 'loc', 'pop'])

print(zipcodes.head())
zipcodes = zipcodes.set_index(['state', 'city'])

zipcodes.loc['MA'].loc['BOSTON']

  state          city    _id                      loc    pop
0    MA  CHESTERFIELD  01012   [-72.833309, 42.38167]    177
1    MA     BRIMFIELD  01010  [-72.188455, 42.116543]   3706
2    MA      CHICOPEE  01020  [-72.576142, 42.176443]  31495
3    MA      CHICOPEE  01013  [-72.607962, 42.162046]  23396
4    MA   BELCHERTOWN  01007  [-72.410953, 42.275103]  10579


Unnamed: 0_level_0,_id,loc,pop
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BOSTON,2108,"[-71.068432, 42.357603]",3697
BOSTON,2109,"[-71.053386, 42.362963]",3926
BOSTON,2111,"[-71.0629, 42.350348]",3759
BOSTON,2115,"[-71.092215, 42.342706]",25597
BOSTON,2110,"[-71.051417, 42.357636]",957
BOSTON,2113,"[-71.055958, 42.365656]",6698
BOSTON,2114,"[-71.06823, 42.361111]",10246
BOSTON,2116,"[-71.076798, 42.349201]",17459
BOSTON,2199,"[-71.082543, 42.347873]",886
BOSTON,2210,"[-71.046511, 42.348921]",308


In [63]:
zipcodes.loc[('MA','BOSTON')]

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,_id,loc,pop
state,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MA,BOSTON,2108,"[-71.068432, 42.357603]",3697
MA,BOSTON,2109,"[-71.053386, 42.362963]",3926
MA,BOSTON,2111,"[-71.0629, 42.350348]",3759
MA,BOSTON,2115,"[-71.092215, 42.342706]",25597
MA,BOSTON,2110,"[-71.051417, 42.357636]",957
MA,BOSTON,2113,"[-71.055958, 42.365656]",6698
MA,BOSTON,2114,"[-71.06823, 42.361111]",10246
MA,BOSTON,2116,"[-71.076798, 42.349201]",17459
MA,BOSTON,2199,"[-71.082543, 42.347873]",886
MA,BOSTON,2210,"[-71.046511, 42.348921]",308
