## Data loading , storage and file formats

In [1]:
# Importing libraries 

import numpy as np
import pandas as pd

In [2]:
# Reading CSV file
pd.read_csv('example-1.csv') # auto header is detected.

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
pd.read_csv('example-2.csv', header=None) # No header is available

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
pd.read_csv('example-2.csv',names=['a','b','c','d','message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [8]:
names=['a','b','c','d','message']

In [9]:
pd.read_csv('example-2.csv', names=names, index_col="message")

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [10]:
pd.read_csv('example-3.csv', index_col=['key1','key2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [11]:
pd.read_csv('example-3.csv')

Unnamed: 0,key1,key2,value1,value2
0,one,a,1,2
1,one,b,3,4
2,one,c,5,6
3,one,d,7,8
4,two,a,9,10
5,two,b,11,12
6,two,c,13,14
7,two,d,15,16


In [12]:
# reading txt file with space as separater

pd.read_csv('example-4.txt', sep='\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [14]:
# skiping rows while reading data
pd.read_csv('example-5.csv', skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [15]:
# NA is data
result = pd.read_csv('example-6.csv')
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [16]:
pd.isna(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [17]:
result = pd.read_csv('example-6.csv', na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [18]:
result2 = pd.read_csv('example-6.csv', keep_default_na=False)
result2

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [19]:
result2.isna()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False


In [20]:
# reading files in chunks

pd.read_csv('example-7.csv', nrows=10)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.81748,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.35848,-0.497572,-0.367016,0.507702,S
9,-1.740877,-1.160417,-1.63783,2.172201,G


In [21]:
chunker = pd.read_csv('example-7.csv', chunksize=100)
chunker

<pandas.io.parsers.readers.TextFileReader at 0x24da9b98a60>

In [22]:
total = pd.Series([])
for chunk in chunker:
    total = total.add(chunk["key"].value_counts(), fill_value =0)


total.sort_values(ascending=False)


key
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
V    328.0
I    327.0
U    326.0
P    324.0
D    320.0
A    320.0
R    318.0
Y    314.0
G    308.0
S    308.0
N    306.0
W    305.0
T    304.0
B    302.0
Z    288.0
C    286.0
4    171.0
6    166.0
7    164.0
8    162.0
3    162.0
5    157.0
2    152.0
0    151.0
9    150.0
1    146.0
dtype: object

## Writing data to text format

In [23]:
data = pd.read_csv('example-6.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [24]:
data.to_csv('out.csv')

In [27]:
import sys
data.to_csv(sys.stdout, sep="|")

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [29]:
data.to_csv(sys.stdout, sep="|", na_rep="XX")

|something|a|b|c|d|message
0|one|1|2|3.0|4|XX
1|two|5|6|XX|8|world
2|three|9|10|11.0|12|foo


In [30]:
import csv
with open('example-8.csv')as f:
    lines = list(csv.reader(f))

In [31]:
header, values = lines[0], lines[1:]

In [32]:
header

['a', 'b', 'c']

In [33]:
values

[['1', '2', '3'], ['1', '2', '3']]

In [35]:
{k: v for k, v in zip(header, zip(*values))}

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

## JSON data

In [36]:
obj = """
{"name": "Wes",
 "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]},
 {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]
}
"""
obj

'\n{"name": "Wes",\n "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"],\n "pet": null,\n "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]},\n {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]\n}\n'

In [37]:
import json

# to load json data 
# All keys are string in json format. 

result = json.loads(obj)
result

{'name': 'Wes',
 'cities_lived': ['Akron', 'Nashville', 'New York', 'San Francisco'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 34, 'hobbies': ['guitars', 'soccer']},
  {'name': 'Katie', 'age': 42, 'hobbies': ['diving', 'art']}]}

In [38]:
# To convert data back to json format 
asjson = json.dumps(result)
asjson

'{"name": "Wes", "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"], "pet": null, "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]}, {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]}'

In [39]:
siblings = pd.DataFrame(result['siblings'], columns=['name','age'])
siblings

Unnamed: 0,name,age
0,Scott,34
1,Katie,42


In [40]:
siblings = pd.DataFrame(result['siblings'])
siblings

Unnamed: 0,name,age,hobbies
0,Scott,34,"[guitars, soccer]"
1,Katie,42,"[diving, art]"


In [41]:
pd.read_json('example.json')

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


## XML AND HTML tags

In [42]:
#pip install lxml beautifulsoup4 html5lib

In [43]:
table = pd.read_html('bank.html')
len(table)

1

In [44]:
failure = table[0]

In [45]:
failure.head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


In [46]:
failure

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"
...,...,...,...,...,...,...,...
542,"Superior Bank, FSB",Hinsdale,IL,32646,"Superior Federal, FSB","July 27, 2001","August 19, 2014"
543,Malta National Bank,Malta,OH,6629,North Valley Bank,"May 3, 2001","November 18, 2002"
544,First Alliance Bank & Trust Co.,Manchester,NH,34264,Southern New Hampshire Bank & Trust,"February 2, 2001","February 18, 2003"
545,National State Bank of Metropolis,Metropolis,IL,3815,Banterra Bank of Marion,"December 14, 2000","March 17, 2005"


In [47]:
close_timestamp = pd.to_datetime(failure['Closing Date'])
close_timestamp.dt.year.value_counts()

Closing Date
2010    157
2009    140
2011     92
2012     51
2008     25
2013     24
2014     18
2002     11
2015      8
2016      5
2004      4
2001      4
2007      3
2003      3
2000      2
Name: count, dtype: int64

In [48]:
from lxml import objectify

with open('xml_example.xml')as f:
    parsed = objectify.parse(f)
root = parsed.getroot()

In [49]:
data = []

skipped_field = ["PARENT_SEQ", "INDICATOR_SEQ","DESIRED_CHANGE", "DECIMAL_PLACES"]

for value in root.INDICATOR:
    el_data = {}
    for child in value.getchildren():
        if value in skipped_field:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)
data

[{'INDICATOR_SEQ': 373889,
  'PARENT_SEQ': '',
  'AGENCY_NAME': 'Metro-North Railroad',
  'INDICATOR_NAME': 'Escalator Availability',
  'DESCRIPTION': 'Percent of the time that escalators are operational\n        systemwide. The availability rate is based on physical observations performed\n        the morning of regular business days only. This is a new indicator the agency\n        began reporting in 2009.',
  'PERIOD_YEAR': 2011,
  'PERIOD_MONTH': 12,
  'CATEGORY': 'Service Indicators',
  'FREQUENCY': 'M',
  'DESIRED_CHANGE': 'U',
  'INDICATOR_UNIT': '%',
  'DECIMAL_PLACES': 1,
  'YTD_TARGET': 97.0,
  'YTD_ACTUAL': '',
  'MONTHLY_TARGET': 97.0,
  'MONTHLY_ACTUAL': ''}]

In [50]:
perf = pd.DataFrame(data)
perf

Unnamed: 0,INDICATOR_SEQ,PARENT_SEQ,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,DESIRED_CHANGE,INDICATOR_UNIT,DECIMAL_PLACES,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,373889,,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,U,%,1,97.0,,97.0,


## Reading MS Excel 

In [None]:
#pip install openpyxl

In [51]:
xlsx = pd.ExcelFile('example-9.xlsx')

In [52]:
xlsx.sheet_names

['Sheet1', 'Sheet2', 'Sheet3']

In [53]:
xlsx.parse(sheet_name='Sheet1')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [54]:
pd.read_excel('example-9.xlsx')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## Intereacting with web API

In [56]:
#pip install requests

In [57]:
import requests
url = "https://api.github.com/repos/pandas-dev/pandas/issues"

In [58]:
resp = requests.get(url)

In [59]:
resp.raise_for_status()

In [60]:
resp

<Response [200]>

In [61]:
data = resp.json()
data

[{'url': 'https://api.github.com/repos/pandas-dev/pandas/issues/53602',
  'repository_url': 'https://api.github.com/repos/pandas-dev/pandas',
  'labels_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/53602/labels{/name}',
  'comments_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/53602/comments',
  'events_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/53602/events',
  'html_url': 'https://github.com/pandas-dev/pandas/pull/53602',
  'id': 1751427482,
  'node_id': 'PR_kwDOAA0YD85StBv0',
  'number': 53602,
  'title': 'ENH: Series.explode to support pyarrow-backed list types',
  'user': {'login': 'lukemanley',
   'id': 8519523,
   'node_id': 'MDQ6VXNlcjg1MTk1MjM=',
   'avatar_url': 'https://avatars.githubusercontent.com/u/8519523?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/lukemanley',
   'html_url': 'https://github.com/lukemanley',
   'followers_url': 'https://api.github.com/users/lukemanley/followers',
   'following_url': 

In [64]:
data[0]['title']

'ENH: Series.explode to support pyarrow-backed list types'

In [65]:
issues = pd.DataFrame(data)
issues

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,closed_at,author_association,active_lock_reason,draft,pull_request,body,reactions,timeline_url,performed_via_github_app,state_reason
0,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://github.com/pandas-dev/pandas/pull/53602,1751427482,PR_kwDOAA0YD85StBv0,53602,ENH: Series.explode to support pyarrow-backed ...,...,,CONTRIBUTOR,,False,{'url': 'https://api.github.com/repos/pandas-d...,- [x] [Tests added and passed](https://pandas....,{'url': 'https://api.github.com/repos/pandas-d...,https://api.github.com/repos/pandas-dev/pandas...,,
1,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://github.com/pandas-dev/pandas/pull/53601,1751393343,PR_kwDOAA0YD85Ss6u9,53601,"fix Series.apply(..., by_row), v2.",...,,CONTRIBUTOR,,False,{'url': 'https://api.github.com/repos/pandas-d...,Fixes https://github.com/pandas-dev/pandas/pul...,{'url': 'https://api.github.com/repos/pandas-d...,https://api.github.com/repos/pandas-dev/pandas...,,
2,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://github.com/pandas-dev/pandas/issues/53600,1751323038,I_kwDOAA0YD85oYxGe,53600,BUG: Am I screwing up ?? [OVERRIDING APPENDING],...,,NONE,,,,### Pandas version checks\r\n\r\n- [X] I have ...,{'url': 'https://api.github.com/repos/pandas-d...,https://api.github.com/repos/pandas-dev/pandas...,,
3,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://github.com/pandas-dev/pandas/issues/53599,1751256168,I_kwDOAA0YD85oYgxo,53599,BUG: on_bad_lines=callable does not invoke cal...,...,,NONE,,,,### Pandas version checks\r\n\r\n- [X] I have ...,{'url': 'https://api.github.com/repos/pandas-d...,https://api.github.com/repos/pandas-dev/pandas...,,
4,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://github.com/pandas-dev/pandas/pull/53598,1751249423,PR_kwDOAA0YD85Sscsv,53598,Corrected minor typos in the paragraph below '...,...,,NONE,,False,{'url': 'https://api.github.com/repos/pandas-d...,- [ ] closes #53596 \r\n- [ ] All [code checks...,{'url': 'https://api.github.com/repos/pandas-d...,https://api.github.com/repos/pandas-dev/pandas...,,
5,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://github.com/pandas-dev/pandas/pull/53597,1751230901,PR_kwDOAA0YD85SsYoA,53597,Command for installation in notebook,...,,NONE,,False,{'url': 'https://api.github.com/repos/pandas-d...,- [ ] closes #xxxx (Replace xxxx with the GitH...,{'url': 'https://api.github.com/repos/pandas-d...,https://api.github.com/repos/pandas-dev/pandas...,,
6,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://github.com/pandas-dev/pandas/issues/53596,1751214848,I_kwDOAA0YD85oYWsA,53596,DOC: Minor typo and punctuation error,...,,NONE,,,,### Pandas version checks\n\n- [X] I have chec...,{'url': 'https://api.github.com/repos/pandas-d...,https://api.github.com/repos/pandas-dev/pandas...,,
7,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://github.com/pandas-dev/pandas/issues/53595,1751042524,I_kwDOAA0YD85oXsnc,53595,ENH: Allowing plotting for non-numeric and non...,...,,CONTRIBUTOR,,,,### Feature Type\r\n\r\n- [ ] Adding new funct...,{'url': 'https://api.github.com/repos/pandas-d...,https://api.github.com/repos/pandas-dev/pandas...,,
8,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://github.com/pandas-dev/pandas/pull/53594,1751028319,PR_kwDOAA0YD85SrvVJ,53594,"ENH: Implement PandasArray, DTA, TDA interpolate",...,,MEMBER,,False,{'url': 'https://api.github.com/repos/pandas-d...,pushing towards #50950,{'url': 'https://api.github.com/repos/pandas-d...,https://api.github.com/repos/pandas-dev/pandas...,,
9,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas...,https://github.com/pandas-dev/pandas/pull/53593,1750999126,PR_kwDOAA0YD85Srplu,53593,CI: Fix the deprecation bot,...,,MEMBER,,False,{'url': 'https://api.github.com/repos/pandas-d...,- [ ] closes #xxxx (Replace xxxx with the GitH...,{'url': 'https://api.github.com/repos/pandas-d...,https://api.github.com/repos/pandas-dev/pandas...,,


## Interacting with Databases

In [66]:
import sqlite3

In [70]:
query = """
CREATE TABLE test1(
a VARCHAR(20), 
b VARCHAR(20), 
c REAL,
d INTEGER);
"""

In [71]:
con = sqlite3.connect("mydata.sqllite")

In [72]:
con.execute(query)

<sqlite3.Cursor at 0x24da7806440>

In [73]:
con.commit()

In [83]:
data = [("Atlanta", "Georgia", 1.25, 6),
 .....: ("Tallahassee", "Florida", 2.6, 3),
 .....: ("Sacramento", "California", 1.7, 5)]
data

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [77]:
stmt = 'INSERT INTO test1 VALUES(?,?,?,?)'

In [78]:
con.executemany(stmt, data)

<sqlite3.Cursor at 0x24dac9d1740>

In [79]:
con.commit()

In [80]:
# Read data from database

cursor = con.execute('SELECT * FROM test1')

In [81]:
rows = cursor.fetchall()

In [82]:
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [84]:
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [85]:
pd.DataFrame(rows, columns=[value[0] for value in cursor.description])

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
