## Loading External Data from Files into Pandas

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame, Series

In [4]:
# Load ex1.csv (comma-delimited) into a DataFrame; with no `names` given,
# the first line of the file provides the column labels.
df = pd.read_csv(
    'data_sources/ex1.csv',
    sep=',',
)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [10]:
# Read ex2.csv with explicitly supplied column labels and use the 'info'
# column as the row index.
names = ['a1', 'b1', 'c1', 'd1', 'info']
df = pd.read_csv(
    'data_sources/ex2.csv',
    sep=',',
    names=names,
    index_col='info',
)
df

Unnamed: 0_level_0,a1,b1,c1,d1
info,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [14]:
# Lines of the file can be ignored while parsing: `skiprows` takes the
# 0-based line numbers to drop (here the 1st, 3rd and 4th lines).
df = pd.read_csv(
    'data_sources/ex4.csv',
    sep=',',
    skiprows=[0, 2, 3],
)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [19]:
# Missing / NA values can be handled while importing: `na_values` names extra
# values that should be parsed as NaN (here 3.0), on top of pandas' default
# missing-value markers such as empty fields.
df = pd.read_csv('data_sources/ex5.csv', na_values=3.0)
# The NaN entries could afterwards be replaced, e.g. with df.fillna(0).
# A cell renders only its last expression, so the mask is the sole output:
# True wherever a value is missing (NaN, None, ...).
pd.isnull(df)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,True,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


### Large Data Sets

In [31]:
# Read only the first 40 rows of a large file instead of loading all of it.
df = pd.read_csv('data_sources/ex6.csv', nrows=40)
# Only the last expression of a cell is rendered, so inspect one view at a
# time by putting it last: df.head(20), df.tail(20), df.describe(), df.min()
df

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.81748,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.35848,-0.497572,-0.367016,0.507702,S
9,-1.740877,-1.160417,-1.63783,2.172201,G


In [33]:
# Large files can be processed chunk by chunk instead of being loaded at once.
# Here we tally how often each value of the 'key' column occurs in the file.
all_ = pd.Series([], dtype='float64')  # empty accumulator, indexed by key value
# The chunk reader keeps the file open; using it as a context manager
# (pandas >= 1.2) closes the handle even if processing a chunk raises.
with pd.read_csv('data_sources/ex6.csv', chunksize=1000) as chunks:
    for piece in chunks:
        # fill_value=0 treats a key that is absent on one side as a count of 0
        all_ = all_.add(piece['key'].value_counts(), fill_value=0)
all_

0    151.0
1    146.0
2    152.0
3    162.0
4    171.0
5    157.0
6    166.0
7    164.0
8    162.0
9    150.0
A    320.0
B    302.0
C    286.0
D    320.0
E    368.0
F    335.0
G    308.0
H    330.0
I    327.0
J    337.0
K    334.0
L    346.0
M    338.0
N    306.0
O    343.0
P    324.0
Q    340.0
R    318.0
S    308.0
T    304.0
U    326.0
V    328.0
W    305.0
X    364.0
Y    314.0
Z    288.0
dtype: float64

### Reading Excel and writing data out again

In [36]:
# Load the workbook into a DataFrame, then write it back out as delimited
# text using '-' as the field separator.
x = pd.read_excel(io='data_sources/ex1.xlsx')
x.to_csv(path_or_buf='data_sources/out.csv', sep='-')

### Working with JSON

In [39]:
import json
# JSON is a plain-text data format; despite its name it is not JavaScript code


In [44]:
# A JSON document held in an ordinary Python string.
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""
# Bare `type(...)` calls in the middle of a cell are evaluated and discarded,
# so the facts they were meant to show are checked explicitly instead.
assert isinstance(obj, str)  # JSON is just string data until it is parsed
r = json.loads(obj)
assert isinstance(r, dict)   # a JSON object parses to a Python dict
s = pd.Series(r)             # one Series entry per top-level key
# Serialise the Series back to a JSON file on disk.
s.to_json('data_sources/wes.json')

### We can use API endpoints

In [47]:
import requests

# Public GitHub REST endpoint that lists issues of the pandas repository.
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error (e.g. rate limiting) instead of building a
# DataFrame from an error payload.
response.raise_for_status()
data = response.json()  # parsed JSON body: in this case a list of dicts
# Convert to a DataFrame, keeping only the fields of interest as columns.
issues = pd.DataFrame(data, columns=['number', 'title', 'labels', 'state'])
issues

Unnamed: 0,number,title,labels,state
0,44041,Fixed metadata propagation in Dataframe.apply ...,[],open
1,44039,REF: dispatch DTI/TDI setops to RangeIndex,[],open
2,44038,TST: added groupby apply test for nan coerce,[],open
3,44037,DOC: Document and annotate Index.reindex (#403...,[],open
4,44035,TST: adds test for .loc on multiindex for seri...,"[{'id': 127685, 'node_id': 'MDU6TGFiZWwxMjc2OD...",open
5,44034,CLN: no need for suffices anymore in test_hash...,"[{'id': 127685, 'node_id': 'MDU6TGFiZWwxMjc2OD...",open
6,44033,Issue38947,"[{'id': 127685, 'node_id': 'MDU6TGFiZWwxMjc2OD...",open
7,44032,[PERF] fixing memory leak in aggregation.pyx,"[{'id': 8935311, 'node_id': 'MDU6TGFiZWw4OTM1M...",open
8,44031,"BUG: apply swallows exceptions, shows inconsis...","[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
9,44030,BUG: .fillna({}) doesn't work,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
