## Loading External Files

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Load an external CSV file into a DataFrame and display it.
# ',' is already pandas' default delimiter, so `sep` is shown only for illustration.
df = pd.read_csv(
    'data_sources/ex1.csv',
    sep=',',
)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [11]:
# header=None tells pandas the file has no header row, so the column
# labels come from `names`; the 'message' column then becomes the row index.
names = ['a', 'b', 'c', 'd', 'message']
df = pd.read_csv(
    'data_sources/ex2.csv',
    header=None,
    names=names,
    index_col='message',
)
df

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [29]:
# skiprows drops the listed (0-indexed) lines of the file before parsing.
# Bound to its own name so the ex5 read below does not overwrite it.
df_ex4 = pd.read_csv('data_sources/ex4.csv', skiprows=[0,2,3])

# na_values: any cell whose value equals 3.0 is read as NaN
df = pd.read_csv('data_sources/ex5.csv', na_values=3.0)

# Boolean mask marking the missing cells. Only the last expression of a cell
# is displayed, so keep the mask under a name instead of discarding it.
null_mask = pd.isnull(df)

df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


## Large Data Sets

In [None]:
# Read all of ex6.csv (pass nrows=<n> to read_csv to load only the first n rows),
# then show summary statistics for the frame.
df6 = pd.read_csv('data_sources/ex6.csv')
df6.describe()


In [40]:
# Display the frame loaded from ex6.csv; the recorded output below elides the middle rows.
df6

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
...,...,...,...,...,...
9995,2.311896,-0.417070,-1.409599,-0.515821,L
9996,-0.479893,-0.650419,0.745152,-0.646038,E
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G


In [46]:
# sometimes it makes sense to 'chunk' our data
# nrows reads only the first rows of the file -- a cheap preview of a large file
df6_preview = pd.read_csv('data_sources/ex6.csv', nrows=5)

# we may choose to read a chunk at a time:
# with chunksize set, read_csv returns an iterator of DataFrames of up to 1000 rows
chunks = pd.read_csv('data_sources/ex6.csv', chunksize=1000)

# Running tally of how often each 'key' value occurs across all chunks.
# Named key_counts (not `all`) so the builtin all() is not shadowed.
key_counts = pd.Series([], dtype='float64')
for piece in chunks:
    # fill_value=0: a key absent from one side counts as zero instead of producing NaN
    key_counts = key_counts.add(piece['key'].value_counts(), fill_value=0)

key_counts

0    151.0
1    146.0
2    152.0
3    162.0
4    171.0
5    157.0
6    166.0
7    164.0
8    162.0
9    150.0
A    320.0
B    302.0
C    286.0
D    320.0
E    368.0
F    335.0
G    308.0
H    330.0
I    327.0
J    337.0
K    334.0
L    346.0
M    338.0
N    306.0
O    343.0
P    324.0
Q    340.0
R    318.0
S    308.0
T    304.0
U    326.0
V    328.0
W    305.0
X    364.0
Y    314.0
Z    288.0
dtype: float64

## Reading Excel and Writing Data out

In [48]:
# Load the Excel workbook into a DataFrame and display it.
excel = pd.read_excel(
    'data_sources/ex1.xlsx',
)
excel

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [51]:
# Write the frame out as delimited text, using '-' as the field separator.
output_path = 'data_sources/output.csv'
excel.to_csv(output_path, sep='-')

## Working with JSON

In [58]:
import json
# remember JSON is TEXT (not JavaScript)
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""
# The raw JSON document is an ordinary Python string ...
assert isinstance(obj, str)
# ... and json.loads parses it into native Python objects (a dict for a JSON object).
r = json.loads(obj)
assert isinstance(r, dict)
# make a series (one entry per top-level JSON key) and write it back out as JSON
s = pd.Series(r)
s.to_json('data_sources/wes.json')

In [74]:
## Working with API end-points
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
# Always pass a timeout: without one, requests can wait indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error (e.g. rate limiting) instead of building a
# DataFrame from an error payload.
response.raise_for_status()
data = response.json() # e.g. data[7]['title'] -- nb must use [] notation
# Keep only the listed fields of each issue as columns.
issues = pd.DataFrame(data, columns=['number', 'title', 'labels', 'state'])
# we can make any column into an index
issues

Unnamed: 0,number,title,labels,state
0,42102,Indexing MultiIndex with duplicates return Ser...,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
1,42101,BUG: `Styler.hide_columns()` does not hide ind...,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
2,42100,REF: use only inherit_names in CategoricalIndex,[],open
3,42099,BUG: Adding Series to empty DataFrame can rese...,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
4,42095,CLN: use fused type for lib.has_infs,"[{'id': 211029535, 'node_id': 'MDU6TGFiZWwyMTE...",open
5,42091,REGR: Subclassing `Styler` and addressing `fro...,[],open
6,42090,[WIP] BUG: fix `Series.argsort`,[],open
7,42087,"REGR: undocumented astype(""category"").astype(s...","[{'id': 78527356, 'node_id': 'MDU6TGFiZWw3ODUy...",open
8,42085,BUG: groupby any/all raising with pd.NA object...,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
9,42084,PERF/CLN: to_csv,"[{'id': 47229171, 'node_id': 'MDU6TGFiZWw0NzIy...",open
