In [1]:
import pandas as pd
import requests

In [2]:
# use items from lesson as our blueprint:

In [3]:
# start with an empty list that will collect the items from every page
items_list = []
# request the items endpoint once, only to learn how many pages exist
response = requests.get('https://python.zgulde.net/api/v1/items')
# fail loudly on an HTTP error instead of a confusing KeyError further down
response.raise_for_status()
data = response.json()
# the payload reports the highest page number; keep it as our iteration bound
n = data['payload']['max_page']

In [6]:
# look at n:
n

3

In [7]:
# items endpoint ending in a dangling `page=` query, ready for a page number
# to be appended.
# NOTE: `base_url` is rebound to the bare host name in the stores section
# further down, so this value only holds for the items cells.
base_url = 'https://python.zgulde.net/api/v1/items?page='

In [8]:
# Collect every page of items, pages 1 through n inclusive.
# The endpoint is spelled out here rather than read from `base_url`, because
# `base_url` is rebound to the bare host further down the notebook. The list is
# also reset here, so re-running this cell never appends duplicate items.
items_url = 'https://python.zgulde.net/api/v1/items'
items_list = []
for i in range(1, n + 1):
    # let requests build the `?page=<i>` query string
    response = requests.get(items_url, params={'page': i})
    # stop on an HTTP error rather than failing later on a missing key
    response.raise_for_status()
    # turn our response into python dictionary
    data = response.json()
    # grab just this page's records out of the payload
    page_items = data['payload']['items']
    # add this page of items to our items list
    items_list += page_items

In [10]:
# check out items_list
items_list[0]

{'item_brand': 'Riceland',
 'item_id': 1,
 'item_name': 'Riceland American Jazmine Rice',
 'item_price': 0.84,
 'item_upc12': '35200264013',
 'item_upc14': '35200264013'}

In [11]:
# turn this into a dataframe
items = pd.DataFrame(items_list)

In [12]:
items.head()

Unnamed: 0,item_brand,item_id,item_name,item_price,item_upc12,item_upc14
0,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
1,Caress,2,Caress Velvet Bliss Ultra Silkening Beauty Bar...,6.44,11111065925,11111065925
2,Earths Best,3,Earths Best Organic Fruit Yogurt Smoothie Mixe...,2.43,23923330139,23923330139
3,Boars Head,4,Boars Head Sliced White American Cheese - 120 Ct,3.14,208528800007,208528800007
4,Back To Nature,5,Back To Nature Gluten Free White Cheddar Rice ...,2.61,759283100036,759283100036


In [13]:
# 
# follow this procedure for stores and then sales
# 
# 

#### get stores next:

In [14]:
# set up a base url string holding only the scheme and host
# (this rebinds `base_url`, which earlier held the full items page URL)
base_url = 'https://python.zgulde.net'
# tack on the stores endpoint and request it
response = requests.get(base_url + '/api/v1/stores')

In [19]:
response.json()['payload'].keys()

dict_keys(['max_page', 'next_page', 'page', 'previous_page', 'stores'])

In [20]:
response.json()['payload']['max_page']

1

In [21]:
# max_page is 1 for stores (checked in the cell above), so this single
# response already holds every store record -- no paging loop is needed
stores = pd.DataFrame(response.json()['payload']['stores'])

In [22]:
stores.head()

Unnamed: 0,store_address,store_city,store_id,store_state,store_zipcode
0,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253
1,9255 FM 471 West,San Antonio,2,TX,78251
2,2118 Fredericksburg Rdj,San Antonio,3,TX,78201
3,516 S Flores St,San Antonio,4,TX,78204
4,1520 Austin Hwy,San Antonio,5,TX,78218


#### and now let's go for sales

In [24]:
base_url

'https://python.zgulde.net'

In [25]:
# get that initial response for sales data
response = requests.get(base_url + '/api/v1/sales')

In [26]:
end_point = response.json()['payload']['next_page']

In [28]:
# where does this end?
response.json()['payload']['max_page']

183

In [27]:
end_point

'/api/v1/sales?page=2'

In [29]:
end_point[:-1]

'/api/v1/sales?page='

In [30]:
# Build the endpoint for the final page: keep everything up to and including
# the last '=' of the next_page value, then append max_page. Splitting on '='
# (instead of chopping off one character) still works if the page number in
# `end_point` ever has more than one digit.
end_point_prefix = end_point.rsplit('=', 1)[0] + '='
end_point_last_page = end_point_prefix + str(response.json()['payload']['max_page'])

In [31]:
end_point_last_page

'/api/v1/sales?page=183'

In [32]:
response.json()['payload']['max_page']

183

In [33]:
# let's investigate the next page key on the last page of the data:
# NOTE: despite its name, `last_page` holds the `next_page` value reported by
# the final page, not that page's contents; its truthiness is checked below
last_page = requests.get(base_url + end_point_last_page).json()['payload']['next_page']

In [37]:
last_page

In [38]:
# cast it as a boolean
bool(last_page)

False

In [40]:
# assign pages to the initial sales page
pages = requests.get(base_url + '/api/v1/sales?page=1').json()['payload']['sales']

In [39]:
# assign endpoint to the next_page key of the first sales page
# NOTE(review): this repeats the page-1 request already made for `pages`;
# both values come from the same payload
endpoint = requests.get(base_url + '/api/v1/sales?page=1').json()['payload']['next_page']

In [41]:
# while version

In [42]:
# endpoint is our next page key, which we will reassign in a loop
endpoint

'/api/v1/sales?page=2'

In [43]:
# Walk the sales endpoint by following each payload's `next_page` link until
# the API reports no further page. The walk starts from page 1 with an empty
# list, so re-running this cell rebuilds `pages` from scratch instead of
# depending on whatever `pages` / `endpoint` currently hold.
pages = []
endpoint = '/api/v1/sales?page=1'
while endpoint:
    # get our response and stop on an HTTP error
    response = requests.get(base_url + endpoint)
    response.raise_for_status()
    # keep the payload under its own name so `response` stays a Response object
    payload = response.json()['payload']
    # change our endpoint in the loop; a falsy next_page ends it
    endpoint = payload['next_page']
    # grab the content and add it to pages
    pages += payload['sales']

In [44]:
len(pages)

913000

In [None]:
# for loop version:

In [None]:
# For-loop version: read the page count from the API instead of hard-coding it.
sales_url = base_url + '/api/v1/sales'
# the first page supplies both its own records and the total number of pages
response = requests.get(sales_url, params={'page': 1})
response.raise_for_status()
first_payload = response.json()['payload']
max_page = first_payload['max_page']
pages = list(first_payload['sales'])
# pages 2 through max_page inclusive
for i in range(2, max_page + 1):
    response = requests.get(sales_url, params={'page': i})
    # stop on an HTTP error rather than failing later on a missing key
    response.raise_for_status()
    # named `page_sales` so the name `sales` is kept for the DataFrame built later
    page_sales = response.json()['payload']['sales']
    pages += page_sales

In [None]:
# turn it into a dataframe

In [45]:
sales = pd.DataFrame(pages)

In [47]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   item         913000 non-null  int64  
 1   sale_amount  913000 non-null  float64
 2   sale_date    913000 non-null  object 
 3   sale_id      913000 non-null  int64  
 4   store        913000 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 34.8+ MB


In [48]:
# sales = pd.read_csv('sales_cached.csv', index_col=0)

#### see how we are going to merge everything -- establish our keys

In [49]:
sales.columns

Index(['item', 'sale_amount', 'sale_date', 'sale_id', 'store'], dtype='object')

In [52]:
sales.head()

Unnamed: 0,item,sale_amount,sale_date,sale_id,store
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1
1,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,1
2,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,1
3,1,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",4,1
4,1,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",5,1


In [53]:
items.head()

Unnamed: 0,item_brand,item_id,item_name,item_price,item_upc12,item_upc14
0,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
1,Caress,2,Caress Velvet Bliss Ultra Silkening Beauty Bar...,6.44,11111065925,11111065925
2,Earths Best,3,Earths Best Organic Fruit Yogurt Smoothie Mixe...,2.43,23923330139,23923330139
3,Boars Head,4,Boars Head Sliced White American Cheese - 120 Ct,3.14,208528800007,208528800007
4,Back To Nature,5,Back To Nature Gluten Free White Cheddar Rice ...,2.61,759283100036,759283100036


In [50]:
items.columns

Index(['item_brand', 'item_id', 'item_name', 'item_price', 'item_upc12',
       'item_upc14'],
      dtype='object')

In [54]:
stores.head()

Unnamed: 0,store_address,store_city,store_id,store_state,store_zipcode
0,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253
1,9255 FM 471 West,San Antonio,2,TX,78251
2,2118 Fredericksburg Rdj,San Antonio,3,TX,78201
3,516 S Flores St,San Antonio,4,TX,78204
4,1520 Austin Hwy,San Antonio,5,TX,78218


In [51]:
stores.columns

Index(['store_address', 'store_city', 'store_id', 'store_state',
       'store_zipcode'],
      dtype='object')

In [None]:
# first merge: sales to stores

In [55]:
# Left-join each sale to its store. `validate` makes pandas raise a MergeError
# if `store_id` is not unique in `stores`, which is the only way this left
# join could add rows to the sales data.
sales_plus_stores = pd.merge(
    sales,
    stores,
    how='left',
    left_on='store',
    right_on='store_id',
    validate='many_to_one')

In [58]:
sales.shape

(913000, 5)

In [57]:
sales_plus_stores.shape

(913000, 10)

In [59]:
sales_plus_stores.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 913000 entries, 0 to 912999
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   item           913000 non-null  int64  
 1   sale_amount    913000 non-null  float64
 2   sale_date      913000 non-null  object 
 3   sale_id        913000 non-null  int64  
 4   store          913000 non-null  int64  
 5   store_address  913000 non-null  object 
 6   store_city     913000 non-null  object 
 7   store_id       913000 non-null  int64  
 8   store_state    913000 non-null  object 
 9   store_zipcode  913000 non-null  object 
dtypes: float64(1), int64(4), object(5)
memory usage: 76.6+ MB


In [None]:
# second merge

In [60]:
# Left-join the item details onto the sales-plus-stores frame. `validate`
# makes pandas raise a MergeError if `item_id` is not unique in `items`,
# so the join cannot silently duplicate sales rows.
errything = pd.merge(
    sales_plus_stores,
    items,
    how='left',
    left_on='item',
    right_on='item_id',
    validate='many_to_one')

In [62]:
errything.head()

Unnamed: 0,item,sale_amount,sale_date,sale_id,store,store_address,store_city,store_id,store_state,store_zipcode,item_brand,item_id,item_name,item_price,item_upc12,item_upc14
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
1,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,1,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
2,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,1,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
3,1,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",4,1,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
4,1,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",5,1,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013


In [None]:
# now put that in a script!

In [None]:
# cache the merged frame to disk; the index is written as the first column,
# which is why the pseudo-script below reads it back with index_col=0
errything.to_csv('cached_everything.csv')

In [None]:
# pseudo-script:
# import os

# if os.path.isfile('cached_everything.csv'):
#     df = pd.read_csv('cached_everything.csv', index_col=0)
# else:
#     do all the requests and merges

In [None]:
# let's get this other data!

In [63]:
# load the opsd_germany_daily CSV directly from its raw GitHub URL
df = pd.read_csv('https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv')

In [64]:
df.head()

Unnamed: 0,Date,Consumption,Wind,Solar,Wind+Solar
0,2006-01-01,1069.184,,,
1,2006-01-02,1380.521,,,
2,2006-01-03,1442.533,,,
3,2006-01-04,1457.217,,,
4,2006-01-05,1477.131,,,


In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4383 entries, 0 to 4382
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         4383 non-null   object 
 1   Consumption  4383 non-null   float64
 2   Wind         2920 non-null   float64
 3   Solar        2188 non-null   float64
 4   Wind+Solar   2187 non-null   float64
dtypes: float64(4), object(1)
memory usage: 171.3+ KB
