In [1]:
# Importing libraries:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Importing the os library specifically for reading the csv once I've created the file in my working directory.
import os

# web-based requests
import requests

In [2]:
base_url = 'https://python.zach.lol'
print(requests.get(base_url).text)

{"api":"/api/v1","help":"/documentation"}



In [3]:
response = requests.get(base_url + '/documentation')
print(response.json()['payload'])


The API accepts GET requests for all endpoints, where endpoints are prefixed
with

    /api/{version}

Where version is "v1"

Valid endpoints:

- /stores[/{store_id}]
- /items[/{item_id}]
- /sales[/{sale_id}]

All endpoints accept a `page` parameter that can be used to navigate through
the results.



In [4]:
# Now I have the info I need to start getting specific portions of the information via the API.

## Items Data

### 1. Using the code from the lesson as a guide, create a dataframe named items that has all of the data for items.

In [5]:
response = requests.get('https://python.zach.lol/api/v1/items')

data = response.json()
data.keys()

dict_keys(['payload', 'status'])

In [None]:
data['payload'].keys()

In [None]:
print('max_page: %s' % data['payload']['max_page'])
print('next_page: %s' % data['payload']['next_page'])

In [None]:
data['payload']['items'][:2]

In [None]:
# So I need to go back and recall how to work with dictionaries. I need to use those techniques to parse the response myself, instead of expecting the API to prep the data for me.

In [None]:
# data from page 1

items = pd.DataFrame(data['payload']['items'])
print(items.shape)
items.head()

In [None]:
data['payload']['items']

In [None]:
response = requests.get(base_url + data['payload']['next_page'])
data = response.json()

print('max_page: %s' % data['payload']['max_page'])
print('next_page: %s' % data['payload']['next_page'])

items = pd.concat([items, pd.DataFrame(data['payload']['items'])]).reset_index()

In [None]:
response = requests.get(base_url + data['payload']['next_page'])
data = response.json()

print('max_page: %s' % data['payload']['max_page'])
print('next_page: %s' % data['payload']['next_page'])

items = pd.concat([items, pd.DataFrame(data['payload']['items'])]).reset_index()

In [None]:
items.drop(columns = ['level_0', 'index'], inplace = True)
items.shape

In [None]:
items.head()

In [None]:
items_df = pd.DataFrame(data['payload']['items'])
print(items_df.shape)
items_df.head()

In [None]:
# I want to try writing a loop for this one, instead of requesting each page by hand:

In [None]:
data['payload']['max_page']

In [None]:
response_test = requests.get('https://python.zach.lol/api/v1/items')

In [None]:
# Parse the response fetched in the previous cell (response_test), not the stale
# `response` left over from the items paging above, whose next_page is already None.
data_test = response_test.json()

In [None]:
response_test = requests.get(base_url + data_test['payload']['next_page'])
data_test = response_test.json()
print(data_test['payload']['next_page'])

In [None]:
response_test = requests.get(base_url + data_test['payload']['next_page'])
data_test = response_test.json()
print(data_test['payload']['next_page'])

In [None]:
data['payload']['max_page']

In [None]:
# Creating the function:

def get_items_new(base_url):
    '''
    Fetch every page of the items endpoint and return the rows as one pandas dataframe.

    Parameters
    ----------
    base_url : str
        Scheme and host of the web service, e.g. 'https://python.zach.lol'. The endpoint
        path '/api/v1/items' and every `next_page` path from the payload are appended to it.

    Returns
    -------
    pandas.DataFrame
        The rows of all pages stacked in page order, with a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an error status.
    '''
    endpoint = '/api/v1/items'
    pages = []

    # Follow next_page until the service reports there is none (None), so a
    # single-page result never tries to build a URL out of None.
    while endpoint is not None:
        response = requests.get(base_url + endpoint)
        response.raise_for_status()
        payload = response.json()['payload']
        pages.append(pd.DataFrame(payload['items']))
        endpoint = payload['next_page']

    # Concatenate once at the end instead of re-copying the growing frame per page.
    return pd.concat(pages, ignore_index = True)

In [None]:
base_url = 'https://python.zach.lol'

items_df_testing = get_items_new(base_url)

In [None]:
items.shape, items_df_testing.shape

In [None]:
items_df_testing.head()

In [None]:
base_url + '/api/v1/stores'

## Store Data

### 2. Do the same thing, but for stores.



In [12]:
response_stores = requests.get('https://python.zach.lol/api/v1/stores')

data_stores = response_stores.json()
data_stores.keys()

dict_keys(['payload', 'status'])

In [13]:
data_stores['payload'].keys()

dict_keys(['max_page', 'next_page', 'page', 'previous_page', 'stores'])

In [14]:
print('max_page: %s' % data_stores['payload']['max_page'])
print('next_page: %s' % data_stores['payload']['next_page'])

max_page: 1
next_page: None


In [15]:
data_stores['payload'].keys()
data_stores['payload']['stores'][:2]

[{'store_address': '12125 Alamo Ranch Pkwy',
  'store_city': 'San Antonio',
  'store_id': 1,
  'store_state': 'TX',
  'store_zipcode': '78253'},
 {'store_address': '9255 FM 471 West',
  'store_city': 'San Antonio',
  'store_id': 2,
  'store_state': 'TX',
  'store_zipcode': '78251'}]

In [16]:
stores_df = pd.DataFrame(data_stores['payload']['stores'])
print(stores_df.shape)
stores_df

(10, 5)


Unnamed: 0,store_address,store_city,store_id,store_state,store_zipcode
0,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253
1,9255 FM 471 West,San Antonio,2,TX,78251
2,2118 Fredericksburg Rdj,San Antonio,3,TX,78201
3,516 S Flores St,San Antonio,4,TX,78204
4,1520 Austin Hwy,San Antonio,5,TX,78218
5,1015 S WW White Rd,San Antonio,6,TX,78220
6,12018 Perrin Beitel Rd,San Antonio,7,TX,78217
7,15000 San Pedro Ave,San Antonio,8,TX,78232
8,735 SW Military Dr,San Antonio,9,TX,78221
9,8503 NW Military Hwy,San Antonio,10,TX,78231


In [17]:
# Only one page of store data, so I don't need to concatenate multiple pages here.

In [18]:
# Creating the function:

def get_stores_new(base_url):
    '''
    Fetch every page of the stores endpoint and return the rows as one pandas dataframe.

    Parameters
    ----------
    base_url : str
        Scheme and host of the web service, e.g. 'https://python.zach.lol'. The endpoint
        path '/api/v1/stores' and every `next_page` path from the payload are appended to it.

    Returns
    -------
    pandas.DataFrame
        The rows of all pages stacked in page order, with a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an error status.
    '''
    endpoint = '/api/v1/stores'
    pages = []

    # Keep following next_page and stop as soon as it is None; looping a fixed
    # max_page times would request `base_url + None` after the last page.
    while endpoint is not None:
        response = requests.get(base_url + endpoint)
        response.raise_for_status()
        payload = response.json()['payload']
        pages.append(pd.DataFrame(payload['stores']))
        endpoint = payload['next_page']

    return pd.concat(pages, ignore_index = True)

In [19]:
stores_df_test = get_stores_new(base_url)
stores_df.shape, stores_df_test.shape

((10, 5), (10, 5))

## Sales Data

### 3. Extract the data for sales. 

- There are a lot of pages of data here, so your code will need to be a little more complex. Your code should continue fetching data from the next page until all of the data is extracted.

In [None]:
base_url_sales = 'https://python.zach.lol'

In [None]:
# Create a function from what I'm doing here. The while loop should keep requesting pages until next_page is None.

response_sales = requests.get('https://python.zach.lol/api/v1/sales')

data_sales = response_sales.json()
data_sales.keys()

In [None]:
data_sales['payload'].keys()

In [None]:
print('max_page: %s' % data_sales['payload']['max_page'])
print('next_page: %s' % data_sales['payload']['next_page'])

In [None]:
# This code is looking at a set number of entries in the dictionary I'm calling from the api:
data_sales['payload']['sales'][:2]

In [None]:
sales_df = pd.DataFrame(data_sales['payload']['sales'])
sales_df.head()

In [None]:
sales_df.shape

In [None]:
response_sales = requests.get(base_url + data_sales['payload']['next_page'])
data_sales = response_sales.json()

print('max_page: %s' % data_sales['payload']['max_page'])
print('next_page: %s' % data_sales['payload']['next_page'])

sales_df = pd.concat([sales_df, pd.DataFrame(data_sales['payload']['sales'])]).reset_index()
sales_df.shape

In [None]:
# Same payload as before, but now counting how many sales records are on the current page:
len(data_sales['payload']['sales'])
# There seem to be 5,000 records per page, so an upper-bound estimate of the total number of rows (the last page may be shorter) is:

row_total_guess = len(data_sales['payload']['sales']) * (data_sales['payload']['max_page'])
print(f'The estimated total number of rows of the combined sales dataframe is {row_total_guess:,}.')

In [None]:
def get_sales(base_url):
    '''
    Fetch every page of the sales endpoint and return the rows as one pandas dataframe.

    Progress is printed for each page (its max_page and next_page values), followed by
    the shape of the combined dataframe.

    Parameters
    ----------
    base_url : str
        Scheme and host of the web service, e.g. 'https://python.zach.lol'. The endpoint
        path '/api/v1/sales' and every `next_page` path from the payload are appended to it.

    Returns
    -------
    pandas.DataFrame
        The rows of all pages stacked in page order. Because the index is reset without
        dropping it, the frame carries an extra 'index' column holding each row's
        position within its own page.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an error status.
    '''
    endpoint = '/api/v1/sales'
    pages = []

    # next_page is the value None (not the string "None") on the last page, so
    # test for that directly; this also handles a single-page result.
    while endpoint is not None:
        response = requests.get(base_url + endpoint)
        response.raise_for_status()
        payload = response.json()['payload']
        print('max_page: %s' % payload['max_page'])
        print('next_page: %s' % payload['next_page'])

        pages.append(pd.DataFrame(payload['sales']))
        endpoint = payload['next_page']

    # One concat at the end avoids re-copying the accumulated rows for every page.
    # reset_index() (without drop) is kept so the returned columns stay the same.
    df_sales = pd.concat(pages).reset_index()
    print('full_shape', df_sales.shape)
    return df_sales
    
    

In [None]:
sales_df = get_sales(base_url_sales)

In [None]:
sales_df.shape

In [None]:
sales_df.drop(columns = 'index', inplace = True)

In [None]:
sales_df.head()

### 4. Save the data in your files to local csv files so that it will be faster to access in the future.

In [None]:
# Writing to a csv:

def write_csv(df, csv_name):
    '''
    Save a dataframe to disk as a .csv file, leaving the dataframe's index out of the file.

    df: the dataframe to write.
    csv_name: the target file name as a string, extension included, e.g. 'example_df.csv'.
    '''
    df.to_csv(path_or_buf = csv_name, index = False)
    print('Completed writing df to .csv file')
    

In [None]:
write_csv(sales_df, 'sales_df.csv')

In [None]:
write_csv(stores_df, 'stores_df.csv')

In [None]:
write_csv(items, 'items_df.csv')

### 5. Combine the data from your three separate dataframes into one large dataframe.

In [None]:
print(sales_df.shape)
sales_df.head()

In [None]:
print(stores_df.shape)
stores_df.head()

In [None]:
print(items.shape)
items.head()

In [None]:
# so, sales_df has both of the ids needed to join the two tables. I'll need to use two joins, both of them (I believe) will be a left join.

sales_test = sales_df.copy()

In [None]:
item_test = items.copy()
item_test.info()

In [None]:
items.rename(columns = {'item_id': 'item'}, inplace = True)
items.info()

In [None]:
stores_df.rename(columns = {'store_id': 'store'}, inplace = True)
stores_df.head()

In [None]:
items.head()

In [None]:
sales_df.head()

In [None]:
left_merge = pd.merge(sales_df, items, how = 'left', on = 'item')
left_merge.head()

In [None]:
left_merge.shape

In [None]:
all_df = pd.merge(left_merge, stores_df, how = 'left', on = 'store')
all_df.head()

In [None]:
all_df.shape

In [None]:
# Saving to csv:

write_csv(all_df, 'add_df.csv')

## Open Power System Data

### 6. Acquire the Open Power Systems Data for Germany.

- Germany has been rapidly expanding its renewable energy production in recent years. The data set includes country-wide totals of electricity consumption, wind power production, and solar power production for 2006-2017. 

In [None]:
power_url = 'https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv'
# Pass the separator by keyword: pandas 2.x only accepts the file path positionally.
power_df = pd.read_csv(power_url, sep = ',')
power_df.head()

In [None]:
# A little bit of prep work:

power_df.rename(columns = {"Date": 'date', "Consumption": "consumption", "Wind": "wind", "Solar": "solar", "Wind+Solar": "wind_solar"}, inplace = True)
power_df.columns

In [None]:
# Now the function:

def get_germany_power(power_url):
    '''
    Read the Open Power Systems daily data for Germany and return it with tidy column names.

    Parameters
    ----------
    power_url : str
        URL or local path of the comma-separated data file, passed straight to pandas.read_csv.

    Returns
    -------
    pandas.DataFrame
        The file contents with the columns Date, Consumption, Wind, Solar and Wind+Solar
        renamed to date, consumption, wind, solar and wind_solar.
    '''
    tidy_names = {"Date": 'date', "Consumption": "consumption", "Wind": "wind", "Solar": "solar", "Wind+Solar": "wind_solar"}

    # The separator is left at read_csv's default (','); passing it as a second
    # positional argument is rejected by pandas 2.x.
    return pd.read_csv(power_url).rename(columns = tidy_names)

In [None]:
# testing the function..

power_df_test = get_germany_power(power_url)
power_df_test.head()

In [None]:
# writing to a csv:

write_csv(power_df_test, 'power_df.csv')

## Functions for Reproducibility

### 7. Make sure all the work that you have done above is reproducible. 
- That is, you should put the code above into separate functions in the acquire.py file and be able to re-run the functions and get the same data.

In [6]:
# Creating the items function:

def get_items_data(base_url = 'https://python.zach.lol'):
    '''
    Fetch every page of the items endpoint from Zach's web service and return the rows
    as one pandas dataframe.

    Parameters
    ----------
    base_url : str, optional
        Scheme and host of the web service. The endpoint path '/api/v1/items' and every
        `next_page` path from the payload are appended to it.

    Returns
    -------
    pandas.DataFrame
        The rows of all pages stacked in page order, with a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an error status.
    '''
    endpoint = '/api/v1/items'
    pages = []

    # Follow next_page until the service reports there is none (None), so a
    # single-page result never tries to build a URL out of None.
    while endpoint is not None:
        response = requests.get(base_url + endpoint)
        response.raise_for_status()
        payload = response.json()['payload']
        pages.append(pd.DataFrame(payload['items']))
        endpoint = payload['next_page']

    # Concatenate once at the end instead of re-copying the growing frame per page.
    return pd.concat(pages, ignore_index = True)

In [7]:
# stores function:

def get_stores_list(base_url = 'https://python.zach.lol'):
    '''
    Fetch every page of the stores endpoint from Zach's web service and return the rows
    as one pandas dataframe.

    Parameters
    ----------
    base_url : str, optional
        Scheme and host of the web service. The endpoint path '/api/v1/stores' and every
        `next_page` path from the payload are appended to it.

    Returns
    -------
    pandas.DataFrame
        The rows of all pages stacked in page order, with a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an error status.
    '''
    endpoint = '/api/v1/stores'
    pages = []

    # Keep following next_page and stop as soon as it is None; looping a fixed
    # max_page times would request `base_url + None` after the last page.
    while endpoint is not None:
        response = requests.get(base_url + endpoint)
        response.raise_for_status()
        payload = response.json()['payload']
        pages.append(pd.DataFrame(payload['stores']))
        endpoint = payload['next_page']

    return pd.concat(pages, ignore_index = True)

In [8]:
# Sales function:
# Thanks to Ryvyn and Corey for help!

def get_sales_data(base_url = 'https://python.zach.lol'):
    '''
    Fetch every page of the sales endpoint from Zach's web service and return the rows
    as one pandas dataframe.

    Progress is printed for each page (its max_page and next_page values), followed by
    the shape of the combined dataframe.

    Parameters
    ----------
    base_url : str, optional
        Scheme and host of the web service. The endpoint path '/api/v1/sales' and every
        `next_page` path from the payload are appended to it.

    Returns
    -------
    pandas.DataFrame
        The rows of all pages stacked in page order. Because the index is reset without
        dropping it, the frame carries an extra 'index' column holding each row's
        position within its own page.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an error status.
    '''
    endpoint = '/api/v1/sales'
    pages = []

    # next_page is the value None (not the string "None") on the last page, so
    # test for that directly; this also handles a single-page result.
    while endpoint is not None:
        response = requests.get(base_url + endpoint)
        response.raise_for_status()
        payload = response.json()['payload']
        print('max_page: %s' % payload['max_page'])
        print('next_page: %s' % payload['next_page'])

        pages.append(pd.DataFrame(payload['sales']))
        endpoint = payload['next_page']

    # One concat at the end avoids re-copying the accumulated rows for every page.
    # reset_index() (without drop) is kept so the returned columns stay the same.
    df_sales = pd.concat(pages).reset_index()
    print('full_shape', df_sales.shape)
    return df_sales
    

In [20]:
# Combining it all together:

def get_store_data():
    '''
    Pull the item, store and sales data from Zach's web service and join them into one dataframe.

    The sales rows are left-joined to the items on 'item' and then to the stores on 'store',
    so every sales row is kept. The shape of each of the three source dataframes is printed
    as it is acquired. This function should be the basis of where to start the prep phase.

    Returns
    -------
    pandas.DataFrame
        The sales rows with the matching item and store columns appended.
    '''
    # TODO: add a caching option so the data is not re-downloaded from the web service on every call.

    item_list = get_items_data()
    print(item_list.shape)
    store_list = get_stores_list()
    print(store_list.shape)
    sales_list = get_sales_data()
    print(sales_list.shape)

    # The sales rows are joined on columns named 'item' and 'store', so rename the
    # id columns of the lookup frames to match.
    item_list = item_list.rename(columns = {'item_id': 'item'})
    store_list = store_list.rename(columns = {'store_id': 'store'})

    # NOTE(review): sales_list still carries the 'index' column produced by the
    # reset_index() in get_sales_data, so it ends up in the merged result too.
    # Confirm whether it should be dropped before the prep phase.
    left_merge = pd.merge(sales_list, item_list, how = 'left', on = 'item')
    all_df = pd.merge(left_merge, store_list, how = 'left', on = 'store')

    return all_df

In [21]:
# Getting power data function:

def get_germany_power(power_url = 'https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv'):
    '''
    Read the Open Power Systems daily data for Germany and return it with tidy column names.

    Parameters
    ----------
    power_url : str, optional
        URL or local path of the comma-separated data file, passed straight to
        pandas.read_csv. It is optional so that both get_germany_power() and
        get_germany_power(some_url) work.

    Returns
    -------
    pandas.DataFrame
        The file contents with the columns Date, Consumption, Wind, Solar and Wind+Solar
        renamed to date, consumption, wind, solar and wind_solar.
    '''
    tidy_names = {"Date": 'date', "Consumption": "consumption", "Wind": "wind", "Solar": "solar", "Wind+Solar": "wind_solar"}

    # The separator is left at read_csv's default (','); passing it as a second
    # positional argument is rejected by pandas 2.x.
    return pd.read_csv(power_url).rename(columns = tidy_names)

In [22]:
df_all_test = get_store_data()
df_all_test.shape

(50, 6)
(10, 5)
max_page: 183
next_page: /api/v1/sales?page=2
max_page: 183
next_page: /api/v1/sales?page=3
max_page: 183
next_page: /api/v1/sales?page=4
max_page: 183
next_page: /api/v1/sales?page=5
max_page: 183
next_page: /api/v1/sales?page=6
max_page: 183
next_page: /api/v1/sales?page=7
max_page: 183
next_page: /api/v1/sales?page=8
max_page: 183
next_page: /api/v1/sales?page=9
max_page: 183
next_page: /api/v1/sales?page=10
max_page: 183
next_page: /api/v1/sales?page=11
max_page: 183
next_page: /api/v1/sales?page=12
max_page: 183
next_page: /api/v1/sales?page=13
max_page: 183
next_page: /api/v1/sales?page=14
max_page: 183
next_page: /api/v1/sales?page=15
max_page: 183
next_page: /api/v1/sales?page=16
max_page: 183
next_page: /api/v1/sales?page=17
max_page: 183
next_page: /api/v1/sales?page=18
max_page: 183
next_page: /api/v1/sales?page=19
max_page: 183
next_page: /api/v1/sales?page=20
max_page: 183
next_page: /api/v1/sales?page=21
max_page: 183
next_page: /api/v1/sales?page=22
max_p

max_page: 183
next_page: /api/v1/sales?page=175
max_page: 183
next_page: /api/v1/sales?page=176
max_page: 183
next_page: /api/v1/sales?page=177
max_page: 183
next_page: /api/v1/sales?page=178
max_page: 183
next_page: /api/v1/sales?page=179
max_page: 183
next_page: /api/v1/sales?page=180
max_page: 183
next_page: /api/v1/sales?page=181
max_page: 183
next_page: /api/v1/sales?page=182
max_page: 183
next_page: /api/v1/sales?page=183
max_page: 183
next_page: None
full_shape (913000, 6)
(913000, 6)


(913000, 15)