# This notebook walks through scraping geostationary satellite (GOES-16) imagery from the University of Wisconsin's RealEarth service

RealEarth API documentation: https://realearth.ssec.wisc.edu/doc/api.php

In [10]:
import ast #allows to deal with string as dictionary
import requests
import pandas as pd

## First build a list of available dataset timestamps for a given channel
In this case we are looking at the channel G16-ABI-CONUS-BAND02, which provides cloud data from high-resolution images of the full continental US taken by the GOES-East satellite.

In [201]:
#fetch the available timestamp values for a given channel:



def available_datasets_by_time(channel, timeout=30):
    """Return the distinct values RealEarth reports for a product's timestamps.

    Queries the RealEarth ``times`` endpoint for ``channel`` and collects the
    values of the returned object, dropping duplicates while keeping the
    order in which they were returned.

    Parameters
    ----------
    channel : str
        Product identifier appended to the ``products=`` query string,
        e.g. ``"G16-ABI-CONUS-BAND02"``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so an
        unresponsive server cannot hang the notebook indefinitely.

    Returns
    -------
    list
        The unique values of the response object. Presumably each value is a
        list of timestamp strings for one product -- confirm against the
        RealEarth API documentation.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the response body is not valid JSON.
    """
    base_url = "https://realearth.ssec.wisc.edu/api/times?products="
    response = requests.get(base_url + channel, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # Parse the body as JSON: ast.literal_eval only accepts Python literals
    # and would reject valid JSON tokens such as true/false/null.
    times_by_product = response.json()

    unique_values = []
    for value in times_by_product.values():
        # List membership (not a set) because the values may be unhashable.
        if value not in unique_values:
            unique_values.append(value)
    return unique_values

# Fetch the available timestamps for the G16-ABI-CONUS-BAND02 product (performs an HTTP request).
input_list = available_datasets_by_time("G16-ABI-CONUS-BAND02")


## Next, reduce this list to one entry per hour (unique date + hour combinations)

The reasoning behind this is that the current dataset of weather values from the Dark Sky API is hourly, so we want exactly one image per hourly weather observation.

In [202]:
#look for unique combinations between date and hour stamps and use these to build a new list:

def unique_time_stamps(input_list):
    """Return the unique date+hour keys found in the first timestamp list.

    Parameters
    ----------
    input_list : list
        A list whose first element is a sequence of timestamp strings.
        Only ``input_list[0]`` is examined. The strings are presumably
        formatted like ``YYYYMMDD.HHMMSS`` -- the code takes the first eight
        characters as the date and characters ``[-6:-4]`` as the hour.

    Returns
    -------
    list of str
        One ``date + hour`` key (e.g. ``"2019040112"``) per distinct
        combination, in order of first appearance. An empty ``input_list``
        yields an empty list instead of raising ``IndexError``.
    """
    if not input_list:
        return []

    seen = set()  # O(1) membership test; output_list keeps first-seen order
    output_list = []
    for stamp in input_list[0]:
        key = stamp[0:8] + stamp[-6:-4]
        if key not in seen:
            seen.add(key)
            output_list.append(key)
    return output_list

# Collapse the fetched timestamps to one date+hour key per hour.
unique_times = unique_time_stamps(input_list)




In [203]:
#reformat times:

def reformat_times(input_list):
    """Rewrite compact date+hour keys as ``YYYY-MM-DD+HH`` strings.

    Each item is sliced positionally: characters 0-3, 4-5 and 6-7 are joined
    with ``-`` and characters 8-9 are appended after a ``+``.

    Parameters
    ----------
    input_list : iterable of str
        Keys such as ``"2019040112"``.

    Returns
    -------
    list of str
        The reformatted strings, in the same order as the input.
    """
    return [
        "-".join((stamp[:4], stamp[4:6], stamp[6:8])) + "+" + stamp[8:10]
        for stamp in input_list
    ]

# Convert the date+hour keys into the "YYYY-MM-DD+HH" form used in the image URLs' time parameter.
image_times = reformat_times(unique_times)


In [226]:
#generate list of urls:

def url_list(list_of_times, product="G16-ABI-CONUS-BAND02",
             center="37.77,-122.41", zoom=9, width=150, height=150):
    """Build one RealEarth image URL per timestamp.

    Parameters
    ----------
    list_of_times : iterable of str, or str
        Values for the ``time=`` query parameter (e.g. ``"2019-04-01+12"``).
        A single string is treated as a one-element list rather than being
        iterated character by character.
    product : str, optional
        Value for the ``products=`` query parameter.
    center : str, optional
        Value for the ``center=`` query parameter, written as ``"lat,lon"``.
    zoom, width, height : int, optional
        Values for the ``zoom=``, ``width=`` and ``height=`` query parameters.

    Returns
    -------
    list of str
        The image URLs, in the same order as ``list_of_times``. With the
        default keyword values the URLs are identical to those produced
        before these parameters existed.
    """
    if isinstance(list_of_times, str):
        # Guard against the common mistake of passing a bare timestamp.
        list_of_times = [list_of_times]

    template = (
        "https://realearth.ssec.wisc.edu/api/image?products={product}"
        "&time={time}&center={center}&zoom={zoom}"
        "&width={width}&height={height}"
    )
    return [
        template.format(product=product, time=time_stamp, center=center,
                        zoom=zoom, width=width, height=height)
        for time_stamp in list_of_times
    ]

# One image URL per reformatted timestamp, in the same order as image_times.
input_urls = url_list(image_times)

## Pull images for each url:

Images are pulled as pngs, then saved as 1D arrays (which can be used as input for ML and re-viewed via reshaping).


In [206]:
from PIL import Image
import requests
from io import BytesIO
import numpy as np

In [362]:
#take input url, retrieve png, and flatten to 1D array (C-style):

def flatten_png_from_url(input_url, timeout=30):
    """Download an image and return its first band as a flat 1-D array.

    Parameters
    ----------
    input_url : str
        URL of the image to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled download cannot hang the notebook indefinitely.

    Returns
    -------
    numpy.ndarray or None
        The image's first band flattened in row-major (C) order, or ``None``
        when the server does not answer with status 200. Callers must be
        prepared to handle ``None`` entries.
    """
    response = requests.get(input_url, timeout=timeout)
    if response.status_code != 200:
        # Best-effort: signal a failed download explicitly with None.
        return None

    input_png = Image.open(BytesIO(response.content))
    as_array = np.array(input_png)
    if as_array.ndim == 3:
        # Multi-band image (rows, cols, bands): keep only the first band.
        as_array = as_array[:, :, 0]
    # A 2-D (single-band) image is used as-is instead of raising IndexError.
    return as_array.flatten()




In [373]:
#build images dataframe with urls and timestamps

# Build the frame in one step so the three columns stay aligned row by row.
# "url" holds the URL that was actually fetched for that row; calling
# url_list() on each individual time string would iterate over its
# characters and produce a list of bogus per-character URLs instead.
images_df = pd.DataFrame(
    {
        "url": list(input_urls),
        # Each entry is a flat pixel array, or None when the download failed.
        "img_array": [flatten_png_from_url(url) for url in input_urls],
        "time_stamps": list(unique_times),
    }
)

## Save resulting dataframe. This will be updated periodically to build dataset.

Ideally, the above functions should be built into a pipeline that opens the images_df, appends new data, then re-saves. This should be integrated with the DarkSkyAPI scripts to build a single df with arrays, timestamps, and temp values etc.

In [224]:
#Display png from array example:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Take the first successfully downloaded image from images_df rather than
# relying on a variable left over from an earlier kernel session; entries
# are None (and are dropped here) when a download failed.
some_digit = images_df["img_array"].dropna().iloc[0]
# Images are requested at width=150, height=150, so the flat array reshapes
# back to a 150x150 grid.
some_digit_image = some_digit.reshape(150,150)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")


array([ 81,  81,  81, ...,  58,  58, 255], dtype=uint8)