# This notebook goes through the scraping of geostationary satellite (GOES-16) data from UWI RealEarth

RealEarth API documentation: https://realearth.ssec.wisc.edu/doc/api.php

In [694]:
import ast #allows to deal with string as dictionary
import requests
import pandas as pd
import time
import os

## First build a list of available dataset timestamps for a given channel
In this case we are looking at the channel G16-ABI-CONUS-BAND02: high-resolution imagery of the full continental US (CONUS) from the GOES-East satellite, which we use as a source of cloud data.

In [546]:
#identify available datasets:

def available_datasets_by_time(channel, timeout=30):
    """
    Fetch the timestamps RealEarth currently offers for a product.

    Queries the RealEarth ``times`` endpoint for ``channel`` and returns the
    distinct values found in the JSON object of the response.

    Parameters
    ----------
    channel : str
        Product identifier appended to the ``products=`` query parameter,
        e.g. "G16-ABI-CONUS-BAND02".
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        The values of the response object, in response order, with exact
        duplicates dropped. Presumably one list of timestamp strings per
        requested product (unique_time_stamps() reads element [0] that way)
        -- verify against the API documentation.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the response body is not valid JSON.
    """
    base_url = "https://realearth.ssec.wisc.edu/api/times?products="
    r = requests.get(base_url + channel, timeout=timeout)
    # Surface HTTP errors here instead of failing later while parsing an error page.
    r.raise_for_status()
    # The endpoint serves JSON; a JSON parser also copes with null/true/false,
    # which ast.literal_eval rejects.
    times_by_product = r.json()

    datasets = []
    for val in times_by_product.values():
        # Values are lists (unhashable), so de-duplicate by equality.
        if val not in datasets:
            datasets.append(val)
    return datasets

## Next, subset this list to only contain the first timestamp for each hour

The reasoning behind this is that the current dataset of weather values from DarkSkyAPI is hourly, and we want exactly one image per hourly weather observation.

In [202]:
#look for unique combinations between date and hour stamps and use these to build a new list:

def unique_time_stamps(input_list):
    """
    Reduce a timestamp collection to one key per (date, hour).

    Only the first element of ``input_list`` is examined. Each timestamp in
    it contributes its first eight characters (the date) and the two
    characters at positions [-6:-4] (the hour); the first timestamp seen for
    each date+hour combination determines where that key appears.

    Parameters
    ----------
    input_list : list
        A list whose first element is a sequence of timestamp strings
        (assumes a layout like "YYYYMMDD.HHMMSS" -- TODO confirm against
        the API). An empty ``input_list`` yields an empty result.

    Returns
    -------
    list of str
        Ten-character date+hour keys (e.g. "2018080518") in order of first
        appearance.
    """
    # Nothing to subset if the API returned no timestamp collections at all.
    if len(input_list) == 0:
        return []

    seen = set()        # constant-time membership test
    output_list = []    # preserves first-appearance order
    for stamp in input_list[0]:
        key = stamp[0:8] + stamp[-6:-4]
        if key not in seen:
            seen.add(key)
            output_list.append(key)
    return output_list

In [203]:
#reformat times:

def reformat_times(input_list):
    """
    Convert compact date+hour keys into the form used in image request URLs.

    Each entry is sliced into year [0:4], month [4:6], day [6:8] and
    hour [8:10] and re-joined as "YYYY-MM-DD+HH", e.g.
    "2018080518" -> "2018-08-05+18".

    Parameters
    ----------
    input_list : iterable of str
        Compact keys such as those returned by unique_time_stamps().

    Returns
    -------
    list of str
        One reformatted string per input entry, in the same order.
    """
    def _with_separators(stamp):
        year, month, day, hour = stamp[0:4], stamp[4:6], stamp[6:8], stamp[8:10]
        return "-".join((year, month, day)) + "+" + hour

    return [_with_separators(stamp) for stamp in input_list]

In [226]:
#generate list of urls:

def url_list(list_of_times):
    """
    Build one RealEarth image-request URL per timestamp.

    Every URL requests the G16-ABI-CONUS-BAND02 product as a 150x150 image
    centred on 37.77,-122.41 at zoom level 9; only the ``time`` parameter
    varies.

    Parameters
    ----------
    list_of_times : iterable of str
        Timestamps already in URL form (e.g. "2018-08-05+18"). Pass a list:
        a bare string is iterated character by character and yields one
        meaningless URL per character.

    Returns
    -------
    list of str
        The request URLs, in input order.
    """
    prefix = "https://realearth.ssec.wisc.edu/api/image?products=G16-ABI-CONUS-BAND02&time="
    suffix = "&center=37.77,-122.41&zoom=9&width=150&height=150"
    return [prefix + stamp + suffix for stamp in list_of_times]

## Pull images for each url:

Images are pulled as PNGs, then the first colour channel of each image is flattened to a 1D array (which can be used as input for ML and viewed again by reshaping it to the original height × width).


In [206]:
from PIL import Image
import requests
from io import BytesIO
import numpy as np

In [362]:
#take input url, retrieve png, and flatten to 1D array (C-style):

def flatten_png_from_url(input_url):
    """
    Download an image and return its first channel as a flat array.

    Parameters
    ----------
    input_url : str
        URL of the image to fetch.

    Returns
    -------
    numpy.ndarray or None
        The first channel of the decoded image, flattened in C (row-major)
        order. ``None`` when the server does not answer with status 200.
    """
    response = requests.get(input_url)
    if response.status_code != 200:
        # No image to decode; callers receive None for this URL.
        return None

    pixels = np.array(Image.open(BytesIO(response.content)))
    # [:, :, 0] needs a 3-D (rows, cols, channel) array.
    # NOTE(review): a single-band image would decode to 2-D and raise IndexError here -- confirm what the server sends.
    first_channel = pixels[:, :, 0]
    return first_channel.flatten()

In [373]:
#build images dataframe with urls and timestamps

# Derive every input inside this cell so it runs on a fresh kernel; the
# chain mirrors the one used in build_new_df().
all_datasets = available_datasets_by_time("G16-ABI-CONUS-BAND02")
unique_times = unique_time_stamps(all_datasets)   # compact keys, e.g. "2018080518"
image_times = reformat_times(unique_times)        # URL form, e.g. "2018-08-05+18"
input_urls = url_list(image_times)

images_df = pd.DataFrame(columns=["url", "img_array"])
# url_list() expects a list of times; calling it on one time string iterates
# its characters, so use the URLs built above (one per row).
images_df["url"] = input_urls
# One network request per URL; entries are None where the download failed.
images_df["img_array"] = [flatten_png_from_url(url) for url in input_urls]
images_df["time_stamps"] = list(unique_times)

In [393]:
#Save an example image:

url = "https://realearth.ssec.wisc.edu/api/image?products=G16-ABI-CONUS-BAND02&time=2018-08-05+18&center=37.77,-122.41&zoom=9&width=300&height=300"
response = requests.get(url)
# Fail with a clear HTTP error rather than handing an error page to the image decoder.
response.raise_for_status()
image = Image.open(BytesIO(response.content))
image.save("realearth20180805SF.png")
# Keep this as the last expression so the notebook actually renders the image.
image

## Save resulting dataframe. This will be updated periodically to build dataset.

Ideally, the above functions should be built into a pipeline that opens the images_df, appends new data, then re-saves. This should be integrated with the DarkSkyAPI scripts to build a single df with arrays, timestamps, and temp values etc.

In [448]:
# Write the scraped frame to disk without the index column. This file is the
# one later passed to build_new_df() as the existing dataset to extend.
# NOTE(review): each img_array cell is written via its string form; check
# that long arrays are not abbreviated with "..." in the CSV.
images_df.to_csv("RealEarthImages080818.csv", encoding='utf-8', index=False)

## Build pipeline for data to allow for easy updating with new data:

Rather than feeding data from one function to another sequentially, a function should be built to manage those handoffs and to open an existing dataframe and append data to it. 

In [691]:
#find new data to append:

def _compact_stamp(value):
    """Return a stored time stamp as a compact digit string such as "2018080518"."""
    # A column containing gaps is read back as float (2018080518.0).
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    # Also accept the URL form ("2018-08-05+18") that earlier merges may have stored.
    return str(value).replace("-", "").replace("+", "")


def build_new_df(input_df, channel):
    """
    Extend a saved image dataset with the hours it does not contain yet.

    Reads the CSV at ``input_df``, asks RealEarth which timestamps exist for
    ``channel``, downloads one image for every hour missing from the CSV,
    appends those rows, writes the merged frame to a timestamped CSV under
    ``output_csv_files/`` and returns it.

    Parameters
    ----------
    input_df : str
        Path of the existing dataset CSV. It must contain a "time_stamps"
        column.
    channel : str
        Product used to look up available times and to name the output
        file. Note that the image URLs come from url_list(), which
        hard-codes its own product.

    Returns
    -------
    pandas.DataFrame
        Old rows followed by the newly scraped rows, with a fresh index.
        New rows carry compact "YYYYMMDDHH" time stamps.
    """
    df = pd.read_csv(input_df)
    all_datasets = available_datasets_by_time(channel)
    datasets_by_hour = unique_time_stamps(all_datasets)

    # Compare as normalised strings so the check works whether pandas read
    # the column back as int, float or str.
    known_stamps = {_compact_stamp(stamp) for stamp in df["time_stamps"]}

    #only keep time points that are not already in the old df:
    new_timepoints = [item for item in datasets_by_hour if item not in known_stamps]
    image_times = reformat_times(new_timepoints)

    #collect urls to scrape for new df:
    input_urls = url_list(image_times)

    #build new df:
    new_df = pd.DataFrame(columns=["url", "img_array"])
    # One URL per row. url_list() must not be called on a single time string,
    # because it would iterate over the string's characters.
    new_df["url"] = input_urls
    # One network request per URL; entries are None where the download failed.
    new_df["img_array"] = [flatten_png_from_url(url) for url in input_urls]
    # Store the compact form so the next run's cross-check recognises these rows;
    # the URL form ("2018-08-05+18") would never match and trigger re-downloads.
    new_df["time_stamps"] = new_timepoints

    #merge new and old dfs and reset index:
    merged_df = pd.concat([df, new_df]).reset_index(drop=True)

    #save as csv with current date and time such that most recent file can be referenced on next build:
    output_dir = "output_csv_files"
    os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories
    timestr = time.strftime("%Y%m%d-%H%M%S")
    # NOTE(review): img_array cells are written via their string form; NumPy
    # abbreviates long arrays with "...", so verify the pixel data survives the CSV.
    merged_df.to_csv(output_dir + "/" + timestr + "_" + channel + "_Merge.csv", encoding='utf-8', index=False)

    return merged_df

In [692]:
# Extend the saved dataset with any newly available hours. Despite its name,
# new_df holds the merged (old + new) frame returned by build_new_df().
new_df = build_new_df("RealEarthImages080818.csv", "G16-ABI-CONUS-BAND02")

In [697]:
%%bash
# List the contents of the directory build_new_df() writes its merged CSVs to.
cd output_csv_files
ls

20180809-115217_G16-ABI-CONUS-BAND02_Merge.csv
