In [2]:
import sys,os,glob
import numpy as np
import xarray as xr
import pandas as pd
import datetime
from PIL import Image
from osgeo import gdal
import requests

# Downloading and preparing MODIS inundation maps
This section downloads the GeoTIFF (tif) files for individual dates, merges them, and converts the result to NetCDF format for easier ingestion by other scripts.

In [3]:
def getModisDates(_startDate, _endDate):
    """Return the dates between two dates on which MODIS data are expected.

    Data dates fall on every 8th day of the year, counted from day 1
    (day-of-year 1, 9, 17, ..., 361), and the count restarts every year.

    Parameters
    ----------
    _startDate, _endDate : anything accepted by ``pandas.date_range``
        First and last day (both inclusive) of the period to check.

    Returns
    -------
    pandas.DatetimeIndex
        The subset of daily time stamps in the period that fall on a data date.
        Empty if the period contains none.
    """
    _dataJDays=np.arange(1,365,8)
    _allDays=pd.date_range(_startDate,_endDate,freq="D")
    # np.isin is the replacement for np.in1d, which is deprecated since NumPy 2.0
    return _allDays[np.isin(_allDays.dayofyear,_dataJDays)]

In [7]:
# Configuration of the MODIS download: period to check, remote location and local paths.

#endDate should be either a date as YYYY-MM-DD (e.g. "2020-02-01") or "today"
endDate="today"

#startDate is either a date as YYYY-MM-DD (e.g. "2020-01-01") or the number of days before the endDate
startDate=25

remoteserver="https://www.okavangodata.ub.bw"
remotedir="{}/modis/".format(remoteserver)


datadir="../data/flood/"
tifdir=datadir+"/tif/"
filenamepattern="A{}.flood.tif"
mergedfile="flood_modis_merged.nc"

verbose=True

##########################################################################################################
# strptime raises ValueError for text that is not a YYYY-MM-DD date (such as "today") and
# TypeError for non-string input. Only these two trigger the fallbacks below, so that
# unrelated problems (e.g. a keyboard interrupt) are no longer silently swallowed.
try:
    endDate=datetime.datetime.strptime(endDate, "%Y-%m-%d")
except (TypeError, ValueError):
    endDate=datetime.datetime.today() #today

try:
    startDate=datetime.datetime.strptime(startDate, "%Y-%m-%d")
except (TypeError, ValueError):
    if not isinstance(startDate,int):
        #neither a date nor a number of days - falling back on 30 days
        startDate=30
    startDate=endDate - datetime.timedelta(startDate)

if endDate<=startDate:
    print("startDate has to be before endDate. You got: \nstartDate:{} \nendDate:{}. \nExiting...".format(startDate,endDate))
    sys.exit()

print("checking data between {} and {}".format(startDate,endDate))

dates2check=getModisDates(startDate,endDate)
if len(dates2check)==0:
    print("There are no data dates between startDate and endDate. You got: \nstartDate:{} \nendDate:{}. \nExiting...".format(startDate,endDate))
    sys.exit()
    
# Downloads the MODIS tif files for all dates in dates2check that are not yet present locally.
# count - number of files downloaded successfully in this run
# update - set to True to force a rebuild of the merged file even if nothing was downloaded
count=0
update=False

#seconds to wait for the server before giving up on a request
requesttimeout=30

#checking if server is up
print("checking if {} is up".format(remoteserver))
cont=False
try:
    response=requests.head(remoteserver,timeout=requesttimeout)
    response.raise_for_status()
    cont=True
except requests.exceptions.HTTPError as errh:
    print ("Http Error:",errh)
except requests.exceptions.ConnectionError as errc:
    print ("Error Connecting:",errc)
except requests.exceptions.Timeout as errt:
    print ("Timeout Error:",errt)
except requests.exceptions.RequestException as err:
    print ("OOps: Something Else",err)

if cont:
    #server is up
    #making sure the target directory exists before anything is written into it
    os.makedirs(tifdir, exist_ok=True)
    for date in dates2check:
        file=filenamepattern.format(date.strftime("%Y%j"))
        filepath=tifdir+"/"+file
        if os.path.exists(filepath):
            if verbose:
                print("file {} exists locally. skipping...".format(file))
            continue

        if verbose:
            print("Downloading {} from {} into {}".format(file,remotedir,tifdir))
        url=remotedir+"/"+file
        try:
            #without a timeout a stalled connection would block the notebook forever
            response=requests.head(url,timeout=requesttimeout)
            if response.status_code!=200:
                if verbose:
                    print("file {} on {} does not exist. skipping...".format(file,remotedir))
                continue
            with requests.get(url, stream=True, timeout=requesttimeout) as r:
                r.raise_for_status()
                with open(filepath, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
        except requests.exceptions.RequestException as err:
            #a failed transfer must not abort the whole loop, nor leave a partial file
            #that would be taken for a complete one in the next run
            print("download of {} failed: {}".format(file,err))
            if os.path.exists(filepath):
                os.replace(filepath, filepath+".fail")
            continue

        localfilesize=os.path.getsize(filepath)
        #the header may be absent, in which case the size cannot be verified
        remotefilesize=response.headers.get('content-length')
        if verbose:
            print ("downloaded file size: {}".format(localfilesize))
            print ("expected file size: {}".format(remotefilesize))
        if remotefilesize is not None and localfilesize != int(remotefilesize):
            if verbose:
                print("something went wrong. renaming downloaded file to {}.fail".format(file))
            #os.replace overwrites a .fail file left by an earlier attempt
            os.replace(filepath, filepath+".fail")
        else:
            count=count+1
            if verbose:
                print("download successful")
else:
    print("server {} is down.".format(remoteserver))
    #NOTE(review): this forces a rebuild of the merged file from the local tif files
    #although nothing was downloaded - confirm that this is intended
    update=True

checking data between 2022-08-23 06:51:25.558584 and 2022-09-17 06:51:25.558584
checking if https://www.okavangodata.ub.bw is up
Error Connecting: HTTPSConnectionPool(host='www.okavangodata.ub.bw', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f40415510a0>, 'Connection to www.okavangodata.ub.bw timed out. (connect timeout=30)'))
server https://www.okavangodata.ub.bw is down.


In [4]:
# Rebuilds the merged netcdf file from all tif files found in tifdir.
# Runs only if new files were downloaded (count>0) or a rebuild was requested (update).
if count>0 or update:
    print("Downloaded {} new files. Updating merged file".format(count))

    files=sorted(glob.glob(tifdir+"/*flood.tif"))

    print("processing files in {}".format(tifdir))
    if len(files)==0:
        #nothing to merge - without this check the code below would fail on undefined variables
        print("no tif files found in {}. Skipping updating merged file".format(tifdir))
    else:
        dates=[]
        layers=[]
        for i,file in enumerate(files, start=1):
            if i%100==0:
                print(i)
            #file names follow the pattern AYYYYDDD.flood.tif, characters 1-7 hold year and day of year
            filedatestr=os.path.basename(file)[1:8]
            dates.append(datetime.datetime.strptime(filedatestr, '%Y%j'))
            with Image.open(file) as im:
                #only the first band is used
                layers.append(np.array(im)[:,:,0])
            if i==1:
                #the grid is read from the first file and is assumed to be identical in all files
                geo = gdal.Open(file)
                nx = geo.RasterXSize
                ny = geo.RasterYSize
                #geotransform: x of the upper-left corner, pixel width, rotation,
                #              y of the upper-left corner, rotation, pixel height (negative for north-up images)
                xmin,xsize,tmp,ymin,tmp,ysize=geo.GetGeoTransform()
                #coordinates of pixel centres: corner + (index + 0.5) * pixel size
                lons=xmin+xsize*(np.arange(nx)+0.5)
                lats=ymin+ysize*(np.arange(ny)+0.5)
                #releasing the gdal dataset
                geo = None
        firstdatestr,lastdatestr=datetime.datetime.strftime(dates[0],"%Y-%m-%d"),datetime.datetime.strftime(dates[-1],"%Y-%m-%d")

        print("Found {} files covering period between {} and {}".format(len(dates),firstdatestr,lastdatestr))

        #stacking once at the end gives the standard time,lat,lon order directly
        #and avoids copying the growing array for every file
        alldata=np.stack(layers, axis=0)

        #recoding
        #tif files contain only 3 values (0 for flooded, 255 for not flooded, and 127 for unclassified)
        # this is recoded here to 1 for flooded, 0 for not flooded and 2 for unclassified
        # the order of the three statements matters: 0 has to be recoded before 255 is mapped onto it
        alldata[alldata==0]=1
        alldata[alldata==255]=0
        alldata[alldata==127]=2

        ds = xr.Dataset(
            {"flood": (("time", "latitude","longitude"), alldata)},
            coords={
                "longitude": lons,
                "latitude": lats,
                "time": dates,
            },
        )
        ds["latitude"].attrs = {"units":"degrees_north",'standard_name':"latitude",'axis':"Y"}
        ds["longitude"].attrs = {"units":"degrees_east",'standard_name':"longitude",'axis':"X"}

        #flood=flood.rio.write_crs("epsg:4326")

        mergedfilepath=datadir+"/"+mergedfile

        print("writing netcdf file: {}".format(mergedfilepath))
        ds.to_netcdf(mergedfilepath)
        print("finished")
else:
    print("No new files downloaded. Skipping updating merged file")


Downloaded 0 new files. Updating merged file
processing files in ../data/flood//tif/
100
200
300
400
500
600
700
Found 704 files covering period between 2000-04-30 and 2022-07-12
writing netcdf file: ../data/flood//flood_modis_merged.nc
finished


# Downloading and preparing CHIRPS rainfall data
This section downloads the GeoTIFF (tif) files for individual days, merges them, and converts the result to NetCDF format for easier ingestion by other scripts.

In [47]:
# Configuration of the CHIRPS download: period to check, remote locations and local paths.

#endDate should be either a date as YYYY-MM-DD (e.g. "2020-02-01") or "today"
endDate="today"

#startDate is either a date as YYYY-MM-DD (e.g. "2020-01-01") or the number of days before the endDate
startDate=25


remoteserver="https://data.chc.ucsb.edu"
remotedir="{}/products/CHIRPS-2.0/africa_daily/tifs/p25/".format(remoteserver)
prelimremotedir="{}/products/CHIRPS-2.0/prelim/global_daily/tifs/p25/".format(remoteserver)


datadir="../data/rainfall/"
tifdir=datadir+"/chirps-v2.0/tifs/"
filenamepattern="chirps-v2.0.{}.tif"
prelimtifdir=datadir+"/chirps-v2.0-prelim/tifs/"
prelimfilenamepattern="chirps-v2.0.{}.tif"

mergedfile="pr_chirps_merged.nc"


verbose=True

##########################################################################################################
# strptime raises ValueError for text that is not a YYYY-MM-DD date (such as "today") and
# TypeError for non-string input. Only these two trigger the fallbacks below, so that
# unrelated problems (e.g. a keyboard interrupt) are no longer silently swallowed.
try:
    endDate=datetime.datetime.strptime(endDate, "%Y-%m-%d")
except (TypeError, ValueError):
    endDate=datetime.datetime.today() #today

try:
    startDate=datetime.datetime.strptime(startDate, "%Y-%m-%d")
except (TypeError, ValueError):
    if not isinstance(startDate,int):
        #this should not be lower than 50 - this is because chirps gets updated in monthly batches,
        # the entire month gets updated on the 16th of the next month, so on the 16th we need to check for
        # and update data since the beginning of the previous month
        startDate=60
    startDate=endDate - datetime.timedelta(startDate)

if endDate<=startDate:
    print("startDate has to be before endDate. You got: \nstartDate:{} \nendDate:{}. \nExiting...".format(startDate,endDate))
    sys.exit()

print("checking data between {} and {}".format(startDate,endDate))
#formatting as YYYY-MM-DD drops the time of day, so that the range consists of whole days
dates2check=pd.date_range(startDate.strftime("%Y-%m-%d"),endDate.strftime("%Y-%m-%d"), freq="D")


checking data between 2022-08-23 07:41:41.073665 and 2022-09-17 07:41:41.073665


In [48]:
def download_file(_url,_localfilepath, verbose=True, timeout=30):
    """Download ``_url`` into ``_localfilepath`` and check the size of the result.

    The file is requested only if a HEAD request for ``_url`` returns status 200.
    After the transfer the size of the local file is compared with the
    ``content-length`` reported by the HEAD request; on mismatch, or if the
    transfer itself fails, the local file is renamed to ``<_localfilepath>.fail``
    so that it is not mistaken for a complete file.

    Parameters
    ----------
    _url : str
        Address of the remote file.
    _localfilepath : str
        Path of the file to write. Its directory has to exist.
    verbose : bool
        Print progress messages if True.
    timeout : float
        Seconds to wait for the server before giving up on a request.

    Returns
    -------
    bool
        True if the file was downloaded (and its size verified, where the server
        reports one), False otherwise.
    """
    try:
        #without a timeout a stalled connection would block forever
        response=requests.head(_url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        if verbose:
            print(" could not check {}: {}. skipping...".format(_url,err))
        return False
    if response.status_code!=200:
        if verbose:
            print(" {} does not exist. skipping...".format(_url))
        return False

    try:
        with requests.get(_url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(_localfilepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except requests.exceptions.RequestException as err:
        if verbose:
            print(" download of {} failed: {}".format(_url,err))
        #a partial file must not be taken for a complete one in the next run
        if os.path.exists(_localfilepath):
            os.replace(_localfilepath, _localfilepath+".fail")
        return False

    localfilesize=os.path.getsize(_localfilepath)
    #the header may be absent, in which case the size cannot be verified
    remotefilesize=response.headers.get('content-length')
    if verbose:
        print ("downloaded file size: {}".format(localfilesize))
        print ("expected file size: {}".format(remotefilesize))
    if remotefilesize is not None and localfilesize != int(remotefilesize):
        if verbose:
            print("something went wrong. renaming downloaded file to {}.fail".format(_localfilepath))
        #os.replace overwrites a .fail file left by an earlier attempt
        os.replace(_localfilepath, _localfilepath+".fail")
        return False

    if verbose:
        print("download successful")
    return True


In [50]:
# Downloads the CHIRPS tif files for all dates in dates2check that are not yet present locally.
# For dates for which the final product is not available, the preliminary product is tried instead.
# count - number of files downloaded successfully in this run
# update - set to True to force a rebuild of the merged file even if nothing was downloaded
count=0
update=False

#checking if server is up
print("checking if {} is up".format(remoteserver))
cont=False
try:
    #without a timeout a stalled connection would block the notebook forever
    response=requests.head(remoteserver,timeout=30)
    response.raise_for_status()
    cont=True
except requests.exceptions.HTTPError as errh:
    print ("Http Error:",errh)
except requests.exceptions.ConnectionError as errc:
    print ("Error Connecting:",errc)
except requests.exceptions.Timeout as errt:
    print ("Timeout Error:",errt)
except requests.exceptions.RequestException as err:
    print ("Oops: Something Else",err)

if cont:
    #server is up
    #making sure the target directories exist before anything is written into them
    os.makedirs(tifdir, exist_ok=True)
    os.makedirs(prelimtifdir, exist_ok=True)
    for date in dates2check:
        datestr=date.strftime("%Y.%m.%d")
        yearstr=date.strftime("%Y")
        file=filenamepattern.format(datestr)
        localfilepath=tifdir+"/"+file
        if os.path.exists(localfilepath):
            if verbose:
                print("file {} exists locally. skipping...".format(file))
            continue

        if verbose:
            print("Downloading {} from {} into {}".format(file,remotedir,tifdir))
        url="{}/{}/{}".format(remotedir,yearstr,file)
        if download_file(url,localfilepath,verbose=verbose):
            count=count+1
            continue

        #final file could not be downloaded - downloading prelim file
        prelimfile=prelimfilenamepattern.format(datestr)
        prelimlocalfilepath=prelimtifdir+"/"+prelimfile
        if os.path.exists(prelimlocalfilepath):
            if verbose:
                print("file {} exists locally. skipping...".format(prelimfile))
        else:
            if verbose:
                print("Downloading {}/{}".format(prelimremotedir,prelimfile))
            url="{}/{}/{}".format(prelimremotedir,yearstr,prelimfile)
            #NOTE(review): prelim files are counted as new data too - confirm that the
            #code using count is meant to react to them
            if download_file(url,prelimlocalfilepath,verbose=verbose):
                count=count+1
else:
    print("server {} is down.".format(remoteserver))
    #NOTE(review): this requests an update although nothing was downloaded - confirm that this is intended
    update=True

checking if https://data.chc.ucsb.edu is up
file chirps-v2.0.2022.08.23.tif exists locally. skipping...
file chirps-v2.0.2022.08.24.tif exists locally. skipping...
file chirps-v2.0.2022.08.25.tif exists locally. skipping...
file chirps-v2.0.2022.08.26.tif exists locally. skipping...
file chirps-v2.0.2022.08.27.tif exists locally. skipping...
file chirps-v2.0.2022.08.28.tif exists locally. skipping...
file chirps-v2.0.2022.08.29.tif exists locally. skipping...
file chirps-v2.0.2022.08.30.tif exists locally. skipping...
file chirps-v2.0.2022.08.31.tif exists locally. skipping...
Downloading chirps-v2.0.2022.09.01.tif from https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p25/ into ../data/rainfall//chirps-v2.0/tifs/
 https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p25//2022/chirps-v2.0.2022.09.01.tif does not exist. skipping...
file chirps-v2.0.2022.09.01.tif exists locally. skipping...
Downloading chirps-v2.0.2022.09.02.tif from https://data.chc.ucsb.edu/pr

 https://data.chc.ucsb.edu/products/CHIRPS-2.0/prelim/global_daily/tifs/p25//2022/chirps-v2.0.2022.09.16.tif does not exist. skipping...
Downloading chirps-v2.0.2022.09.17.tif from https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p25/ into ../data/rainfall//chirps-v2.0/tifs/
 https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p25//2022/chirps-v2.0.2022.09.17.tif does not exist. skipping...
Downloading https://data.chc.ucsb.edu/products/CHIRPS-2.0/prelim/global_daily/tifs/p25//chirps-v2.0.2022.09.17.tif
 https://data.chc.ucsb.edu/products/CHIRPS-2.0/prelim/global_daily/tifs/p25//2022/chirps-v2.0.2022.09.17.tif does not exist. skipping...
