In [26]:
import sys,os,glob
import numpy as np
import xarray as xr
import pandas as pd
import datetime
from PIL import Image
from osgeo import gdal
import requests
import gzip
import shutil
import rioxarray

# Downloading and preparing MODIS inundation maps
This downloads tif files for individual dates (one file every 8 days), merges them and converts the result to netCDF format for easier ingestion in other scripts.

In [27]:
def getModisDates(_startDate, _endDate):
    """Return the data dates that fall within a period.

    Data dates are every 8th day of the year counted from day 1
    (days 1, 9, ..., 361); the count restarts on 1 January of each year.

    Parameters
    ----------
    _startDate, _endDate : anything accepted by ``pandas.date_range``
        The period is sampled in daily steps from _startDate up to _endDate.

    Returns
    -------
    pandas.DatetimeIndex
        The sampled days whose day-of-year is a data date. Empty if the
        period contains none.
    """
    _dataJDays=np.arange(1,365,8)
    _allDays=pd.date_range(_startDate,_endDate,freq="D")
    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0
    return _allDays[np.isin(_allDays.dayofyear,_dataJDays)]

In [28]:
# --- period to check -------------------------------------------------------
# endDate: a date given as "YYYY-MM-DD", or "today"
# (e.g. endDate="2020-02-01")
endDate="today"

# startDate: a date given as "YYYY-MM-DD", or an integer number of days
# before endDate (e.g. startDate="2020-01-01")
startDate=70

# --- remote source ---------------------------------------------------------
# example of a full file url:
# http://www.okavangodata.ub.bw/modis/products_okavango/A2022249.flood.tif
remoteserver="http://www.okavangodata.ub.bw"
remotedir=remoteserver+"/modis/products_okavango/"

# --- local storage ---------------------------------------------------------
datadir="../data/flood/"
tifdir=datadir+"/tif/"
# {} is the placeholder for the date part of the file name
filenamepattern="A{}.flood.tif"
mergedfile="flood_modis_merged.nc"

# print progress messages while running
verbose=True

##########################################################################################################
# Turn the configured endDate/startDate into datetime objects.
# endDate: a "YYYY-MM-DD" string is parsed; anything else (e.g. "today") means now.
try:
    endDate=datetime.datetime.strptime(endDate, "%Y-%m-%d")
except (TypeError, ValueError):
    # ValueError: a string that is not YYYY-MM-DD; TypeError: not a string at all.
    # Only these are caught so that unrelated errors are not silently swallowed.
    endDate=datetime.datetime.today() #today

# startDate: a "YYYY-MM-DD" string is parsed; an integer is taken as the number
# of days before endDate; anything else falls back to 30 days before endDate.
try:
    startDate=datetime.datetime.strptime(startDate, "%Y-%m-%d")
except (TypeError, ValueError):
    if not isinstance(startDate,int):
        startDate=30
    startDate=endDate - datetime.timedelta(startDate)

if endDate<=startDate:
    print("startDate has to be before endDate. You got: \nstartDate:{} \nendDate:{}. \nExiting...".format(startDate,endDate))
    sys.exit()

print("checking data between {} and {}".format(startDate,endDate))

# dates within the period for which a data file is expected
dates2check=getModisDates(startDate,endDate)
if len(dates2check)==0:
    print("There are no data dates between startDate and endDate. You got: \nstartDate:{} \nendDate:{}. \nExiting...".format(startDate,endDate))
    sys.exit()
    
# count: number of files downloaded in this run
# update: set when the merged file should be rebuilt even if nothing was downloaded
count=0
update=False

#checking if server is up
print("checking if {} is up".format(remoteserver))
cont=False
try:
    response=requests.head(remoteserver,timeout=30)
    response.raise_for_status()
    cont=True
except requests.exceptions.HTTPError as errh:
    print ("Http Error:",errh)
except requests.exceptions.ConnectionError as errc:
    print ("Error Connecting:",errc)
except requests.exceptions.Timeout as errt:
    print ("Timeout Error:",errt)
except requests.exceptions.RequestException as err:
    print ("OOps: Something Else",err)

if cont:
    #server is up
    # the target directory has to exist before files can be written into it
    os.makedirs(tifdir, exist_ok=True)
    for date in dates2check:
        print("checking {}".format(date))
        file=filenamepattern.format(date.strftime("%Y%j"))
        filepath=tifdir+"/"+file
        if os.path.exists(filepath):
            if verbose:
                print("file {} exists locally. skipping...".format(file))
            continue

        if verbose:
            print("Downloading {} from {} into {}".format(file,remotedir,tifdir))
        url=remotedir+"/"+file
        try:
            # a timeout keeps a stalled connection from blocking the cell forever
            response=requests.head(url,timeout=30)
        except requests.exceptions.RequestException as err:
            print("could not query {}: {}. skipping...".format(url,err))
            continue
        if response.status_code!=200:
            if verbose:
                print("file {} on {} does not exist. skipping...".format(file,remotedir))
            continue

        # Download into a temporary name: an interrupted download must never
        # leave a truncated file under the final name, because later runs
        # would treat it as already downloaded and merge it.
        partpath=filepath+".part"
        try:
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(partpath, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
        except (requests.exceptions.RequestException, OSError) as err:
            print("download of {} failed: {}".format(url,err))
            if os.path.exists(partpath):
                os.replace(partpath, filepath+".fail")
            continue

        localfilesize=os.path.getsize(partpath)
        # the size can only be verified if the server reported one
        expectedsize=response.headers.get('content-length')
        remotefilesize=int(expectedsize) if expectedsize is not None else None
        if verbose:
            print ("downloaded file size: {}".format(localfilesize))
            print ("expected file size: {}".format("unknown" if remotefilesize is None else remotefilesize))
        if remotefilesize is not None and localfilesize != remotefilesize:
            if verbose:
                print("something went wrong. removing downloaded file")
            # the faulty file is kept under a .fail name for inspection
            os.replace(partpath, filepath+".fail")
        else:
            os.replace(partpath, filepath)
            count=count+1
            if verbose:
                print("download successful")
else:
    print("server {} is down.".format(remoteserver))
    update=True

checking data between 2022-08-09 08:33:23.012673 and 2022-10-18 08:33:23.012673
checking if http://www.okavangodata.ub.bw is up
checking 2022-08-13 08:33:23.012673
file A2022225.flood.tif exists locally. skipping...
checking 2022-08-21 08:33:23.012673
file A2022233.flood.tif exists locally. skipping...
checking 2022-08-29 08:33:23.012673
file A2022241.flood.tif exists locally. skipping...
checking 2022-09-06 08:33:23.012673
file A2022249.flood.tif exists locally. skipping...
checking 2022-09-14 08:33:23.012673
file A2022257.flood.tif exists locally. skipping...
checking 2022-09-22 08:33:23.012673
file A2022265.flood.tif exists locally. skipping...
checking 2022-09-30 08:33:23.012673
file A2022273.flood.tif exists locally. skipping...
checking 2022-10-08 08:33:23.012673
Downloading A2022281.flood.tif from http://www.okavangodata.ub.bw/modis/products_okavango/ into ../data/flood//tif/
file A2022281.flood.tif on http://www.okavangodata.ub.bw/modis/products_okavango/ does not exist. skippi

In [29]:
# Merge all downloaded flood tif files into a single netcdf file.
# NOTE(review): update is forced to True here, so the merged file is rebuilt on
# every run and the "No new files downloaded" branch below is never reached.
# Remove this line to rebuild only when the download step asks for it.
update=True
if count>0 or update:
    print("Downloaded {} new files. Updating merged file".format(count))

    files=sorted(glob.glob(tifdir+"/*flood.tif"))

    print("processing {} files in {}".format(len(files), tifdir))

    if len(files)==0:
        # nothing to merge; without this guard the code below fails on undefined names
        print("no tif files found in {}. Skipping updating merged file".format(tifdir))
    else:
        # Collect one 2-D layer per file and stack once at the end; appending
        # to a growing array copies everything again for every file.
        dates=[]
        layers=[]
        for i,file in enumerate(files, start=1):
            if i%100==0:
                print(i)
            # file names look like A2022249.flood.tif: characters 1-7 are year + day of year
            filedatestr=os.path.basename(file)[1:8]
            dates.append(datetime.datetime.strptime(filedatestr, '%Y%j'))
            # the context manager closes the image even if reading it fails
            with Image.open(file) as im:
                # only the first band is used; copy() lets the other bands be freed
                layers.append(np.array(im)[:,:,0].copy())

        # stacking along a new first axis gives the standard time,lat,lon order directly
        alldata=np.stack(layers, axis=0)

        # grid geometry is read from the first file
        geo=gdal.Open(files[0])
        nx=geo.RasterXSize
        ny=geo.RasterYSize
        # geotransform: x of left edge, pixel width, rotation, y of top edge, rotation, pixel height
        xedge,xsize,_,yedge,_,ysize=geo.GetGeoTransform()
        geo=None  # releases the GDAL dataset
        # The geotransform origin is the outer corner of the first pixel, so the
        # centre of pixel i lies (i+0.5) pixel sizes away from it. Spreading nx
        # points over the full edge-to-edge extent would stretch the spacing.
        lons=xedge+xsize*(np.arange(nx)+0.5)
        lats=yedge+ysize*(np.arange(ny)+0.5)

        firstdatestr=dates[0].strftime("%Y-%m-%d")
        lastdatestr=dates[-1].strftime("%Y-%m-%d")

        print("Found {} files covering period between {} and {}".format(len(dates),firstdatestr,lastdatestr))

        #recoding
        #tif files contain only 3 values (0 for flooded, 255 for not flooded, and 127 for unclassified)
        # this is recoded here to 1 for flooded, 0 for not flooded and 2 for unclassified
        # the order matters: 0 has to be recoded before 255 is mapped onto 0
        alldata[alldata==0]=1
        alldata[alldata==255]=0
        alldata[alldata==127]=2

        ds = xr.Dataset(
            {"flood": (("time", "latitude","longitude"), alldata)},
            coords={
                "longitude": lons,
                "latitude": lats,
                "time": dates,
            },
        )
        ds["latitude"].attrs = {"units":"degrees_north",'standard_name':"latitude",'axis':"Y"}
        ds["longitude"].attrs = {"units":"degrees_east",'standard_name':"longitude",'axis':"X"}

        # TODO: the coordinate reference system is not recorded in the file

        mergedfilepath=datadir+"/"+mergedfile

        print("writing netcdf file: {}".format(mergedfilepath))
        ds.to_netcdf(mergedfilepath)
        print("finished")
else:
    print("No new files downloaded. Skipping updating merged file")


Downloaded 0 new files. Updating merged file
processing 711 files in ../data/flood//tif/
100
200
300
400
500
600
700
Found 711 files covering period between 2000-04-30 and 2022-09-30
writing netcdf file: ../data/flood//flood_modis_merged.nc
finished


# Downloading and preparing CHIRPS rainfall data
This downloads monthly CHIRPS tif files (and daily preliminary files for a month whose monthly file is not yet available), merges them and converts the result to netCDF format for easier ingestion in other scripts.

In [34]:
# --- period to check -------------------------------------------------------
# endDate: a date given as "YYYY-MM-DD", or "today"
# (e.g. endDate="2022-10-01")
endDate="today"

# startDate: a date given as "YYYY-MM-DD", or an integer number of days
# before endDate
startDate="1981-01-01"

# --- remote source ---------------------------------------------------------
remoteserver="https://data.chc.ucsb.edu"
remotedir_monthly=remoteserver+"/products/CHIRPS-2.0/africa_monthly/tifs/"
remotedir_prelim=remoteserver+"/products/CHIRPS-2.0/prelim/global_daily/tifs/p25/"

# --- local storage ---------------------------------------------------------
datadir="../data/rainfall/"
tifdir_monthly=datadir+"/chirps-v2.0/tifs/"
tifdir_prelim=datadir+"/chirps-v2.0-prelim/tifs/"
# {} is the placeholder for the date part of the file name
filenamepattern_monthly="chirps-v2.0.{}.tif"
filenamepattern_prelim="chirps-v2.0.{}.tif"

mergedfile="pr_mon_CHG_CHIRPS-2.0-0p25-prelim_merged_okavango.nc"
mergedfilepath=datadir+"/"+mergedfile

# --- area to extract -------------------------------------------------------
minlon,maxlon=15,26
# note: minlat holds the northern edge (-11) and maxlat the southern edge (-22)
minlat,maxlat=-11,-22

# print progress messages while running
verbose=True

##########################################################################################################
# Turn the configured endDate/startDate into datetime objects.
# endDate: a "YYYY-MM-DD" string is parsed; anything else (e.g. "today") means now.
try:
    endDate=datetime.datetime.strptime(endDate, "%Y-%m-%d")
except (TypeError, ValueError):
    # ValueError: a string that is not YYYY-MM-DD; TypeError: not a string at all.
    # Only these are caught so that unrelated errors are not silently swallowed.
    endDate=datetime.datetime.today() #today

# startDate: a "YYYY-MM-DD" string is parsed; an integer is taken as the number
# of days before endDate; anything else falls back to the default below.
try:
    startDate=datetime.datetime.strptime(startDate, "%Y-%m-%d")
except (TypeError, ValueError):
    if not isinstance(startDate,int):
        #this should not be lower than 50 - this is because chirps gets updated in monthly batches,
        # the entire month gets updated on the 16th of the next month, so on the 16th we need to check for
        # and update data since the beginning of the previous month
        startDate=60
    startDate=endDate - datetime.timedelta(startDate)

if endDate<=startDate:
    print("startDate has to be before endDate. You got: \nstartDate:{} \nendDate:{}. \nExiting...".format(startDate,endDate))
    sys.exit()

print("checking monthly data between {} and {}".format(startDate,endDate))
# one entry (the month end) for every month that ends within the period
# NOTE(review): pandas 2.2+ deprecates the "M" alias in favour of "ME"; "M" is
# kept here because older pandas versions do not know "ME".
dates2check_monthly=pd.date_range(startDate.strftime("%Y-%m-%d"),endDate.strftime("%Y-%m-%d"), freq="M")


checking monthly data between 1981-01-01 00:00:00 and 2022-10-18 08:46:50.686888


In [35]:
def download_file(_url,_localfilepath, verbose=True, timeout=60):
    """Download _url into _localfilepath and check the size of the result.

    The file is first written to ``_localfilepath + ".part"`` and only moved
    to its final name once the download has completed, so an interrupted
    download never leaves a truncated file under the final name.

    Parameters
    ----------
    _url : str
        Address of the remote file.
    _localfilepath : str
        Where to store the file. The directory has to exist.
    verbose : bool
        Print progress messages.
    timeout : float
        Seconds to wait for the server before giving up on a request.

    Returns
    -------
    bool
        True if the file was downloaded (and, when the server reports a
        size, the downloaded size matches it). False if the file does not
        exist on the server, the request failed, or the size does not
        match; in the last two cases whatever was downloaded is kept as
        ``_localfilepath + ".fail"``.
    """
    try:
        response=requests.head(_url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        if verbose:
            print(" {} could not be reached ({}). skipping...".format(_url,err))
        return False
    if response.status_code!=200:
        if verbose:
            print(" {} does not exist. skipping...".format(_url))
        return False

    partfilepath=_localfilepath+".part"
    failfilepath=_localfilepath+".fail"
    try:
        with requests.get(_url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partfilepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except (requests.exceptions.RequestException, OSError) as err:
        if verbose:
            print("download of {} failed: {}".format(_url,err))
        if os.path.exists(partfilepath):
            os.replace(partfilepath, failfilepath)
        return False

    localfilesize=os.path.getsize(partfilepath)
    # the size can only be verified if the server reported one
    expectedsize=response.headers.get('content-length')
    remotefilesize=int(expectedsize) if expectedsize is not None else None
    if verbose:
        print ("downloaded file size: {}".format(localfilesize))
        print ("expected file size: {}".format("unknown" if remotefilesize is None else remotefilesize))
    if remotefilesize is not None and localfilesize != remotefilesize:
        if verbose:
            print("something went wrong. removing downloaded file")
        # the faulty file is kept under a .fail name for inspection
        os.replace(partfilepath, failfilepath)
        return False

    os.replace(partfilepath, _localfilepath)
    if verbose:
        print("download successful")
    return True


In [36]:
# count: number of files downloaded in this run
# update: set when the merged file should be rebuilt even if nothing was downloaded
count=0
update=False

#checking if server is up
print("checking if {} is up".format(remoteserver))
# cont stays False unless the server answers without an error status
cont=False
try:
    # without a timeout an unresponsive server would block the cell forever
    response=requests.head(remoteserver,timeout=30)
    response.raise_for_status()
    cont=True
    print("it is up")
except requests.exceptions.HTTPError as errh:
    print ("Http Error:",errh)
except requests.exceptions.ConnectionError as errc:
    print ("Error Connecting:",errc)
except requests.exceptions.Timeout as errt:
    print ("Timeout Error:",errt)
except requests.exceptions.RequestException as err:
    print ("Oops: Something Else",err)

checking if https://data.chc.ucsb.edu is up
it is up


In [37]:
# For every month in dates2check_monthly make sure the monthly tif exists
# locally: use it if present, otherwise unzip a local gzip file, otherwise
# download and unzip it. If the monthly file cannot be downloaded, the daily
# preliminary files for that month are downloaded instead.
if cont:
    #server is up
    # the target directories have to exist before files can be written into them
    os.makedirs(tifdir_monthly, exist_ok=True)
    os.makedirs(tifdir_prelim, exist_ok=True)

    # number of days in the most recent month for which no monthly file was
    # available; 0 if monthly files were available for all months
    missingdays=0
    for mdate in dates2check_monthly:
        #https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_monthly/tifs/chirps-v2.0.1981.01.tif.gz
        file_monthly=filenamepattern_monthly.format(mdate.strftime("%Y.%m"))
        print(file_monthly)
        gzipfile_monthly="{}.gz".format(file_monthly)
        localfilepath_monthly=tifdir_monthly+"/"+file_monthly
        localgzipfilepath_monthly=tifdir_monthly+"/"+gzipfile_monthly

        if os.path.exists(localfilepath_monthly):
            if verbose:
                print("file {} exists locally. skipping...".format(file_monthly))
            continue

        if verbose:
            print("Checking if gzip file {} exists".format(gzipfile_monthly))
        havegzip=os.path.exists(localgzipfilepath_monthly)
        if havegzip:
            if verbose:
                print("gzip file exists. unzipping...")
        else:
            if verbose:
                print("Downloading {} from {} into {}".format(gzipfile_monthly,remotedir_monthly,tifdir_monthly))
            url="{}/{}".format(remotedir_monthly,gzipfile_monthly)
            havegzip=download_file(url,localgzipfilepath_monthly,verbose=verbose)
            if havegzip and verbose:
                print("unzipping {} into {}".format(gzipfile_monthly,file_monthly))

        if havegzip:
            # Unzip into a temporary name first: if unzipping fails half way,
            # no truncated tif is left behind to be mistaken for a valid file.
            partfilepath_monthly=localfilepath_monthly+".part"
            with gzip.open(localgzipfilepath_monthly) as f_in:
                with open(partfilepath_monthly, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            os.replace(partfilepath_monthly, localfilepath_monthly)
            continue

        #downloading prelim files, one per day of the month
        missingdays=mdate.days_in_month
        for day in range(1,missingdays+1):
            daystr="{}.{}".format(mdate.strftime("%Y.%m"),str(day).zfill(2))
            prelimfile=filenamepattern_prelim.format(daystr)
            prelimlocalfilepath=tifdir_prelim+"/"+prelimfile
            if os.path.exists(prelimlocalfilepath):
                if verbose:
                    print("file {} exists locally. skipping...".format(prelimfile))
            else:
                if verbose:
                    print("Downloading {}/{}".format(remotedir_prelim,prelimfile))
                # prelim files are stored on the server in one directory per year
                url="{}/{}/{}".format(remotedir_prelim,mdate.strftime("%Y"),prelimfile)
                download_file(url,prelimlocalfilepath,verbose=verbose)

else:
    print("server {} is down.".format(remoteserver))
    update=True



chirps-v2.0.1981.01.tif
file chirps-v2.0.1981.01.tif exists locally. skipping...
chirps-v2.0.1981.02.tif
file chirps-v2.0.1981.02.tif exists locally. skipping...
chirps-v2.0.1981.03.tif
file chirps-v2.0.1981.03.tif exists locally. skipping...
chirps-v2.0.1981.04.tif
file chirps-v2.0.1981.04.tif exists locally. skipping...
chirps-v2.0.1981.05.tif
file chirps-v2.0.1981.05.tif exists locally. skipping...
chirps-v2.0.1981.06.tif
file chirps-v2.0.1981.06.tif exists locally. skipping...
chirps-v2.0.1981.07.tif
file chirps-v2.0.1981.07.tif exists locally. skipping...
chirps-v2.0.1981.08.tif
file chirps-v2.0.1981.08.tif exists locally. skipping...
chirps-v2.0.1981.09.tif
file chirps-v2.0.1981.09.tif exists locally. skipping...
chirps-v2.0.1981.10.tif
file chirps-v2.0.1981.10.tif exists locally. skipping...
chirps-v2.0.1981.11.tif
file chirps-v2.0.1981.11.tif exists locally. skipping...
chirps-v2.0.1981.12.tif
file chirps-v2.0.1981.12.tif exists locally. skipping...
chirps-v2.0.1982.01.tif
file

In [15]:
# Read all monthly tif files, cut out the area of interest and combine them
# into one array along a time axis.
files=sorted(glob.glob("{}/{}".format(tifdir_monthly,filenamepattern_monthly.format("*"))))
print("processing {} files in {}".format(len(files), tifdir_monthly))

if len(files)==0:
    # fail with a clear message instead of a NameError further down
    raise FileNotFoundError("no monthly tif files found in {}".format(tifdir_monthly))

# Collect the monthly fields in a list and concatenate once at the end;
# concatenating inside the loop copies all previous data for every file.
monthly=[]
for i,file in enumerate(files, start=1):
    if i%100==0:
        print(i)
    # file names look like chirps-v2.0.1981.01.tif: characters 12-18 are year.month
    filedatestr=os.path.basename(file)[12:19]
    filedate=datetime.datetime.strptime(filedatestr, '%Y.%m')
    # the context manager closes the file again; load() reads the values
    # into memory before that happens
    with xr.open_dataset(file) as src:
        data=src.band_data[0,:,:].sel(x=slice(minlon,maxlon), y=slice(minlat,maxlat)).expand_dims(time=[filedate]).load()
    data.name="pr"
    monthly.append(data)

ds=xr.concat(monthly, dim="time")

# average blocks of 5x5 cells, padding incomplete blocks at the edges
ds = ds.coarsen(x=5, y=5, boundary='pad').mean()
# round the coordinates of the coarse grid to 3 decimals
ds=ds.assign_coords({"y":np.round(ds.y.data,3), "x":np.round(ds.x.data,3)})

processing 501 files in ../data/rainfall//chirps-v2.0/tifs/
100
200
300
400
500


In [22]:
# If no monthly file was available for some month, build the monthly total of
# the last month checked (mdate) from the daily preliminary files and append
# it to the monthly data.
# NOTE(review): this cell changes ds in place, so it must not be run twice
# without re-running the cell that creates ds.
if missingdays>0:
    # number of daily files needed for a complete month
    ndays=mdate.days_in_month
    files=glob.glob("{}/{}".format(tifdir_prelim,filenamepattern_prelim.format(mdate.strftime("%Y.%m.*"))))
    print("processing {} files in {}".format(len(files), tifdir_prelim))

    if len(files)==ndays:
        daily=[]
        for day in range(1,ndays+1):
            filedatestr="{}.{}".format(mdate.strftime("%Y.%m"),str(day).zfill(2))
            filedate=datetime.datetime.strptime(filedatestr, '%Y.%m.%d')
            file="{}/{}".format(tifdir_prelim,filenamepattern_prelim.format(filedatestr))
            # the context manager closes the file again; load() reads the
            # values into memory before that happens
            with xr.open_dataset(file) as src:
                data=src.band_data[0,:,:].sel(x=slice(minlon,maxlon), y=slice(minlat,maxlat)).expand_dims(time=[filedate]).load()
            daily.append(data)
        # The monthly fields are stamped with the first day of their month, so
        # the prelim total gets the same kind of time stamp (mdate itself is
        # the last day of the month).
        monthstart=datetime.datetime(mdate.year,mdate.month,1)
        dsp=xr.concat(daily, dim="time").sum("time").expand_dims(time=[monthstart])
        # same variable name as the monthly data, so the merged array keeps its name
        dsp.name="pr"
        print("merging monthly and prelim files")
        ds=xr.concat([ds,dsp],dim="time")
        # NOTE(review): the dimensions are renamed only in this branch, so the
        # merged file has x/y dimensions when no prelim data are used - confirm
        # which names the scripts reading the file expect.
        ds=ds.rename({"x":"longitude","y":"latitude"})
    else:
        print("month has {} days, but there are only {} available. skipping..".format(ndays,len(files)))
else:
    print("monthly data available for the entire period. No need to process prelim data. Skipping...")

monthly data available for the entire period. No need to process prelim data. Skipping...


In [25]:
# report which file is about to be (re)written
print("writing merged netcdf file {}".format(mergedfile))

# delete any earlier version so that the file is written afresh
if os.path.exists(mergedfilepath):
    os.unlink(mergedfilepath)

# show the dimensions of the merged array, then write it
print(ds.shape)
ds.to_netcdf(mergedfilepath)

writing merged netcdf file pr_mon_CHG_CHIRPS-2.0-0p25-prelim_merged_okavango.nc
(501, 44, 44)
