# Okavango CHIRPS data processing

The following code reads in raster data of precipitation from the CHIRPS dataset to calculate monthly precipitation data for the study period in the upper watershed, as well as within a 100 km buffer of the study area.

Author: James (Huck) Rees;
        PhD Student, UCSB Geography
        
Date: February 29, 2024

# Reset code if needed

In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


# 1. Import packages

In [2]:
import geemap
import ee
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import statistics as stats
import scipy.stats as stats
import json
import folium
from ipyleaflet import Map, DrawControl
from shapely.geometry import shape
from io import StringIO
from shapely.geometry import shape
import os
import rasterio
from rasterio.mask import mask
from shapely.geometry import box
from rasterio.plot import show
from shapely.geometry import mapping
import requests
import os
import gzip
import shutil
from datetime import datetime
import fiona
import re
import matplotlib.dates as mdates
from itertools import cycle
import csv
ee.Initialize()

# CHIRPS Data Processing and Preliminary Analysis

## 1. Scrape monthly CHIRPS data from online repository
Be cautious with this function. It will take a while to run 

In [3]:
def download_and_extract_chirps(start_year, end_year, target_directory):
    base_url = "https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_dekad/tifs/"
    
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            for dekad in range(1, 4):
                filename = f"chirps-v2.0.{year}.{month:02d}.{dekad}.tif.gz"
                url = f"{base_url}/{filename}"
                
                # Download the file
                print(f"Downloading {url}")
                response = requests.get(url, stream=True)
                if response.status_code == 200:
                    gz_path = os.path.join(target_directory, filename)
                    with open(gz_path, 'wb') as f:
                        f.write(response.content)
                    
                    # Extract the .tif file
                    tif_path = gz_path[:-3]  # Remove the '.gz' extension
                    with gzip.open(gz_path, 'rb') as f_in:
                        with open(tif_path, 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
                    
                    # Optionally, remove the .gz file after extraction
                    os.remove(gz_path)
                else:
                    print(f"Failed to download {url}")

## 2. Clip rasters to study area boundaries and get precipitation at each dekad time step

In [4]:
def rasterio_clip_and_calculate_mean(shapefile_path, raster_path, output_raster_path):
    with fiona.open(shapefile_path, "r") as shapefile:
        shapes = [feature["geometry"] for feature in shapefile]
    
    with rasterio.open(raster_path) as src:
        out_image, out_transform = rasterio.mask.mask(src, shapes, crop=True, nodata=0)
        out_meta = src.meta
    
    masked_array = np.ma.masked_equal(out_image, 0)
    mean_value = masked_array.mean()
    
    if np.ma.is_masked(mean_value):
        mean_value = np.nan  # Handle completely masked arrays
    
    out_meta.update({"driver": "GTiff", "height": out_image.shape[1], "width": out_image.shape[2], "transform": out_transform})
    with rasterio.open(output_raster_path, "w", **out_meta) as dest:
        dest.write(out_image)
    
    return mean_value

def process_folder(input_folder, shapefile_path, output_folder):
    raster_files = [f for f in os.listdir(input_folder) if f.endswith('.tif')]
    results = []  # Use a list to collect dictionaries

    for raster_file in raster_files:
        raster_path = os.path.join(input_folder, raster_file)
        output_raster_path = os.path.join(output_folder, f"Clipped_{raster_file}")
        
        match = re.search(r'(\d{4})\.(\d{2})\.(\d)\.tif', raster_file)
        if match:
            year, month, dekad = match.groups()
            # Approximate the day based on dekad
            day_map = {'1': '05', '2': '15', '3': '25'}
            approx_day = day_map[dekad]
            date = f"{year}-{month}-{approx_day}"
            
            mean_value = rasterio_clip_and_calculate_mean(shapefile_path, raster_path, output_raster_path)
            
            # Add the constructed date and mean value to the results list
            results.append({"Date": pd.to_datetime(date, format="%Y-%m-%d"), "Year": year, "Month": month, "Dekad": dekad, "Mean pixel value": mean_value})
        else:
            print(f"Filename does not match expected format and will be skipped: {raster_file}")

    # Convert the list of dictionaries to a DataFrame and sort by Date
    df = pd.DataFrame(results).sort_values(by="Date")
    return df

## 3. Creates dataframe of cumulative precipitation by water year from dataframe of dekad precipitation

In [5]:
def add_cumulative_rainfall(df):
    # Convert 'Date' to datetime if it's not already
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Define a function to determine the water year
    def get_water_year(date):
        if date.month >= 7:
            return date.year + 1  # Water year starts in July
        else:
            return date.year
    
    # Apply the function to create a new 'Water Year' column
    df['Water Year'] = df['Date'].apply(get_water_year)
    
    # Sort the DataFrame by Date to ensure correct cumulative sum calculation
    df = df.sort_values(by='Date')
    
    # Group by 'Water Year' and calculate the cumulative sum of precipitation within each group
    df['Cumulative Rainfall'] = df.groupby('Water Year')['Mean pixel value'].cumsum()
    
    return df

## 4. Calculate total precipitation for the water year

In [6]:
def summarize_precipitation_by_wateryear(results_df):
    # Ensure 'Date' column is of datetime type
    results_df['Date'] = pd.to_datetime(results_df['Date'])

    # Generate 'Water Year' column based on the date
    results_df['Water Year'] = results_df['Date'].apply(lambda x: f"{x.year-1}_{x.year}" if x.month < 7 else f"{x.year}_{x.year+1}")

    # Group by 'Water Year' and sum the 'Mean pixel value' for each group
    summary_df = results_df.groupby('Water Year')['Mean pixel value'].sum().reset_index()

    # Rename the columns for clarity
    summary_df.columns = ['Water Year', 'Total Precipitation (mm)']

    return summary_df

## 5. Run code to dowload and process CHIRPS data

In [None]:
# This folder is a destiantion for continent-level dedadal CHIRPS data:
CHIRPS_Dekad_Africa_FolderPath = r'C:\Users\huckr\Desktop\UCSB\Okavango\Data\Precip\Dekad_Africa'
CHIRPS_Dekad_Africa_StartYear = 2024
CHIRPS_Dekad_Africa_EndYear = 2025

# Be cautious with this function. It will take a while to run
download_and_extract_chirps(CHIRPS_Dekad_Africa_StartYear, CHIRPS_Dekad_Africa_EndYear, CHIRPS_Dekad_Africa_FolderPath)

In [11]:
# Once the continent level data is downloaded, we clip it using the shape input below and output the clipped precipitation
# rasters and a csv of mean pixel values to the desitation or "output_folder". Make sure that the shapefile path and output
# folder refer to the same shape:
shapefile_path = r"C:\Users\huckr\Desktop\UCSB\Okavango\Data\StudyAreas\Upstream_Watersheds\US_Watersheds_WGS84.shp"

# Total precipitation by dekad
output_folder = r'C:\Users\huckr\Desktop\UCSB\Okavango\Data\Precip\Dekad_Clipped_Upper_Watershed'
os.makedirs(output_folder, exist_ok=True)
df_results = process_folder(CHIRPS_Dekad_Africa_FolderPath, shapefile_path, output_folder)
df_results.to_csv(os.path.join(output_folder, "total_precip_dekad.csv"), index=False)

# Total precipitation by water year
summary_df = summarize_precipitation_by_wateryear(df_results)
summary_df.to_csv(os.path.join(output_folder, "total_precip_wateryear.csv"), index=False)

   Water Year  Total Precipitation (mm)
0   1980_1981                439.689107
1   1981_1982                821.477437
2   1982_1983                727.917180
3   1983_1984                915.129061
4   1984_1985                795.881200
5   1985_1986                866.317313
6   1986_1987                761.911153
7   1987_1988                793.286300
8   1988_1989                961.226222
9   1989_1990                743.926910
10  1990_1991                778.214030
11  1991_1992                794.199219
12  1992_1993                751.346770
13  1993_1994                780.154533
14  1994_1995                683.490113
15  1995_1996                645.242091
16  1996_1997                870.740113
17  1997_1998                714.776271
18  1998_1999                866.299556
19  1999_2000                852.314074
20  2000_2001                807.607160
21  2001_2002                798.628945
22  2002_2003                736.964548
23  2003_2004                932.327736


# Additional utilities for importing study areas and extracting upstream watersheds

## 1. Import study regions

Pathnames to locally stored shapefiles

In [2]:
# Shapefile path
delta_bnd_path = r"C:\Users\huckr\Desktop\UCSB\Okavango\Data\StudyAreas\Delta_Simplified\Delta_WGS84.shp"
delta_buff_path = r"C:\Users\huckr\Desktop\UCSB\Okavango\Data\StudyAreas\DeltaBuffer\Delta_100km_Buffer_WGS84.shp"
US_watershed_path = r"C:\Users\huckr\Desktop\UCSB\Okavango\Data\StudyAreas\Upstream_Watersheds\US_Watersheds_WGS84.shp"

Geemap feature collection of delta boundary

In [4]:
# Load the study area using geopandas
StudArGDF = gpd.read_file(delta_bnd_path)

# Convert geodataframe to geojson then to FeatureCollection
StudAr = geemap.geopandas_to_ee(StudArGDF)

Code for extracting upstream watershed boundaries (shouldn't need to run)

In [19]:
selected_ids = [1041479540, 1041477980]
watershed_fc = ee.FeatureCollection("WWF/HydroATLAS/v1/Basins/level04") \
    .filter(ee.Filter.inList('HYBAS_ID', selected_ids))

def ee_featurecollection_to_geopandas(fc):
    """Converts a ee.FeatureCollection to a GeoPandas GeoDataFrame."""
    # Extract features from the FeatureCollection
    features = fc.getInfo()['features']
    # Convert features to GeoJSON
    geojsons = [feature['geometry'] for feature in features]
    # Convert GeoJSONs to Shapely geometries and create a GeoDataFrame
    geometries = [shape(geojson) for geojson in geojsons]
    gdf = gpd.GeoDataFrame(geometry=geometries)
    gdf['MAIN_BAS'] = [feature['properties']['MAIN_BAS'] for feature in features]
    return gdf

# Convert directly to GeoPandas GeoDataFrame and dissolve by 'MAIN_BAS'
watersheds = ee_featurecollection_to_geopandas(watershed_fc).dissolve(by='MAIN_BAS')