In [None]:
# # PIP installs

!pip install wxee
!pip install rasterio
!pip install pingouin

In [None]:
# IMPORT DEPENDENCIES

import json
import os
import shutil
import warnings
from datetime import datetime
from io import StringIO

import boto3
import ee
import pandas as pd
import pingouin as pg
import pyarrow.parquet as pq
import pyproj
import rasterio
import seaborn as sns
import wxee

warnings.filterwarnings("ignore")

In [None]:
# AUTHENTICATE GOOGLE PROJECT TO FETCH EARTH ENGINE DATA
# Authentication runs first; wxee is then initialised against the project below.

EE_PROJECT = 'earth-engine-project-422209'

ee.Authenticate()
wxee.Initialize(project=EE_PROJECT)

In [None]:
# Parquet file for country | wards | sub-county polygons
# Loads the table into the `locations` DataFrame that the execution cell iterates over.

file_path = 'Kenya_locations.parquet'
parquet_file = pq.ParquetFile(file_path)
# NOTE(review): only row group 0 is read; rows stored in any further row group
# would be left out — confirm the file has a single row group.
row_group = parquet_file.read_row_group(0)
locations = row_group.to_pandas()
locations

In [None]:
# List and count the ward directories already uploaded to S3. Run this only if a download was started and then interrupted.

# s3 = boto3.client('s3')
# def list_and_count_directories(bucket_name, prefix=''):
#     paginator = s3.get_paginator('list_objects_v2')
#     operation_parameters = {'Bucket': bucket_name, 'Prefix': prefix, 'Delimiter': '/'}
#     page_iterator = paginator.paginate(**operation_parameters)

#     directories = set()

#     for page in page_iterator:
#         for common_prefix in page.get('CommonPrefixes', []):
#             directories.add(common_prefix['Prefix'])
    
#     total_directories = len(directories)
#     return directories, total_directories

# bucket_name = 'modis13q1-ndvi'
# prefix = 'Modis data - Kenya/tiff files - Kenya/'

# directories, total_directories = list_and_count_directories(bucket_name, prefix)

# print(f"Total number of directories: {total_directories}")
# for directory in directories:
#     print(directory)

In [None]:
# Reload the locations and keep only the rows still to be downloaded after an interruption (run this only if a download was started and then interrupted).

# file_path = 'Kenya_locations.parquet'
# parquet_file = pq.ParquetFile(file_path)
# row_group = parquet_file.read_row_group(0)
# locations = row_group.to_pandas()
# locations

# locations = locations.iloc[1382:]
# locations

In [None]:
# Print the number of counties, sub-counties and wards left to download.

# print(len(locations['county'].unique()))
# print(len(locations['subcounty'].unique()))
# print(len(locations['name'].unique()))

## Functions

In [None]:
# Download tiffs from Earth Engine to local folder

def download_tiffs(coordinates, out_dir=None, start_date='2000-02-18',
                   end_date='2024-06-30', scale=250, crs="EPSG:5070"):
    """Download the MOD13Q1 images covering a polygon as GeoTIFF files.

    Parameters
    ----------
    coordinates : list
        Polygon coordinates, passed unchanged to ``ee.Geometry.Polygon``.
    out_dir : str, optional
        Directory the files are written to. When omitted, the notebook-level
        ``local_folder`` variable is used, which is what this function always
        did before the parameter existed.
    start_date, end_date : str
        Bounds handed to ``ee.Filter.date``.
    scale : int
        Value forwarded as ``scale`` to ``to_tif``.
    crs : str
        Value forwarded as ``crs`` to ``to_tif``.
        NOTE(review): EPSG:5070 is the NAD83 / CONUS Albers projection, while
        the bucket paths refer to Kenya — confirm this is intended. ``get_data``
        converts back from the same CRS, so the two must be changed together.

    Returns
    -------
    Whatever ``collection.wx.to_tif`` returns (presumably the list of written
    file paths — confirm against the wxee documentation). The previous version
    evaluated this value and discarded it.
    """
    if out_dir is None:
        # Backward-compatible fallback to the global set by the execution cell.
        out_dir = local_folder

    geometry = ee.Geometry.Polygon(coordinates)

    # Define the ImageCollection
    collection = ee.ImageCollection('MODIS/061/MOD13Q1').filter(
        ee.Filter.date(start_date, end_date)
    )

    return collection.wx.to_tif(
        out_dir=out_dir,
        prefix="wx_",
        region=geometry,
        scale=scale,
        crs=crs
    )

In [None]:
# Renaming the files to Dates

def rename_files(local_dir):
    """Rename every downloaded ``.tif`` in ``local_dir`` to ``DD-MM-YYYY.tif``.

    The date is read from the underscore-separated tokens of the file name:
    token 4 starts with the year, token 5 with the month and token 6 with the
    day (e.g. ``wx_A_B_C_2000_02_18.rest.tif`` becomes ``18-02-2000.tif``).

    Files that are not ``.tif`` are ignored. Files already named
    ``DD-MM-YYYY.tif`` are left untouched, so the function can safely be run
    again on a directory that was only partly processed.

    Raises
    ------
    ValueError
        If a ``.tif`` name has too few tokens to contain the date fields, or
        the fields do not form a valid date.
    """
    suffix = ".tif"
    for filename in os.listdir(local_dir):
        if not filename.endswith(suffix):
            continue

        # Skip files a previous run already renamed; they have no "_" tokens
        # and used to abort the whole loop with an IndexError.
        try:
            datetime.strptime(filename[:-len(suffix)], "%d-%m-%Y")
        except ValueError:
            pass
        else:
            continue

        parts = filename.split("_")
        if len(parts) < 7:
            raise ValueError(
                f"Unexpected GeoTIFF name {filename!r}: cannot locate the date fields"
            )
        year_str = parts[4][:4]
        month_str = parts[5][:2]
        day_str = parts[6][:2]

        # Parsing validates that the three fields form a real calendar date.
        date_obj = datetime.strptime(f"{day_str}/{month_str}/{year_str}", "%d/%m/%Y").date()
        new_filename = date_obj.strftime("%d-%m-%Y") + suffix
        os.rename(os.path.join(local_dir, filename), os.path.join(local_dir, new_filename))

In [None]:
# uploading the files to S3

def upload_to_s3(local_dir, ward):
    """Upload every entry of ``local_dir`` into the ward's folder in S3.

    Parameters
    ----------
    local_dir : str
        Local directory whose entries are uploaded.
    ward : str
        Folder name appended to the fixed S3 folder; each file is stored as
        ``Modis data - Kenya/tiff files - Kenya/<ward>/<file name>``.

    Uses the notebook-level boto3 client ``s3`` created in the execution cell.
    """
    bucket_name = 'modis13q1-ndvi'
    s3_folder = 'Modis data - Kenya/tiff files - Kenya/'
    s3_prefix = f'{s3_folder}{ward}/'

    for file in os.listdir(local_dir):
        local_file_path = os.path.join(local_dir, file)
        # S3 keys always use "/" as separator; os.path.join would insert "\"
        # on Windows, so the key is built by plain concatenation instead.
        s3_key = f'{s3_prefix}{file}'
        s3.upload_file(local_file_path, bucket_name, s3_key)

In [None]:
# PREPROCESSING FUNCTION Raster to dataframe

def get_data(path, date, source_crs="EPSG:5070"):
    """Flatten one raster into a single-column DataFrame keyed by pixel location.

    Parameters
    ----------
    path : str
        Raster file opened with ``rasterio.open``.
    date :
        Label used as the name of the returned column.
    source_crs : str
        CRS of the raster's x/y coordinates, converted to EPSG:4326 to obtain
        latitude/longitude. Defaults to the EPSG:5070 value used previously.

    Returns
    -------
    pandas.DataFrame
        One row per pixel in row-major order, indexed by the string
        ``"<latitude>,<longitude>"`` of the pixel, with the values of the
        raster's second band in a column named ``date``.
        NOTE(review): for MOD13Q1 the second band is presumably EVI and the
        first NDVI — confirm that ``band_2`` is the band that is wanted.
    """
    with rasterio.open(path) as src:
        image_array = src.read()  # rasterio returns (bands, rows, cols)
        band_names = [f'band_{i+1}' for i in range(src.count)]
        # One row per pixel, one column per band. Flatten each band first and
        # then transpose: reshaping straight to (-1, count) would fill a row
        # with consecutive pixels of the same band instead of one pixel's bands.
        pixels = image_array.reshape(src.count, -1).T
        df = pd.DataFrame(pixels, columns=band_names)
        # Same row-major pixel order as the flattened bands; src.xy is called
        # once per pixel and the pair is reused for both columns.
        centres = [src.xy(row, col) for row in range(src.height) for col in range(src.width)]
        df['x'] = [x for x, _ in centres]
        df['y'] = [y for _, y in centres]

    # always_xy=True keeps the (x, y) -> (lon, lat) axis order that the removed
    # Proj(init=...) / pyproj.transform combination used.
    transformer = pyproj.Transformer.from_crs(source_crs, "EPSG:4326", always_xy=True)
    lon, lat = transformer.transform(df['x'].values, df['y'].values)
    df['latitude'] = lat
    df['longitude'] = lon
    df['lat_long'] = df['latitude'].astype(str) + ',' + df['longitude'].astype(str)
    df = df.set_index('lat_long')
    return df[['band_2']].rename(columns={"band_2": date})

In [None]:
# Execution Script
# For every ward: download its GeoTIFFs into a scratch folder, rename them by
# date, upload them to S3, then delete the scratch folder.

s3 = boto3.client('s3')
bucket_name = 'modis13q1-ndvi'
local_folder = "modis_data"  # scratch folder; download_tiffs reads this global

# '/' would create an extra folder level in the S3 key, so it is replaced.
locations['name'] = locations['name'].str.replace('/', '-')
for index, row in locations.iterrows():
    # Start from an empty folder so files left by an interrupted run are never
    # renamed or uploaded under this ward.
    shutil.rmtree(local_folder, ignore_errors=True)
    os.makedirs(local_folder, exist_ok=True)
    try:
        folder_name = row['name']+'|'+row['subcounty']+'|'+row['county']
        geojson = json.loads(row['polygon'])
        # NOTE(review): only coordinates[0] is used — for a GeoJSON Polygon that
        # is the outer ring, for a MultiPolygon only the first polygon. Confirm
        # against the geometry types stored in the parquet file.
        coords = geojson['coordinates'][0]
        download_tiffs(coords)
        rename_files(local_folder)
        upload_to_s3(local_folder, folder_name)
    finally:
        # Clean up even when a step fails, so the next ward starts clean.
        shutil.rmtree(local_folder, ignore_errors=True)

In [None]:
# Metadata edits
# Rebuilds the ward table from the parquet file and adds the columns describing
# where each ward's files live in S3.

file_path = 'Kenya_locations.parquet'
parquet_file = pq.ParquetFile(file_path)
row_group = parquet_file.read_row_group(0)
locations = row_group.to_pandas()

# name_2 is the ward name with '/' replaced, i.e. the form used for the S3 folder.
locations['name_2'] = locations['name'].str.replace('/', '-')
# Built from name_2 (not the raw name) so it matches the folder name the
# execution cell uploaded to, which also had '/' replaced.
locations['s3_directory_name'] = locations['name_2']+'|'+locations['subcounty']+'|'+locations['county']
locations['root_directory'] = 'modis13q1-ndvi/Modis data - Kenya/'
locations['country'] = 'Kenya'
# .copy() so the column added below is written to an independent frame.
locations = locations[['country','name','name_2','county','subcounty','polygon','s3_directory_name','root_directory']].copy()
# root_directory already ends with '/', and upload_to_s3 stores the ward folders
# under 'tiff files - Kenya/', so that segment is inserted here instead of a
# second '/'.
locations['full_path_to_s3_ward'] = locations['root_directory'] + 'tiff files - Kenya/' + locations['s3_directory_name']
locations

In [None]:
# Write the ward metadata table to a local CSV file

locations.to_csv('Kenya-meta-data.csv')