# Getting Soil Grid Data

In [None]:
import pandas as pd
import requests
import json
import time
import random
import os
import concurrent.futures
from tqdm import tqdm
import threading

# Add a lock to prevent race conditions when saving data
save_lock = threading.Lock()

def _flatten_soilgrids_layers(data, result):
    """Flatten the layer/depth/value nesting of a SoilGrids response into ``result``.

    Every value is stored under ``<property>_<depth label>_<value type>``, with
    dashes in the depth label replaced by ``_to_`` (``0-5cm`` -> ``0_to_5cm``).
    A response without ``properties.layers`` leaves ``result`` unchanged.
    """
    if 'properties' not in data or 'layers' not in data['properties']:
        return result

    for layer in data['properties']['layers']:
        prop_name = layer.get('name', 'unknown')
        for depth in layer.get('depths', []):
            # The depth label is the string form used for column naming
            clean_depth = depth.get('label', 'unknown').replace('-', '_to_')
            for value_type, value in depth.get('values', {}).items():
                result[f"{prop_name}_{clean_depth}_{value_type}"] = value
    return result


def get_soilgrids_point(lon, lat, point_idx, properties=None, max_retries=3, timeout=30):
    """Get SoilGrids data for a single point with correct field mapping.

    Args:
        lon: Longitude sent as the ``lon`` query parameter.
        lat: Latitude sent as the ``lat`` query parameter.
        point_idx: Caller's identifier for the point; echoed back as
            ``point_index`` in the returned dict.
        properties: SoilGrids property names to request. Defaults to
            ``['soc', 'clay', 'sand', 'silt', 'bdod', 'phh2o']``.
        max_retries: Maximum number of request attempts.
        timeout: Value passed to ``requests.get(timeout=...)``. Without it a
            stalled connection would block the calling worker indefinitely
            and the retry logic below would never get a chance to run.

    Returns:
        A dict that always contains ``point_index``, ``lon`` and ``lat``.
        On success it also holds one ``<property>_<depth>_<value type>`` entry
        per returned value; on failure it holds an ``error`` string instead.
        This function does not raise for request or parsing errors.
    """
    if properties is None:
        properties = ['soc', 'clay', 'sand', 'silt', 'bdod', 'phh2o']

    url = "https://rest.isric.org/soilgrids/v2.0/properties/query"
    params = {
        'lon': lon,
        'lat': lat,
        'property': properties,
        'depth': ['0-5cm', '5-15cm', '15-30cm'],
        'value': ['mean']
    }

    for retry in range(max_retries):
        try:
            response = requests.get(url, params=params, timeout=timeout)

            # Handle rate limiting: back off 15-30 s (randomised so workers
            # do not all resume at the same moment) and use up one attempt
            if response.status_code == 429:
                time.sleep(15 + random.random() * 15)
                continue

            if response.status_code == 200:
                # Start with basic info, then add the flattened measurements
                result = {'point_index': point_idx, 'lon': lon, 'lat': lat}
                return _flatten_soilgrids_layers(response.json(), result)

            failure = f"Status {response.status_code}"
        except Exception as e:
            # Deliberately broad: network errors, timeouts and malformed
            # payloads are all reported in the result rather than raised
            failure = f"Exception: {str(e)}"

        # Linear backoff between attempts; the last attempt reports the failure
        if retry < max_retries - 1:
            time.sleep(10 * (retry + 1))
        else:
            return {'point_index': point_idx, 'lon': lon, 'lat': lat,
                    'error': failure}

    # Reached when the final attempt was rate limited, or max_retries is 0
    return {'point_index': point_idx, 'lon': lon, 'lat': lat,
            'error': "Max retries reached"}

def process_point(args):
    """Unpack one task tuple and fetch its SoilGrids data after a short random pause.

    ``args`` is ``(lon, lat, idx, properties)``. The pause of up to two
    seconds staggers the workers so they do not all hit the API at once.
    """
    longitude, latitude, point_idx, props = args
    stagger_seconds = 2 * random.random()
    time.sleep(stagger_seconds)
    return get_soilgrids_point(longitude, latitude, point_idx, props)

def save_checkpoint(results, filename, verbose=True):
    """Write ``results`` to ``filename`` as CSV while holding ``save_lock``.

    The rows go to ``<filename>.temp`` first and are then moved over
    ``filename`` with ``os.replace``, so a partially written file never takes
    the checkpoint's place. Any exception is reported with ``print`` and
    swallowed; the function returns nothing.
    """
    with save_lock:
        try:
            frame = pd.DataFrame(results)
            staging_path = f"{filename}.temp"
            # Stage the write, then swap it into place in one step
            frame.to_csv(staging_path, index=False)
            os.replace(staging_path, filename)
            if verbose:
                print(f"Saved checkpoint with {len(results)} points to {filename}")
        except Exception as err:
            print(f"Error saving checkpoint: {str(err)}")

def get_soilgrids_parallel(coordinates_df, num_workers=4, lon_col='GPS_LONG', lat_col='GPS_LAT',
                           properties=None, cache_file='soilgrids_parallel.csv',
                           checkpoint_interval=10, debug=True):
    """
    Retrieve soil data for multiple points in parallel using multiple workers

    Each row of ``coordinates_df`` is identified by its index label, which is
    stored as ``point_index`` in the output. If ``cache_file`` already exists,
    points whose index appears in it are skipped and the cached rows are kept
    in the result, so an interrupted run can be resumed.

    Args:
        coordinates_df: DataFrame with coordinates
        num_workers: Number of parallel workers (default: 4)
        lon_col: Column name for longitude
        lat_col: Column name for latitude
        properties: List of SoilGrids properties to retrieve
        cache_file: Output file name; also read at start-up to resume
        checkpoint_interval: Save intermediate results every N points
            (0 or None disables intermediate checkpoints)
        debug: Enable additional debug output

    Returns:
        DataFrame with the cached rows (if any) followed by the newly
        retrieved rows in the order the requests completed.
    """
    if properties is None:
        properties = ['soc', 'clay', 'sand', 'silt', 'bdod', 'phh2o']

    # Print the input data to verify it's correct
    if debug:
        print("Input coordinate data sample:")
        print(coordinates_df.head())
        print(f"Longitude column: {lon_col}, Latitude column: {lat_col}")

    # Check for existing cache to resume from
    results = []

    if os.path.exists(cache_file):
        try:
            existing_df = pd.read_csv(cache_file)
            if len(existing_df) > 0:
                # Validate the cache (it must have a point_index column) before
                # touching any state, so that a failure really does start from
                # scratch instead of keeping stale rows in `results`.
                # NOTE: rows that ended in an 'error' entry are counted as
                # processed too, so they are not retried on resume.
                processed_indices = set(existing_df['point_index'].unique())
                cached_results = existing_df.to_dict('records')
                remaining_df = coordinates_df[~coordinates_df.index.isin(processed_indices)]
                print(f"Found {len(processed_indices)} already processed points in {cache_file}")
                print(f"Remaining points to process: {len(remaining_df)}")
                results = cached_results
                coordinates_df = remaining_df
        except Exception as e:
            print(f"Error reading existing cache: {str(e)}. Starting from scratch.")

    if len(coordinates_df) == 0:
        print("All points already processed!")
        return pd.DataFrame(results)

    # Prepare arguments for parallel processing
    args_list = []
    for idx, row in coordinates_df.iterrows():
        # Verify and clean coordinate values; rows that cannot be converted
        # to float are reported and skipped
        try:
            lon = float(row[lon_col])
            lat = float(row[lat_col])
            args_list.append((lon, lat, idx, properties))
            if debug and len(args_list) <= 5:
                print(f"Prepared point {idx}: lon={lon}, lat={lat}")
        except (ValueError, TypeError) as e:
            print(f"Error with coordinates at index {idx}: {e}")
            print(f"Row data: {row}")

    print(f"Processing {len(args_list)} points with {num_workers} workers")

    completed_count = 0

    # Use ThreadPoolExecutor for parallel HTTP requests
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit all tasks
        future_to_args = {executor.submit(process_point, args): args for args in args_list}

        # Use tqdm for a progress bar
        for future in tqdm(concurrent.futures.as_completed(future_to_args), total=len(args_list)):
            args = future_to_args[future]
            point_idx = args[2]

            try:
                result = future.result()
                if result:
                    results.append(result)
                    completed_count += 1

                    # Save intermediate results periodically. The truthiness
                    # guard keeps an interval of 0 from raising
                    # ZeroDivisionError, which would otherwise be reported
                    # below as a failure of this point.
                    if checkpoint_interval and completed_count % checkpoint_interval == 0:
                        save_checkpoint(results, cache_file)

            except Exception as e:
                print(f"\nError processing point {point_idx}: {str(e)}")

    # Save final results
    save_checkpoint(results, cache_file, verbose=False)

    # Verify the final output by reading the file back
    try:
        final_df = pd.read_csv(cache_file)
        print(f"Final output has {len(final_df)} rows and {len(final_df.columns)} columns")
        print("Column names:", final_df.columns.tolist())
        print("First few rows:")
        print(final_df.head())
    except Exception as e:
        print(f"Error verifying final output: {str(e)}")

    return pd.DataFrame(results)

# Example usage:
# df = pd.read_csv('coordinates.csv', index_col=0)  # Set the first column as index if that's your point_index
# results = get_soilgrids_parallel(df, num_workers=4)

In [None]:
# Read the lab dataset and keep only the coordinate columns
target_raw = pd.read_csv('data/France_lab.csv')
long_lat = target_raw[['GPS_LONG', 'GPS_LAT']]

# Fetch SoilGrids values for every coordinate row (resumes from the cache file)
get_soilgrids_parallel(
    long_lat,
    num_workers=4,
    properties=['soc', 'clay', 'sand', 'silt', 'bdod', 'phh2o'],
    debug=False,
)

Found 30 already processed points in soilgrids_parallel.csv
Remaining points to process: 2777
Processing 2777 points with 4 workers


  0%|          | 10/2777 [00:50<5:09:59,  6.72s/it]

Saved checkpoint with 40 points to soilgrids_parallel.csv


  1%|          | 20/2777 [01:04<56:41,  1.23s/it]  

Saved checkpoint with 50 points to soilgrids_parallel.csv


  1%|          | 30/2777 [01:13<37:26,  1.22it/s]  

Saved checkpoint with 60 points to soilgrids_parallel.csv


  1%|▏         | 40/2777 [02:15<2:35:14,  3.40s/it]

Saved checkpoint with 70 points to soilgrids_parallel.csv


  2%|▏         | 50/2777 [02:28<1:05:40,  1.45s/it]

Saved checkpoint with 80 points to soilgrids_parallel.csv


  2%|▏         | 60/2777 [03:01<2:21:07,  3.12s/it]

Saved checkpoint with 90 points to soilgrids_parallel.csv


  3%|▎         | 70/2777 [03:20<1:15:20,  1.67s/it]

Saved checkpoint with 100 points to soilgrids_parallel.csv


  3%|▎         | 79/2777 [03:29<49:23,  1.10s/it]  

Saved checkpoint with 110 points to soilgrids_parallel.csv


  3%|▎         | 91/2777 [04:05<1:03:29,  1.42s/it]

Saved checkpoint with 120 points to soilgrids_parallel.csv


  4%|▎         | 100/2777 [04:34<1:31:36,  2.05s/it]

Saved checkpoint with 130 points to soilgrids_parallel.csv


  4%|▍         | 110/2777 [04:59<3:54:43,  5.28s/it]

Saved checkpoint with 140 points to soilgrids_parallel.csv


  4%|▍         | 120/2777 [05:15<1:22:47,  1.87s/it]

Saved checkpoint with 150 points to soilgrids_parallel.csv


  5%|▍         | 130/2777 [05:45<54:45,  1.24s/it]  

Saved checkpoint with 160 points to soilgrids_parallel.csv


  5%|▌         | 140/2777 [06:16<3:12:27,  4.38s/it]

Saved checkpoint with 170 points to soilgrids_parallel.csv


  5%|▌         | 150/2777 [06:46<1:19:39,  1.82s/it]

Saved checkpoint with 180 points to soilgrids_parallel.csv


  6%|▌         | 160/2777 [06:56<53:31,  1.23s/it]  

Saved checkpoint with 190 points to soilgrids_parallel.csv


  6%|▌         | 170/2777 [07:52<1:56:46,  2.69s/it]

Saved checkpoint with 200 points to soilgrids_parallel.csv


  6%|▋         | 180/2777 [08:02<32:22,  1.34it/s]  

Saved checkpoint with 210 points to soilgrids_parallel.csv


  7%|▋         | 190/2777 [08:46<5:39:19,  7.87s/it]

Saved checkpoint with 220 points to soilgrids_parallel.csv


  7%|▋         | 200/2777 [08:58<46:12,  1.08s/it]  

Saved checkpoint with 230 points to soilgrids_parallel.csv


  8%|▊         | 210/2777 [09:17<1:45:14,  2.46s/it]

Saved checkpoint with 240 points to soilgrids_parallel.csv


  8%|▊         | 220/2777 [10:02<3:17:58,  4.65s/it]

Saved checkpoint with 250 points to soilgrids_parallel.csv


  8%|▊         | 230/2777 [10:33<2:30:56,  3.56s/it]

Saved checkpoint with 260 points to soilgrids_parallel.csv


  9%|▊         | 240/2777 [11:05<1:22:47,  1.96s/it]

Saved checkpoint with 270 points to soilgrids_parallel.csv


  9%|▉         | 250/2777 [11:50<5:38:05,  8.03s/it]

Saved checkpoint with 280 points to soilgrids_parallel.csv


  9%|▉         | 260/2777 [12:21<58:23,  1.39s/it]  

Saved checkpoint with 290 points to soilgrids_parallel.csv


 10%|▉         | 269/2777 [12:28<37:41,  1.11it/s]

Saved checkpoint with 300 points to soilgrids_parallel.csv


 10%|█         | 280/2777 [13:27<5:59:57,  8.65s/it]

Saved checkpoint with 310 points to soilgrids_parallel.csv


 10%|█         | 290/2777 [13:36<43:54,  1.06s/it]  

Saved checkpoint with 320 points to soilgrids_parallel.csv


 11%|█         | 300/2777 [13:44<28:31,  1.45it/s]

Saved checkpoint with 330 points to soilgrids_parallel.csv


 11%|█         | 310/2777 [14:31<3:38:09,  5.31s/it]

Saved checkpoint with 340 points to soilgrids_parallel.csv


 12%|█▏        | 320/2777 [14:47<55:29,  1.36s/it]  

Saved checkpoint with 350 points to soilgrids_parallel.csv


 12%|█▏        | 330/2777 [15:11<1:34:40,  2.32s/it]

Saved checkpoint with 360 points to soilgrids_parallel.csv


 12%|█▏        | 340/2777 [15:46<59:17,  1.46s/it]  

Saved checkpoint with 370 points to soilgrids_parallel.csv


 13%|█▎        | 350/2777 [16:41<6:28:18,  9.60s/it]

Saved checkpoint with 380 points to soilgrids_parallel.csv


 13%|█▎        | 360/2777 [16:48<37:06,  1.09it/s]  

Saved checkpoint with 390 points to soilgrids_parallel.csv


 13%|█▎        | 370/2777 [17:53<4:13:05,  6.31s/it]

Saved checkpoint with 400 points to soilgrids_parallel.csv


 14%|█▎        | 380/2777 [17:59<31:03,  1.29it/s]  

Saved checkpoint with 410 points to soilgrids_parallel.csv


 14%|█▍        | 390/2777 [18:07<46:47,  1.18s/it]

Saved checkpoint with 420 points to soilgrids_parallel.csv


 14%|█▍        | 400/2777 [19:01<2:25:34,  3.67s/it]

Saved checkpoint with 430 points to soilgrids_parallel.csv


 15%|█▍        | 410/2777 [19:11<45:56,  1.16s/it]  

Saved checkpoint with 440 points to soilgrids_parallel.csv


 15%|█▌        | 420/2777 [19:56<4:56:05,  7.54s/it]

Saved checkpoint with 450 points to soilgrids_parallel.csv


 15%|█▌        | 430/2777 [20:25<53:04,  1.36s/it]  

Saved checkpoint with 460 points to soilgrids_parallel.csv


 16%|█▌        | 440/2777 [21:20<3:32:53,  5.47s/it]

Saved checkpoint with 470 points to soilgrids_parallel.csv


 16%|█▌        | 450/2777 [21:28<30:27,  1.27it/s]  

Saved checkpoint with 480 points to soilgrids_parallel.csv


 17%|█▋        | 460/2777 [21:35<26:09,  1.48it/s]

Saved checkpoint with 490 points to soilgrids_parallel.csv


 17%|█▋        | 470/2777 [22:26<1:17:41,  2.02s/it]

Saved checkpoint with 500 points to soilgrids_parallel.csv


 17%|█▋        | 480/2777 [22:32<24:36,  1.56it/s]  

Saved checkpoint with 510 points to soilgrids_parallel.csv


 18%|█▊        | 490/2777 [23:33<1:22:41,  2.17s/it]

Saved checkpoint with 520 points to soilgrids_parallel.csv


 18%|█▊        | 500/2777 [23:39<23:17,  1.63it/s]  

Saved checkpoint with 530 points to soilgrids_parallel.csv


 18%|█▊        | 510/2777 [24:29<3:29:44,  5.55s/it]

Saved checkpoint with 540 points to soilgrids_parallel.csv


 19%|█▊        | 520/2777 [24:36<33:25,  1.13it/s]  

Saved checkpoint with 550 points to soilgrids_parallel.csv


 19%|█▉        | 530/2777 [25:39<6:09:57,  9.88s/it]

Saved checkpoint with 560 points to soilgrids_parallel.csv


 19%|█▉        | 539/2777 [25:54<1:02:10,  1.67s/it]

Saved checkpoint with 570 points to soilgrids_parallel.csv


 20%|█▉        | 550/2777 [26:47<7:04:21, 11.43s/it]

Saved checkpoint with 580 points to soilgrids_parallel.csv


 20%|██        | 560/2777 [27:00<54:47,  1.48s/it]  

Saved checkpoint with 590 points to soilgrids_parallel.csv


 21%|██        | 570/2777 [27:08<29:49,  1.23it/s]

Saved checkpoint with 600 points to soilgrids_parallel.csv


 21%|██        | 580/2777 [28:00<5:07:27,  8.40s/it]

Saved checkpoint with 610 points to soilgrids_parallel.csv


 21%|██        | 590/2777 [28:13<37:31,  1.03s/it]  

Saved checkpoint with 620 points to soilgrids_parallel.csv


 22%|██▏       | 600/2777 [28:28<55:15,  1.52s/it]  

Saved checkpoint with 630 points to soilgrids_parallel.csv


 22%|██▏       | 609/2777 [29:26<1:15:41,  2.09s/it]

Saved checkpoint with 640 points to soilgrids_parallel.csv


 22%|██▏       | 620/2777 [29:33<27:17,  1.32it/s]  

Saved checkpoint with 650 points to soilgrids_parallel.csv


 23%|██▎       | 627/2777 [30:28<1:44:29,  2.92s/it]
