# CELL 1: MOUNT GOOGLE DRIVE & INSTALL LIBRARIES

In [60]:
# This is the first thing you should always do.

from google.colab import drive
import os

# Mount your Google Drive to the Colab instance.
# You will be prompted to authorize this.
drive.mount('/content/drive')

# Create a dedicated project folder in your Google Drive (if it doesn't exist)
# This keeps your work organized.
PROJECT_DIR = "/content/drive/My Drive/Pungda"
DATA_DIR = os.path.join(PROJECT_DIR, "data")

os.makedirs(DATA_DIR, exist_ok=True)

print(f"Project directory is ready at: {PROJECT_DIR}")

# Install necessary libraries
!pip install rasterio geopandas pandas scikit-learn kaggle

print("\nLibraries installed successfully!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Project directory is ready at: /content/drive/My Drive/Pungda

Libraries installed successfully!


# CELL 2: PROJECT CONFIGURATION

In [61]:
# This cell contains all the key parameters and file paths for the project.
# Modify these values to change the behavior of the entire pipeline.
# It reads PROJECT_DIR and DATA_DIR, so the Drive-mount cell must have run first.

import os

# --- Data Generation Parameters ---
# Upper bound on the number of top-yield pixels kept per crop raster.
# 500 was good for testing, but 5000 will create a much more robust dataset
# for our deep learning model.
TOP_N_LOCATIONS_PER_CROP = 5000

# Batch size for querying Google Earth Engine to avoid timeouts.
# 1000 is a safe and efficient number.
GEE_BATCH_SIZE = 1000

# --- Input Data Paths & URLs ---
# Kaggle Dataset
CROP_REC_CSV_PATH = os.path.join(DATA_DIR, "Crop_recommendation.csv")

# SPAM 2020 GeoTIFFs
# Each path points at the folder nested inside the unzip target
# (e.g. data/spam_yield/<archive folder>), not at the unzip target itself.
SPAM_YIELD_DIR = os.path.join(DATA_DIR, "spam_yield/spam2020V2r0_global_yield")
SPAM_HARVEST_DIR = os.path.join(DATA_DIR, "spam_harvested_area/spam2020V2r0_global_harvested_area")

# --- Output File Paths (Artifacts of this Pipeline) ---
# All artifacts are written directly under PROJECT_DIR.
MASTER_CROP_LIST_PATH = os.path.join(PROJECT_DIR, "master_crop_list.csv")
CROP_VECTORS_PATH = os.path.join(PROJECT_DIR, "crop_requirement_vectors.csv")
FINAL_DATASET_PATH = os.path.join(PROJECT_DIR, "training_dataset_final.csv")

# Echo the effective configuration so it is recorded in the cell output.
print("✅ Project configuration loaded successfully.")
print(f"Directory for project artifacts: {PROJECT_DIR}")
print(f"Directory for raw data: {DATA_DIR}")
print(f"Points to find per crop: {TOP_N_LOCATIONS_PER_CROP}")

✅ Project configuration loaded successfully.
Directory for project artifacts: /content/drive/My Drive/Pungda
Directory for raw data: /content/drive/My Drive/Pungda/data
Points to find per crop: 5000


# CELL 2B: SECURELY CONFIGURE KAGGLE API

In [62]:
# We will use Colab's secret manager for our Kaggle API key.
# The credentials are written to /root/.kaggle/kaggle.json, which is where the
# Kaggle command-line client looks for them.

from google.colab import files
import json
import os

# 1. In Colab, click the "Key" icon on the left sidebar.
# 2. Click "+ Add a new secret".
# 3. For the name, enter "KAGGLE_USERNAME". Enter your Kaggle username as the value.
# 4. Add another secret. Name: "KAGGLE_KEY". Enter the "key" from your kaggle.json file.
# 5. Make sure "Notebook access" is toggled ON for both secrets.

from google.colab import userdata

# Retrieve secrets
kaggle_username = userdata.get('KAGGLE_USERNAME')
kaggle_key = userdata.get('KAGGLE_KEY')

# Fail here, with a clear message, instead of writing empty credentials and
# only discovering the problem when the download cell runs.
if not kaggle_username or not kaggle_key:
    raise ValueError(
        "KAGGLE_USERNAME and KAGGLE_KEY must both be set (and non-empty) in Colab secrets."
    )

# Create the Kaggle API configuration file in the Colab environment
api_token = {"username": kaggle_username, "key": kaggle_key}
kaggle_json_path = '/root/.kaggle/kaggle.json'
os.makedirs("/root/.kaggle", exist_ok=True)

# Create the file with owner-only permissions from the start, so the key is
# never readable by others between the write and a later chmod.
token_fd = os.open(kaggle_json_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(token_fd, 'w') as file:
    json.dump(api_token, file)

# The mode passed to os.open only applies to newly created files; tighten the
# permissions explicitly in case the file already existed.
os.chmod(kaggle_json_path, 0o600)

print("Kaggle API configured successfully!")

Kaggle API configured successfully!


# CELL 3: DOWNLOAD AND PREPARE DATASETS

In [63]:
# This cell checks if the data exists in your Drive.
# If not, it downloads, unzips, and saves it to your Drive.
# Every external command is run with check=True, so a failed download stops
# the cell instead of being followed by a misleading success message.

import subprocess
import zipfile


def _unzip_and_remove(zip_path, target_dir):
    """Extract `zip_path` into `target_dir` (created if needed), then delete the archive."""
    os.makedirs(target_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(target_dir)
    os.remove(zip_path)


def _ensure_spam_dataset(label, url, zip_path, final_dir):
    """
    Make sure the SPAM GeoTIFF folder `final_dir` exists, downloading it if needed.

    Args:
        label: Human-readable dataset name used in progress messages.
        url: Direct-download URL of the zip archive.
        zip_path: Where the archive is stored temporarily.
        final_dir: Folder expected to exist after extraction. The archive is
            unpacked into its parent directory.

    Raises:
        subprocess.CalledProcessError: If wget exits with a non-zero status.
        FileNotFoundError: If extraction did not produce `final_dir`.
    """
    if os.path.exists(final_dir):
        print(f"\nSPAM 2020 {label} dataset already exists in Google Drive.")
        return

    print(f"\nSPAM 2020 {label} dataset not found. Downloading from Dropbox...")
    try:
        subprocess.run(["wget", "-q", "-O", zip_path, url], check=True)
    except subprocess.CalledProcessError:
        # Do not leave a truncated archive behind for the next run.
        if os.path.exists(zip_path):
            os.remove(zip_path)
        raise

    _unzip_and_remove(zip_path, os.path.dirname(final_dir))

    # Guard against a changed archive layout: later cells list `final_dir`.
    if not os.path.isdir(final_dir):
        raise FileNotFoundError(
            f"Extraction finished but the expected folder is missing: {final_dir}"
        )
    print(f"SPAM {label} data downloaded and saved to Google Drive.")


# --- Dataset B: Crop Recommendation (from Kaggle) ---

if not os.path.exists(CROP_REC_CSV_PATH):
    print("Crop Recommendation dataset not found in Google Drive. Downloading...")
    subprocess.run(
        ["kaggle", "datasets", "download",
         "-d", "atharvaingle/crop-recommendation-dataset",
         "-p", DATA_DIR],
        check=True,
    )
    _unzip_and_remove(os.path.join(DATA_DIR, "crop-recommendation-dataset.zip"), DATA_DIR)
    print("Downloaded and saved to Google Drive.")
else:
    print("Crop Recommendation dataset already exists in Google Drive.")


# --- Dataset A: SPAM 2020 Data (from your Dropbox links) ---

# Dropbox URL for YIELD data (dl=1 requests a direct download)
SPAM_YIELD_URL = "https://www.dropbox.com/scl/fi/kajp48kh5wnh65ar2ltbr/spam2020V2r0_global_yield.geotiff.zip?rlkey=n1w5823k0ra9uqqg1tbc18ag4&dl=1"
SPAM_YIELD_ZIP_PATH = os.path.join(DATA_DIR, "spam_yield.zip")

# Dropbox URL for HARVESTED AREA data (dl=1 requests a direct download)
SPAM_HARVEST_URL = "https://www.dropbox.com/scl/fi/vgxfy41otygcee89apst0/spam2020V2r0_global_harvested_area.geotiff.zip?rlkey=esz9aoh6f79zorhmv9zwrlnpx&dl=1"
SPAM_HARVEST_ZIP_PATH = os.path.join(DATA_DIR, "spam_harvested_area.zip")

# SPAM_YIELD_DIR / SPAM_HARVEST_DIR keep the values set in the configuration
# cell; the archives are unpacked into their parent folders.
_ensure_spam_dataset("Yield", SPAM_YIELD_URL, SPAM_YIELD_ZIP_PATH, SPAM_YIELD_DIR)
_ensure_spam_dataset("Harvested Area", SPAM_HARVEST_URL, SPAM_HARVEST_ZIP_PATH, SPAM_HARVEST_DIR)

Crop Recommendation dataset already exists in Google Drive.

SPAM 2020 Yield dataset already exists in Google Drive.

SPAM 2020 Harvested Area dataset already exists in Google Drive.


# CELL 4: LOAD DATA FROM GOOGLE DRIVE

In [64]:
# Read the crop recommendation table from Drive and show a small preview of
# it, followed by a sample of the file names in each SPAM folder.
import pandas as pd


def _show_folder_sample(heading, folder, limit=5):
    """Print `heading`, then the first `limit` entries returned by os.listdir(folder)."""
    print(heading)
    print(os.listdir(folder)[:limit])


crop_req_df = pd.read_csv(CROP_REC_CSV_PATH)
print("--- Crop Recommendation Data ---")
print(f"Loaded {len(crop_req_df)} rows.")
print(crop_req_df.head())

# Listing the SPAM folders confirms that the download/unzip step succeeded.
_show_folder_sample("\n--- SPAM 2020 Yield Files (Sample) ---", SPAM_YIELD_DIR)
_show_folder_sample("\n--- SPAM 2020 Harvested Area Files (Sample) ---", SPAM_HARVEST_DIR)

--- Crop Recommendation Data ---
Loaded 2200 rows.
    N   P   K  temperature   humidity        ph    rainfall label
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice
3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice
4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice

--- SPAM 2020 Yield Files (Sample) ---
['spam2020_V2r0_global_Y_BANA_A.tif', 'spam2020_V2r0_global_Y_BANA_I.tif', 'spam2020_V2r0_global_Y_BANA_R.tif', 'spam2020_V2r0_global_Y_BARL_A.tif', 'spam2020_V2r0_global_Y_BARL_I.tif']

--- SPAM 2020 Harvested Area Files (Sample) ---
['spam2020_V2r0_global_H_BANA_A.tif', 'spam2020_V2r0_global_H_BANA_I.tif', 'spam2020_V2r0_global_H_BANA_R.tif', 'spam2020_V2r0_global_H_BARL_A.tif', 'spam2020_V2r0_global_H_BARL_I.tif']


# CELL 5: DEFINE THE TWO VERIFIED CROP LISTS

In [65]:
import pandas as pd
import os

# --- Source 1: The Definitive SPAM Crop List ---
# Maps each four-letter SPAM crop code (as used in the GeoTIFF file names)
# to a lower-case, human-readable crop name used for fuzzy matching.
spam_code_to_name = {
    'ACOF': 'arabica coffee', 'BANA': 'banana', 'BARL': 'barley', 'BEAN': 'bean',
    'CASS': 'cassava', 'CHIC': 'chickpea', 'CITR': 'citrus fruit', 'CNUT': 'coconut',
    'COCO': 'cocoa', 'COFF': 'coffee', 'COTT': 'cotton', 'COWP': 'cowpea',
    'GROU': 'groundnut', 'LENT': 'lentil', 'MAIZ': 'maize', 'MILL': 'millet',
    'OCER': 'other cereals', 'OFIB': 'other fibre crops', 'OILP': 'oilpalm',
    'ONIO': 'onion', 'OOIL': 'other oil crops', 'OPUL': 'other pulses',
    'ORTS': 'other roots', 'PIGE': 'pigeonpea', 'PLNT': 'plantain',
    'PMIL': 'pearl millet', 'POTA': 'potato', 'RAPE': 'rapeseed',
    'RCOF': 'robusta coffee', 'REST': 'rest of crops', 'RICE': 'rice',
    'RUBB': 'rubber', 'SESA': 'sesameseed', 'SORG': 'sorghum', 'SOYB': 'soybean',
    'SUGB': 'sugarbeet', 'SUGC': 'sugarcane', 'SUNF': 'sunflower',
    'SWPO': 'sweet potato', 'TEAS': 'tea', 'TEMF': 'temperate fruit',
    'TOBA': 'tobacco', 'TOMA': 'tomato', 'TROF': 'tropical fruit',
    'VEGE': 'vegetables', 'WHEA': 'wheat', 'YAMS': 'yams'
}
spam_crops_df = pd.DataFrame(list(spam_code_to_name.items()), columns=['spam_code', 'spam_name'])
# Report the real number of entries; a hard-coded count drifts out of sync
# with the dictionary above.
print(f"--- Loaded {len(spam_crops_df)} SPAM crop types ---")


# --- Source 2: The Verified Kaggle Crop List ---
# Crop labels that the SPAM names are matched against in the mapping cell.
kaggle_crop_names = [
    'apple', 'banana', 'blackgram', 'chickpea', 'coconut', 'coffee', 'cotton',
    'grapes', 'jute', 'kidneybeans', 'lentil', 'maize', 'mango', 'mothbeans',
    'mungbean', 'muskmelon', 'orange', 'papaya', 'pigeonpeas', 'pomegranate',
    'rice', 'watermelon'
]
print(f"--- Loaded {len(kaggle_crop_names)} verified Kaggle crop types ---")

--- Loaded 48 SPAM crop types ---
--- Loaded 22 verified Kaggle crop types ---


# CELL 6: PERFORM THE FINAL "SPAM-FIRST" MAPPING

In [66]:
# Now we perform the final matching using our definitive SPAM list and your
# verified Kaggle list.

!pip install -q thefuzz
from thefuzz import process

def find_best_match(spam_name, choices):
    best_match, score = process.extractOne(spam_name, choices)
    return pd.Series([best_match, score])

# Apply the function to our definitive spam_crops_df
spam_crops_df[['kaggle_name', 'match_score']] = spam_crops_df['spam_name'].apply(find_best_match, args=(kaggle_crop_names,))

# Set our confidence threshold
MASTER_CROP_THRESHOLD = 85
master_crop_df = spam_crops_df[spam_crops_df['match_score'] >= MASTER_CROP_THRESHOLD].copy()

# Add a canonical name for future use
master_crop_df['canonical_name'] = master_crop_df['kaggle_name'].str.replace(' ', '_')

print(f"\n--- Master Crop List (Found {len(master_crop_df)} high-confidence matches) ---")
print("These are the ONLY crops we will build our model for.")
display(master_crop_df)

excluded_crops = spam_crops_df[spam_crops_df['match_score'] < MASTER_CROP_THRESHOLD]
print(f"\n--- Excluded Crops ({len(excluded_crops)} low-confidence or no-rulebook matches) ---")
display(excluded_crops.head())


--- Master Crop List (Found 12 high-confidence matches) ---
These are the ONLY crops we will build our model for.


Unnamed: 0,spam_code,spam_name,kaggle_name,match_score,canonical_name
0,ACOF,arabica coffee,coffee,90,coffee
1,BANA,banana,banana,100,banana
3,BEAN,bean,kidneybeans,90,kidneybeans
5,CHIC,chickpea,chickpea,100,chickpea
7,CNUT,coconut,coconut,100,coconut
9,COFF,coffee,coffee,100,coffee
10,COTT,cotton,cotton,100,cotton
13,LENT,lentil,lentil,100,lentil
14,MAIZ,maize,maize,100,maize
23,PIGE,pigeonpea,pigeonpeas,95,pigeonpeas



--- Excluded Crops (35 low-confidence or no-rulebook matches) ---


Unnamed: 0,spam_code,spam_name,kaggle_name,match_score
2,BARL,barley,apple,55
4,CASS,cassava,banana,46
6,CITR,citrus fruit,jute,51
8,COCO,cocoa,coconut,67
11,COWP,cowpea,pigeonpeas,60


# CELL 7: SAVE THE FINAL MASTER CROP LIST

In [67]:
# Persist the master crop list as a CSV (without the index column) so later
# cells can reload it from Drive.
MASTER_CROP_LIST_PATH = os.path.join(PROJECT_DIR, "master_crop_list.csv")
master_crop_df.to_csv(MASTER_CROP_LIST_PATH, index=False)

print(
    "\nDefinitive master crop list successfully saved to your Google Drive at:",
    MASTER_CROP_LIST_PATH,
    sep="\n",
)


Definitive master crop list successfully saved to your Google Drive at:
/content/drive/My Drive/Pungda/master_crop_list.csv


# CELL 8: AUTHENTICATE AND INITIALIZE EARTH ENGINE

In [69]:
# This cell connects our Colab notebook to the Google Earth Engine service.
# You only need to run the authentication part once per session.
# The `ee` module initialized here is used by the embedding extractor defined
# later in the notebook.

import ee
import geemap # A library that makes GEE easier to use in Python (not referenced in this cell)

try:
    # This will prompt you to grant access to your Google account.
    # A popup will appear. Follow the steps to get an authorization code and paste it here.
    ee.Authenticate()
    # Bind the session to the Google Cloud project named below.
    ee.Initialize(project='pungde-477205')
    print("Google Earth Engine authenticated and initialized successfully!")
except Exception as e:
    # The error is printed, not re-raised: the cell finishes without a
    # traceback even when authentication or initialization failed, so check
    # this output before running the Earth Engine cells below.
    print(f"Authentication failed. Please make sure you have registered for GEE access at https://earthengine.google.com/signup/")
    print(e)

Google Earth Engine authenticated and initialized successfully!


# CELL 9: DEFINITION - THE ALPHA-EARTH EMBEDDING EXTRACTOR

In [70]:
# This version includes a nested check to handle points that fall on
# NODATA pixels within a valid image tile. This is the final, robust solution.

def get_alphaearth_embeddings(locations_df, year=2020):
    """
    Attach AlphaEarth satellite-embedding values to a table of locations.

    Each row is turned into an Earth Engine point, sampled server-side from the
    'GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL' collection for `year`, and the
    sampled properties are joined back onto the row.

    Args:
        locations_df: DataFrame with 'longitude' and 'latitude' columns.
            It is never modified; all work happens on a copy.
        year: Calendar year used to filter the image collection.

    Returns:
        A DataFrame with the input columns plus the sampled embedding columns.
        Rows whose point had no covering image or no valid pixel keep NaN in
        the embedding columns. If every point fails, the columns A00..A63 are
        added and filled with NaN. An empty input is returned unchanged.
    """
    if locations_df.empty:
        return locations_df

    # Work on a private copy: the helper join key must not leak into the
    # caller's frame (which may also be a slice of a larger DataFrame).
    locations = locations_df.copy()
    locations['temp_id'] = range(len(locations))

    collection = ee.ImageCollection('GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL') \
                   .filterDate(f'{year}-01-01', f'{year+1}-01-01')

    # temp_id travels with each point so results can be joined back by key
    # instead of relying on the order of the returned features.
    features = [
        ee.Feature(ee.Geometry.Point(lon, lat), {'temp_id': i})
        for i, (lon, lat) in enumerate(zip(locations['longitude'], locations['latitude']))
    ]
    ee_points = ee.FeatureCollection(features)

    # Define the robust, server-side function
    def sample_point(feature):
        # First, find the specific image tile that covers this point.
        image_for_point = collection.filterBounds(feature.geometry()).first()

        # Define a function to handle the actual sampling.
        def perform_sampling(img):
            # Sample the image at the point.
            sampled_feature = ee.Image(img).sample(region=feature.geometry(), scale=10).first()
            # Second check: did the sampling itself return a feature?
            return ee.Algorithms.If(
                sampled_feature, # If sampled_feature is not null
                feature.copyProperties(sampled_feature), # Copy the good data
                feature.set('A00', None) # Else, mark it as a failure
            )

        # The main, outer check: Does an image even exist for this point?
        return ee.Algorithms.If(
            image_for_point, # If an image was found...
            perform_sampling(image_for_point), # ...run the sampling function.
            feature.set('A00', None) # Else, mark it as a failure.
        )

    # Use .map() to apply the sampling function to every point.
    print(f"Requesting embeddings for {len(locations)} points with the definitive method...")
    results = ee_points.map(sample_point).getInfo()
    print("...results received.")

    # Keep only the features whose sampling succeeded (A00 present and not null).
    processed_results = [
        props
        for props in (f.get('properties', {}) for f in results['features'])
        if props.get('A00') is not None
    ]

    if not processed_results:
        print("Warning: All points failed to retrieve embeddings even with the new method.")
        # NaN (not None) keeps the embedding columns numeric when this batch is
        # concatenated with successful ones; temp_id is dropped as in the
        # success path so every batch has the same columns.
        return locations.drop(columns=['temp_id']).assign(
            **{f'A{i:02d}': float('nan') for i in range(64)}
        )

    embedding_df = pd.DataFrame(processed_results)
    final_df = pd.merge(locations, embedding_df, on='temp_id', how='left').drop(columns=['temp_id'])

    return final_df

# CELL 10: DEFINITION - THE SPAM DATA PROCESSOR

In [71]:
# This cell defines our robust function for processing SPAM GeoTIFFs.
# It now uses the TOP_N_LOCATIONS_PER_CROP variable from our config cell.

import rasterio
import numpy as np
import pandas as pd

def process_crop_spam_data(canonical_name, spam_code, top_n=None, technology="A"):
    """
    Find the highest-yield pixels of one crop in its SPAM yield GeoTIFF.

    Args:
        canonical_name: Crop label written into the 'canonical_name' column.
        spam_code: Four-letter SPAM crop code used in the raster file name.
        top_n: Maximum number of pixels to return. Defaults to the global
            TOP_N_LOCATIONS_PER_CROP when None.
        technology: Suffix of the raster file name. The yield folder listing
            shows 'A', 'I' and 'R' variants (presumably all / irrigated /
            rainfed; confirm against the SPAM documentation).

    Returns:
        DataFrame with columns 'canonical_name', 'longitude', 'latitude' and
        'yield', one row per selected pixel, in no particular order. An empty
        DataFrame is returned when the raster is missing, contains no yield
        above zero, or cannot be read.

    Raises:
        ValueError: If the requested number of locations is not positive.
    """
    num_requested = TOP_N_LOCATIONS_PER_CROP if top_n is None else int(top_n)
    # A count of 0 would make the `[-0:]` slice below select every pixel, and
    # a negative one would select an arbitrary subset, so reject both.
    if num_requested <= 0:
        raise ValueError(f"top_n must be a positive integer, got {num_requested}")

    print(f"Processing SPAM data for: {canonical_name} ({spam_code})...")
    filename = f"spam2020_V2r0_global_Y_{spam_code}_{technology}.tif"
    filepath = os.path.join(SPAM_YIELD_DIR, filename)

    if not os.path.exists(filepath):
        print(f"  - WARNING: Could not find yield file '{filename}'. Skipping.")
        return pd.DataFrame()

    try:
        with rasterio.open(filepath) as src:
            # Masked (nodata) cells are replaced by 0 so the "> 0" filter
            # below discards them.
            yield_data = np.ma.filled(src.read(1, masked=True), 0)
            transform = src.transform

            # Find the coordinates of all pixels with non-zero yield
            rows, cols = np.where(yield_data > 0)

            if len(rows) == 0:
                print(f"  - No yield data > 0 found for {canonical_name}.")
                return pd.DataFrame()

            # Never ask for more pixels than actually carry data.
            num_to_find = min(num_requested, len(rows))

            # Yield values of the non-zero pixels, aligned with rows/cols.
            non_zero_yields = yield_data[rows, cols]

            # argpartition places the num_to_find largest values in the last
            # positions without fully sorting the array.
            top_indices_partitioned = np.argpartition(non_zero_yields, -num_to_find)[-num_to_find:]

            # Get the final top rows, columns, and yield values
            top_rows = rows[top_indices_partitioned]
            top_cols = cols[top_indices_partitioned]
            top_yields = non_zero_yields[top_indices_partitioned]

            # Convert pixel coordinates to longitude and latitude
            longitudes, latitudes = rasterio.transform.xy(transform, top_rows, top_cols)

            locations = pd.DataFrame({
                'canonical_name': canonical_name,
                'longitude': longitudes,
                'latitude': latitudes,
                'yield': top_yields
            })
            print(f"  - Found {len(locations)} high-yield locations.")
            return locations

    except Exception as e:
        # Best effort: one unreadable raster must not abort the whole run.
        print(f"  - ERROR processing file for {canonical_name}: {e}")
        return pd.DataFrame()

# CELL 10B: CREATE CROP REQUIREMENT VECTORS

In [74]:
# Build one averaged requirement vector per canonical crop name and save it.
# Duplicate canonical names are collapsed so that later merges on
# 'canonical_name' cannot multiply rows.

import pandas as pd
import os

# --- Reload the inputs from the paths defined in the config cell ---
master_crop_df = pd.read_csv(MASTER_CROP_LIST_PATH)
crop_req_df = pd.read_csv(CROP_REC_CSV_PATH)

# --- Keep only the recommendation rows whose label is a master crop ---
master_kaggle_names = master_crop_df['kaggle_name'].unique().tolist()
is_master_crop = crop_req_df['label'].isin(master_kaggle_names)
filtered_req_df = crop_req_df[is_master_crop].copy()

# --- Mean of every numeric column, per crop label ---
crop_requirement_vectors = (
    filtered_req_df
    .groupby('label')
    .mean(numeric_only=True)
    .reset_index()
)

# --- Attach the identifiers, then keep the first row per canonical name ---
crop_identifier_cols = master_crop_df[['canonical_name', 'kaggle_name', 'spam_code']]
final_vectors_df = (
    crop_identifier_cols
    .merge(crop_requirement_vectors, left_on='kaggle_name', right_on='label')
    .drop(columns=['label'])
    .drop_duplicates(subset=['canonical_name'], keep='first')
)


print("\n--- Generated and De-duplicated Crop Requirement Vectors ---")
display(final_vectors_df)

# --- Write the de-duplicated vectors to disk ---
final_vectors_df.to_csv(CROP_VECTORS_PATH, index=False)

print("\nCorrected crop requirement vectors successfully saved to:")
print(CROP_VECTORS_PATH)


--- Generated and De-duplicated Crop Requirement Vectors ---


Unnamed: 0,canonical_name,kaggle_name,spam_code,N,P,K,temperature,humidity,ph,rainfall
0,coffee,coffee,ACOF,101.2,28.74,29.94,25.540477,58.869846,6.790308,158.066295
1,banana,banana,BANA,100.23,82.01,50.05,27.376798,80.358123,5.983893,104.62698
2,kidneybeans,kidneybeans,BEAN,20.75,67.54,20.05,20.115085,21.605357,5.749411,105.919778
3,chickpea,chickpea,CHIC,40.09,67.79,79.92,18.872847,16.860439,7.336957,80.058977
4,coconut,coconut,CNUT,21.98,16.93,30.59,27.409892,94.844272,5.976562,175.686646
6,cotton,cotton,COTT,117.77,46.24,19.56,23.988958,79.843474,6.912675,80.398043
7,lentil,lentil,LENT,18.77,68.36,19.41,24.509052,64.804785,6.927932,45.680454
8,maize,maize,MAIZ,77.76,48.44,19.79,22.389204,65.092249,6.24519,84.766988
9,pigeonpeas,pigeonpeas,PIGE,20.73,67.73,20.29,27.741762,48.061633,5.794175,149.457564
11,rice,rice,RICE,79.89,47.58,39.87,23.689332,82.272822,6.425471,236.181114



Corrected crop requirement vectors successfully saved to:
/content/drive/My Drive/Pungda/crop_requirement_vectors.csv


# CELL 11: MAIN EXECUTION PIPELINE

In [75]:
# This cell orchestrates the final data assembly pipeline:
# 1. Loads the master list of crops we are targeting.
# 2. Processes SPAM data for each crop to get high-yield locations.
# 3. Fetches AlphaEarth embeddings for all locations in batches.
# 4. Merges the location data with the crop requirement vectors.
# 5. Cleans, re-orders columns, and saves the final model-ready dataset.
# -----------------------------------------------------------------
import time
import numpy as np
import pandas as pd

# --- Step 1: Load the master crop list ---
print("▶️ STEP 1: Loading Master Crop List...")
master_crop_df = pd.read_csv(MASTER_CROP_LIST_PATH)
print(f"Found {len(master_crop_df)} crops to process.")
print("-" * 50)


# --- Step 2: Process all SPAM data to find top locations ---
# NOTE(review): several SPAM codes can share one canonical_name (the master
# list maps ACOF, COFF and RCOF to "coffee"), so a crop may contribute more
# than TOP_N_LOCATIONS_PER_CROP rows. Confirm this is intended.
print(f"▶️ STEP 2: Finding Top {TOP_N_LOCATIONS_PER_CROP} High-Yield Locations per Crop...")
all_locations = [process_crop_spam_data(row['canonical_name'], row['spam_code']) for index, row in master_crop_df.iterrows()]

# Crops whose raster was missing or empty come back as empty frames; leave
# them out of the concat and stop early if nothing at all was found.
non_empty_locations = [crop_locations for crop_locations in all_locations if not crop_locations.empty]
if not non_empty_locations:
    raise RuntimeError(
        "No high-yield locations were found for any crop. "
        "Check SPAM_YIELD_DIR and the master crop list."
    )
combined_locations_df = pd.concat(non_empty_locations, ignore_index=True)
print("-------------------------------------------------")
print("Finished processing all SPAM data.")
print(f"Total high-yield locations found: {len(combined_locations_df)}")
print("-" * 50)


# --- Step 3: Fetch AlphaEarth embeddings in batches ---
print("▶️ STEP 3: Fetching AlphaEarth Embeddings in Batches...")
all_results = []
# Slice with .iloc instead of np.array_split, which on a DataFrame relies on a
# deprecated pandas code path and raises when there are zero rows. Every
# chunk holds at most GEE_BATCH_SIZE rows and is an independent copy, so the
# extractor never operates on a view of combined_locations_df.
list_of_df_chunks = [
    combined_locations_df.iloc[start:start + GEE_BATCH_SIZE].copy()
    for start in range(0, len(combined_locations_df), GEE_BATCH_SIZE)
]

print(f"Splitting {len(combined_locations_df)} locations into {len(list_of_df_chunks)} batches of ~{GEE_BATCH_SIZE} each.")

for i, df_chunk in enumerate(list_of_df_chunks):
    print(f"\nProcessing Batch {i+1} of {len(list_of_df_chunks)}...")
    chunk_with_embeddings = get_alphaearth_embeddings(df_chunk, year=2020)
    all_results.append(chunk_with_embeddings)
    print(f"  - Batch {i+1} complete.")
    time.sleep(1) # Be polite to the GEE servers

locations_with_embeddings = pd.concat(all_results, ignore_index=True)
print("\nAll batches processed successfully!")
print("-" * 50)


# --- Step 4: Final Merge, Clean, and Save ---
print("▶️ STEP 4: Assembling the Final Training Dataset...")

# Clean out any locations that failed to get an embedding
original_count = len(locations_with_embeddings)
cleaned_df = locations_with_embeddings.dropna(subset=['A00']).copy()
print(f"Dropped {original_count - len(cleaned_df)} rows with missing embeddings.")

# Load the requirement vectors
crop_vectors_df = pd.read_csv(CROP_VECTORS_PATH)

# Merge the location data with the crop requirement vectors.
# validate='many_to_one' makes pandas raise if the vectors file ever contains
# more than one row per canonical_name, which would silently duplicate
# training rows.
final_training_df = pd.merge(
    cleaned_df,
    crop_vectors_df,
    on='canonical_name',
    how='left',
    validate='many_to_one',
)

# Define the final, clean column order
embedding_cols = [f'A{i:02d}' for i in range(64)]
requirement_cols = list(crop_vectors_df.columns.drop(['canonical_name', 'kaggle_name', 'spam_code']))
final_column_order = (
    ['canonical_name', 'yield', 'longitude', 'latitude'] +
    requirement_cols +
    embedding_cols
)
final_training_df = final_training_df[final_column_order]

# Save the final dataset to your Google Drive
final_training_df.to_csv(FINAL_DATASET_PATH, index=False)

print("\n--- Final, Model-Ready Training Dataset ---")
display(final_training_df.head())
print(f"\n✅ SUCCESS! The final training dataset has been saved to:")
print(FINAL_DATASET_PATH)
print(f"Total final rows: {len(final_training_df)}")

▶️ STEP 1: Loading Master Crop List...
Found 12 crops to process.
--------------------------------------------------
▶️ STEP 2: Finding Top 5000 High-Yield Locations per Crop...
Processing SPAM data for: coffee (ACOF)...
Processing SPAM data for: banana (BANA)...
  - Found 5000 high-yield locations.
Processing SPAM data for: kidneybeans (BEAN)...
  - Found 5000 high-yield locations.
Processing SPAM data for: chickpea (CHIC)...
  - Found 5000 high-yield locations.
Processing SPAM data for: coconut (CNUT)...
  - Found 5000 high-yield locations.
Processing SPAM data for: coffee (COFF)...
  - Found 5000 high-yield locations.
Processing SPAM data for: cotton (COTT)...
  - Found 5000 high-yield locations.
Processing SPAM data for: lentil (LENT)...
  - Found 5000 high-yield locations.
Processing SPAM data for: maize (MAIZ)...
  - Found 5000 high-yield locations.
Processing SPAM data for: pigeonpeas (PIGE)...
  - Found 5000 high-yield locations.
Processing SPAM data for: coffee (RCOF)...
  - F

  return bound(*args, **kwds)


...results received.
  - Batch 1 complete.

Processing Batch 2 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 2 complete.

Processing Batch 3 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 3 complete.

Processing Batch 4 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 4 complete.

Processing Batch 5 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 5 complete.

Processing Batch 6 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 6 complete.

Processing Batch 7 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 7 complete.

Processing Batch 8 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.



...results received.
  - Batch 34 complete.

Processing Batch 35 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 35 complete.

Processing Batch 36 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 36 complete.

Processing Batch 37 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 37 complete.

Processing Batch 38 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 38 complete.

Processing Batch 39 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 39 complete.

Processing Batch 40 of 55...
Requesting embeddings for 1000 points with the definitive method...
...results received.
  - Batch 40 complete.

Processing Batch 41 of 55...
Requesting embeddings for 1000 points with the definitive method...
...res

Unnamed: 0,canonical_name,yield,longitude,latitude,N,P,K,temperature,humidity,ph,...,A54,A55,A56,A57,A58,A59,A60,A61,A62,A63
0,banana,95.800003,126.458333,-18.625,100.23,82.01,50.05,27.376798,80.358123,5.983893,...,0.051734,-0.044844,0.003937,-0.093564,0.002215,0.012057,0.318893,0.214133,0.044844,-0.007443
1,banana,95.800003,126.541667,-18.625,100.23,82.01,50.05,27.376798,80.358123,5.983893,...,0.062991,-0.059116,0.035433,-0.088827,0.010396,0.027128,0.27614,0.16,-0.004983,-0.079723
2,banana,95.800003,126.625,-18.625,100.23,82.01,50.05,27.376798,80.358123,5.983893,...,0.06699,-0.044844,0.044844,-0.084214,0.007443,0.048228,0.310096,0.199862,0.007443,-0.022207
3,banana,95.800003,126.708333,-18.625,100.23,82.01,50.05,27.376798,80.358123,5.983893,...,0.16,0.019931,0.084214,-0.075356,-0.022207,-0.029773,0.236463,0.124567,-0.071111,-0.062991
4,banana,95.800003,126.791667,-18.625,100.23,82.01,50.05,27.376798,80.358123,5.983893,...,0.103406,0.029773,0.048228,-0.055363,0.006151,0.055363,0.179377,0.103406,-0.093564,-0.124567



✅ SUCCESS! The final training dataset has been saved to:
/content/drive/My Drive/Pungda/training_dataset_final.csv
Total final rows: 54923


# CELL 14: CREATE AND SAVE DATA SCALERS FOR LATER USE

In [77]:
# Final step of the data engineering notebook: fit the feature and target
# scalers on the training portion of the final dataset and store them in a
# single joblib file, so that other code can load the same fitted scalers
# instead of re-fitting them.
# -----------------------------------------------------------------
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib

print("▶️ Creating data scalers...")

SCALERS_PATH = os.path.join(PROJECT_DIR, "scalers.joblib")

# --- Read back the dataset written by the main pipeline ---
df = pd.read_csv(FINAL_DATASET_PATH)

# --- Column groups: crop requirements, embeddings, and the target ---
requirement_cols = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
embedding_cols = ['A%02d' % band for band in range(64)]
target_col = 'yield'

# --- Hold out 20% (fixed seed); scalers only ever see the remaining 80% ---
# Fitting on the training rows alone keeps held-out statistics from leaking.
train_df, _ = train_test_split(df, test_size=0.2, random_state=42)

# --- Fit each scaler on its column group; fit() returns the fitted scaler ---
req_scaler = StandardScaler().fit(train_df[requirement_cols])
emb_scaler = StandardScaler().fit(train_df[embedding_cols])
yield_scaler = MinMaxScaler().fit(train_df[[target_col]])

# --- Bundle under the keys 'req', 'emb' and 'yield' and write to disk ---
scalers = dict(req=req_scaler, emb=emb_scaler)
scalers['yield'] = yield_scaler
joblib.dump(scalers, SCALERS_PATH)

print("\n✅ SUCCESS! The data scalers have been calculated and saved to:")
print(SCALERS_PATH)

▶️ Creating data scalers...

✅ SUCCESS! The data scalers have been calculated and saved to:
/content/drive/My Drive/Pungda/scalers.joblib
