In [1]:
import pandas as pd
import numpy as np

import pandas as pd
from functools import reduce

In [2]:
nasa_df = pd.read_parquet('nasa_df.parquet')

In [3]:
label_yield = pd.read_parquet("label_yield.parquet")

In [4]:
label_yield['item'] = label_yield['item'].str.replace(r'[^0-9a-zA-Z ]', '', regex=True)
label_yield['item'] = label_yield['item'].str.replace(" ", "_").str.lower()


In [5]:
crop_list = list(label_yield['item'].unique())

In [6]:
crop_list

['maize_corn',
 'other_vegetables_fresh_nec',
 'potatoes',
 'rice',
 'sugar_cane',
 'wheat',
 'oil_palm_fruit',
 'barley',
 'soya_beans',
 'sugar_beet',
 'watermelons',
 'cucumbers_and_gherkins',
 'tomatoes',
 'bananas',
 'cassava_fresh']

In [7]:
import pandas as pd


# Function to compute past N-year average
def past_n_year_avg(group, n):
    res = []
    for i, row in group.iterrows():
        current_year = row['year']
        past_years = list(range(current_year - n, current_year))  # past N years
        avg = group.loc[group['year'].isin(past_years), 'label'].mean()
        res.append(avg)
    return pd.Series(res, index=group.index)


def prep_feature_crop_lag_1_3_6_by_type(df, crop_type):
    df = df[df['item']==crop_type]
    # Make sure year is integer
    df['year'] = pd.to_datetime(df['year']).dt.year
    
    # Sort
    df = df.sort_values(['area', 'item', 'year'])
    
    # Apply per group
    df[f'avg_yield_{crop_type}_1y'] = df.groupby(['area', 'item'], group_keys=False).apply(lambda g: past_n_year_avg(g, 1))
    df[f'avg_yield_{crop_type}_3y'] = df.groupby(['area', 'item'], group_keys=False).apply(lambda g: past_n_year_avg(g, 3))
    df[f'avg_yield_{crop_type}_6y'] = df.groupby(['area', 'item'], group_keys=False).apply(lambda g: past_n_year_avg(g, 6))
    df = df[['year', 'area', f'avg_yield_{crop_type}_1y', f'avg_yield_{crop_type}_3y', f'avg_yield_{crop_type}_6y']]
    return df


In [8]:
dfs = []
for crop_type in crop_list:
    dfs.append(prep_feature_crop_lag_1_3_6_by_type(df=label_yield, crop_type=crop_type))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = pd.to_datetime(df['year']).dt.year
  df[f'avg_yield_{crop_type}_1y'] = df.groupby(['area', 'item'], group_keys=False).apply(lambda g: past_n_year_avg(g, 1))
  df[f'avg_yield_{crop_type}_3y'] = df.groupby(['area', 'item'], group_keys=False).apply(lambda g: past_n_year_avg(g, 3))
  df[f'avg_yield_{crop_type}_6y'] = df.groupby(['area', 'item'], group_keys=False).apply(lambda g: past_n_year_avg(g, 6))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = pd.to_datetime(df['year']).dt.year
  df[f'a

In [9]:
len(dfs)

15

In [10]:

# Sequential left join
features_lag_yield = reduce(
    lambda left, right: pd.merge(left, right, how='left', on=['year', 'area']),
    dfs
)



In [13]:

def prep_rain_features_sum_lag1year(nasa_df):
    nasa_df['year'] = nasa_df['date'].dt.year
    nasa_df['month'] = nasa_df['date'].dt.month
    features_rain = nasa_df.pivot_table(
        index=['area','year'],
        columns='month',
        values='rain'
    ).reset_index()
    
    
    month_map = {i: f'rain_{pd.Timestamp(1900,i,1).strftime("%b")}' for i in range(1,13)}
    features_rain = features_rain.rename(columns=month_map)
    features_rain['year'] = features_rain['year']+1
    
    features_rain.columns = ['area', 'year', 'rain_Jan', 'rain_Feb', 'rain_Mar', 'rain_Apr',
           'rain_May', 'rain_Jun', 'rain_Jul', 'rain_Aug', 'rain_Sep', 'rain_Oct',
           'rain_Nov', 'rain_Dec']
    
    features_rain['sum_rain_1_3'] = features_rain[['rain_Jan','rain_Feb','rain_Mar']].sum(axis=1)
    features_rain['sum_rain_3_6'] = features_rain[['rain_Apr','rain_May','rain_Jun',]].sum(axis=1)
    features_rain['sum_rain_6_9'] = features_rain[['rain_Jul','rain_Aug','rain_Sep']].sum(axis=1)
    features_rain['sum_rain_10_12'] = features_rain[['rain_Oct','rain_Nov','rain_Dec']].sum(axis=1)
    features_rain['sum_rain_1_12'] = features_rain[[f'rain_{m}' for m in ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']]].sum(axis=1)
    features_rain  = features_rain[['area', 'year', 'sum_rain_1_3', 'sum_rain_3_6', 'sum_rain_6_9', 'sum_rain_10_12', 'sum_rain_1_12']]
    return features_rain


In [14]:
import pandas as pd

def prep_monthly_features_avg_nasa_lag1year(nasa_df, var_list=['rain' ,'solar','temp']):
    # Ensure date is datetime
    nasa_df['date'] = pd.to_datetime(nasa_df['date'])
    nasa_df['year'] = nasa_df['date'].dt.year
    nasa_df['month'] = nasa_df['date'].dt.month
    
    all_features = None
    
    for var in var_list:
        # Pivot months
        df_pivot = nasa_df.pivot_table(
            index=['area','year'],
            columns='month',
            values=var
        ).reset_index()
        
        # Rename months
        month_map = {i: f'{var}_{pd.Timestamp(1900,i,1).strftime("%b")}' for i in range(1,13)}
        df_pivot = df_pivot.rename(columns=month_map)
        
        # Shift year for lag-1
        df_pivot['year'] = df_pivot['year'] + 1
        
        # Ensure all months exist
        for month in ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']:
            col = f'{var}_{month}'
            if col not in df_pivot.columns:
                df_pivot[col] = pd.NA  # Use NA for averages
        
        # Seasonal / period averages
        df_pivot[f'avg_{var}_1_3'] = df_pivot[[f'{var}_{m}' for m in ['Jan','Feb','Mar']]].mean(axis=1)
        df_pivot[f'avg_{var}_3_6'] = df_pivot[[f'{var}_{m}' for m in ['Apr','May','Jun']]].mean(axis=1)
        df_pivot[f'avg_{var}_6_9'] = df_pivot[[f'{var}_{m}' for m in ['Jul','Aug','Sep']]].mean(axis=1)
        df_pivot[f'avg_{var}_10_12'] = df_pivot[[f'{var}_{m}' for m in ['Oct','Nov','Dec']]].mean(axis=1)
        df_pivot[f'avg_{var}_1_12'] = df_pivot[[f'{var}_{m}' for m in ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']]].mean(axis=1)
        
        # Reorder columns
        cols_order = ['area','year'] + [f'{var}_{m}' for m in ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']] + \
                     [f'avg_{var}_1_3', f'avg_{var}_3_6', f'avg_{var}_6_9', f'avg_{var}_10_12', f'avg_{var}_1_12']
        df_pivot = df_pivot[cols_order]
        
        # Merge with previous variables
        if all_features is None:
            all_features = df_pivot
        else:
            all_features = all_features.merge(df_pivot, on=['area','year'], how='outer')
    
    return all_features


In [22]:
features_avg_nasa_all_lag1year = prep_monthly_features_avg_nasa_lag1year(nasa_df, var_list=['rain' ,'solar','temp'])
features_sum_nasa_rain_lag1year = prep_rain_features_sum_lag1year(nasa_df)


nasa_f = features_avg_nasa_all_lag1year.merge(
    features_sum_nasa_rain_lag1year, on=['year', 'area'], how='inner'
)

In [23]:
nasa_f

Unnamed: 0,area,year,rain_Jan,rain_Feb,rain_Mar,rain_Apr,rain_May,rain_Jun,rain_Jul,rain_Aug,...,avg_temp_1_3,avg_temp_3_6,avg_temp_6_9,avg_temp_10_12,avg_temp_1_12,sum_rain_1_3,sum_rain_3_6,sum_rain_6_9,sum_rain_10_12,sum_rain_1_12
0,Afghanistan,1982,55.53,85.20,66.13,23.64,23.92,3.15,8.01,23.28,...,2.066667,17.206667,20.716667,6.806667,11.699167,206.86,50.71,35.80,31.95,325.32
1,Afghanistan,1983,53.21,67.57,86.11,19.08,67.75,0.62,1.90,1.13,...,0.106667,15.680000,20.790000,6.026667,10.650833,206.89,87.45,3.43,75.77,373.54
2,Afghanistan,1984,16.04,35.36,112.00,60.29,30.44,2.00,1.47,5.87,...,1.400000,14.933333,22.253333,6.946667,11.383333,163.40,92.73,7.70,9.42,273.25
3,Afghanistan,1985,17.96,32.74,53.33,33.13,2.71,0.39,8.62,8.23,...,1.253333,17.946667,22.230000,6.123333,11.888333,104.03,36.23,20.79,35.05,196.10
4,Afghanistan,1986,16.15,14.74,15.16,42.84,7.67,0.17,3.38,1.89,...,4.220000,17.503333,21.590000,6.733333,12.511667,46.05,50.68,6.22,52.14,155.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8982,Zimbabwe,2020,92.82,116.91,39.82,22.96,12.33,3.78,0.01,0.36,...,23.883333,19.033333,19.203333,26.340000,22.115000,249.55,39.07,1.13,126.03,415.78
8983,Zimbabwe,2021,108.77,173.37,29.36,18.51,2.09,1.76,0.75,0.88,...,23.646667,18.716667,18.666667,24.313333,21.335833,311.50,22.36,6.17,456.13,796.16
8984,Zimbabwe,2022,203.51,151.02,31.73,9.11,2.35,1.77,1.30,1.61,...,21.086667,16.763333,18.136667,24.666667,20.163333,386.26,13.23,4.57,280.52,684.58
8985,Zimbabwe,2023,290.33,16.13,97.53,102.54,7.69,5.37,2.51,2.32,...,22.260000,17.136667,18.493333,23.976667,20.466667,403.99,115.60,5.81,247.90,773.30


In [24]:
x_features = features_lag_yield.merge(
    nasa_f, on=['year', 'area'], how='left'
)

In [26]:
x_features = x_features[x_features['year']>=1983]

In [29]:
x_features.to_parquet('x_features.parquet')

In [28]:
x_features

Unnamed: 0,year,area,avg_yield_maize_corn_1y,avg_yield_maize_corn_3y,avg_yield_maize_corn_6y,avg_yield_other_vegetables_fresh_nec_1y,avg_yield_other_vegetables_fresh_nec_3y,avg_yield_other_vegetables_fresh_nec_6y,avg_yield_potatoes_1y,avg_yield_potatoes_3y,...,avg_temp_1_3,avg_temp_3_6,avg_temp_6_9,avg_temp_10_12,avg_temp_1_12,sum_rain_1_3,sum_rain_3_6,sum_rain_6_9,sum_rain_10_12,sum_rain_1_12
13,1983,Afghanistan,1665.8,1668.633333,1636.283333,6919.2,6846.166667,6561.216667,15511.4,15265.133333,...,0.106667,15.680000,20.790000,6.026667,10.650833,206.89,87.45,3.43,75.77,373.54
14,1984,Afghanistan,1664.1,1666.300000,1649.750000,7065.7,6959.033333,6775.366667,15764.7,15566.600000,...,1.400000,14.933333,22.253333,6.946667,11.383333,163.40,92.73,7.70,9.42,273.25
15,1985,Afghanistan,1661.2,1663.700000,1656.900000,7155.1,7046.666667,6897.800000,14444.4,15240.166667,...,1.253333,17.946667,22.230000,6.123333,11.888333,104.03,36.23,20.79,35.05,196.10
16,1986,Afghanistan,1665.2,1663.500000,1666.066667,7145.9,7122.233333,6984.200000,14090.9,14766.666667,...,4.220000,17.503333,21.590000,6.733333,12.511667,46.05,50.68,6.22,52.14,155.09
17,1987,Afghanistan,1687.5,1671.300000,1668.800000,7249.5,7183.500000,7071.266667,15866.7,14800.666667,...,1.846667,16.216667,20.453333,6.510000,11.256667,143.25,61.88,46.44,59.63,311.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8299,2019,Zimbabwe,1350.6,1061.466667,906.616667,41264.7,40435.466667,39208.000000,24000.0,27643.033333,...,22.586667,17.703333,19.333333,24.610000,21.058333,440.72,34.19,4.35,98.55,577.81
8300,2020,Zimbabwe,780.7,1174.866667,912.583333,47668.4,43122.366667,40956.600000,19000.0,22300.000000,...,23.883333,19.033333,19.203333,26.340000,22.115000,249.55,39.07,1.13,126.03,415.78
8301,2021,Zimbabwe,1180.1,1103.800000,954.250000,40083.3,43005.466667,41307.666667,23000.0,22000.000000,...,23.646667,18.716667,18.666667,24.313333,21.335833,311.50,22.36,6.17,456.13,796.16
8302,2022,Zimbabwe,1534.4,1165.066667,1113.266667,39717.5,42489.733333,41462.600000,21000.0,21000.000000,...,21.086667,16.763333,18.136667,24.666667,20.163333,386.26,13.23,4.57,280.52,684.58
