# 2_Feature_Engineering.ipynb

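Creates the ML-ready dataset `soil_ml_ready.csv` from `merged_final_encoded.csv`, falling back to `merged_final.csv` when the encoded file is not present. Uses helper functions imported from `utils`.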

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from utils import read_csv_fallback, normalize_columns, ensure_month_features, ensure_season_features, detect_target_column
# Resolve input/output locations relative to the current working directory.
ROOT = Path.cwd()
OUT = ROOT / 'soil_ml_ready.csv'

# Prefer the encoded merge; use the plain merge only when the encoded file
# is absent from the working directory.
encoded_csv = ROOT / 'merged_final_encoded.csv'
IN = encoded_csv if encoded_csv.exists() else ROOT / 'merged_final.csv'

# Read the CSV and pass it through the column-normalising helper from utils.
df = normalize_columns(read_csv_fallback(IN))
print('Loaded', IN, 'shape', df.shape)

# Run the month and season helpers from utils on the loaded frame
# (presumably they add the month/season columns used below; see utils).
from utils import ensure_month_features, ensure_season_features
DF = ensure_season_features(ensure_month_features(df))

# Let the utils helper pick the column that is used as the target from here on.
target = detect_target_column(DF)
print('Target:', target)

# Discard exact duplicate rows, then renumber the index from zero.
DF = DF.drop_duplicates()
DF = DF.reset_index(drop=True)

# create lag and rolling features per state,district
# Rows are ordered chronologically inside each (state, district) group so that
# positional shifts and rolling windows follow time order.
DF = DF.sort_values(['state_name','districtname','Year','Month_num','Day']).reset_index(drop=True)
grp = DF.groupby(['state_name','districtname'])

# Lags: the target value 1 and 7 rows earlier within the same group
# (NaN where the group has no such earlier row).
DF['lag_1'] = grp[target].shift(1)
DF['lag_7'] = grp[target].shift(7)

# Rolling means over the PREVIOUS 3 / 6 observations of the group.
# The series is shifted by one row before the window is applied so the current
# row's own target never contributes to its feature (otherwise the feature
# leaks the value being predicted). As with lag_1, the first row of every
# group is therefore NaN; min_periods=1 lets shorter histories still average.
DF['rolling_3'] = grp[target].transform(
    lambda s: s.shift(1).rolling(window=3, min_periods=1).mean()
)
DF['rolling_6'] = grp[target].transform(
    lambda s: s.shift(1).rolling(window=6, min_periods=1).mean()
)

# simple encodings
# Frequency encoding: each row gets the share of rows belonging to its state.
state_share = DF['state_name'].value_counts(normalize=True)
DF['state_freq'] = DF['state_name'].map(state_share)

# Integer code per distinct district name (pandas category codes).
# NOTE(review): codes are keyed on the district name alone, so equally named
# districts in different states would share an id, whereas the lag/rolling
# features group by (state, district) -- confirm this is intended.
district_categories = DF['districtname'].astype('category')
DF['district_id'] = district_categories.cat.codes

# target transform and class
# log(1 + y) of the target, with negative values clipped to zero first.
non_negative_target = DF[target].clip(lower=0)
DF['target_log1p'] = np.log1p(non_negative_target)

# Three equal-sized classes from the target's rank; method='first' breaks ties
# by order of appearance so the quantile edges are always unique.
target_rank = DF[target].rank(method='first')
class_labels = ['Low', 'Medium', 'High']
DF['moisture_class'] = pd.qcut(target_rank, q=3, labels=class_labels)

# save
# Write the engineered frame to OUT without the index column.
DF.to_csv(OUT, index=False)
print('Saved ML-ready dataset to', OUT, 'shape', DF.shape)
