In [None]:
#loading the needed libraries 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMRegressor

from sklearn.ensemble import StackingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Read the crop-yield challenge dataset and preview its first rows
data = pd.read_csv(filepath_or_buffer="./data/Crop_Yield_Data_challenge_2.csv")
data.head(n=5)

In [None]:
# Separate the target column (y) from the predictor columns (X)
target_column = "Rice Yield (kg/ha)"
y = data[target_column].values
X = data.drop(columns=[target_column])

In [None]:
# Remember the predictor column names; they are re-applied to the split frames later
X_col_names = X.columns.tolist()

In [None]:
# Hold out 20% of the rows before any further transform is applied, to avoid data leakage.
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [None]:
# Re-wrap both splits as DataFrames and re-apply the original predictor column names
X_train, X_test = (pd.DataFrame(split) for split in (X_train, X_test))
for split in (X_train, X_test):
    split.columns = X_col_names

In [None]:
# Attach the target to each split: some transforms of the feature set need it as a column
X_train = X_train.assign(**{"Rice Yield (kg/ha)": y_train})
X_test = X_test.assign(**{"Rice Yield (kg/ha)": y_test})

In [None]:
# Show the (rows, columns) shape of the train and test frames, in that order
print(*(frame.shape for frame in (X_train, X_test)))

In [None]:
#creating some grouped features
# For every (District, Season) pair, summarise the target over the TRAINING rows
# only, then attach each summary to both splits as a new "<prefix>_<target>" column.
# The test split only ever receives statistics computed from training rows.
group = ["District", "Season(SA = Summer Autumn, WS = Winter Spring)"]
feature = "Rice Yield (kg/ha)"

# Column-name prefix -> statistic computed over each group's target values.
# Insertion order is the order in which the new columns are appended.
# Note: "q5" is the 0.5 quantile, so it carries the same values as "median".
group_stats = {
    "mean": lambda grouped: grouped.mean(),
    "max": lambda grouped: grouped.max(),
    "min": lambda grouped: grouped.min(),
    "std": lambda grouped: grouped.std(),
    "var": lambda grouped: grouped.var(),
    "median": lambda grouped: grouped.median(),
    "q25": lambda grouped: grouped.quantile(0.25),
    "q5": lambda grouped: grouped.quantile(0.5),
    "q75": lambda grouped: grouped.quantile(0.75),
}

for prefix, summarise in group_stats.items():
    # One row per group: the group keys plus the freshly named statistic column.
    df_group = (
        summarise(X_train.groupby(group)[feature])
        .rename(f"{prefix}_{feature}")
        .reset_index()
    )
    # Left merges keep every row of each split; a group that is absent from the
    # training rows simply gets NaN for the statistic.
    X_train = X_train.merge(df_group, on=group, how="left")
    X_test = X_test.merge(df_group, on=group, how="left")

# NOTE(review): each training row's own target contributes to its group statistic,
# so these columns are optimistic on the train split - keep that in mind when
# comparing train and test scores.
print(X_train.shape, X_test.shape)

In [None]:
# Keep only the columns the model is trained on.
# NOTE: These features were selected after an intensive feature selection process that is not here
final_features = [
    "mean_Rice Yield (kg/ha)",
    "median_Rice Yield (kg/ha)",
    "q5_Rice Yield (kg/ha)",
]

X_train, X_test = X_train.loc[:, final_features], X_test.loc[:, final_features]

In [None]:
# Build the LightGBM regressor with the tuned hyper-parameters and fit it on the train split.
# NOTE: These parameters were found after a RandomizedSearch process that is not here.
params = dict(
    reg_lambda=10,
    reg_alpha=10,
    num_leaves=61,
    n_estimators=500,
    min_child_samples=30,
    max_depth=10,
    learning_rate=0.005,
    colsample_bytree=0.7,
)

model = LGBMRegressor(**params)
model.fit(X_train, y_train)

In [None]:
# Predict on the rows the model was fitted on (used for the train-side score)
y_pred_train = model.predict(X=X_train)

In [None]:
# Score the fit on the training rows; overfitting was the main problem, so this
# number is meant to be read next to the test-set score
r2_train = r2_score(y_true=y_train, y_pred=y_pred_train)
print(f"r2 on train data: {r2_train}")

In [None]:
# Predict on the held-out test rows
y_pred_test = model.predict(X=X_test)

In [None]:
#check the r2 score for the test set to compare it with train
r2_test = r2_score(y_test, y_pred_test)
# The label previously said "train" although the value is the held-out test score.
print(f"r2 on test data: {r2_test}")

# Submission

In [None]:
# Read the submission template and preview its first rows
test_file = pd.read_csv(filepath_or_buffer="./data/Challenge_2_submission_template.csv")
test_file.head(n=5)

In [None]:
#generating the features for this set
# Attach the per-(District, Season) target statistics to the submission rows.
group = ["District", "Season(SA = Summer Autumn, WS = Winter Spring)"]
feature = "Rice Yield (kg/ha)"

# By this point X_train holds only the `final_features` columns, so it no longer
# carries the group keys or the target and cannot be grouped here (doing so raises
# KeyError). Re-derive the training rows from `data` instead: with the same row
# count, test_size and random_state, train_test_split selects the same rows in
# the same order as the split the model was trained on.
train_rows, _ = train_test_split(data, test_size=0.2, random_state=21)

# Guard: the re-derived rows must carry exactly the targets the model was fitted on.
assert np.array_equal(train_rows[feature].values, y_train), (
    "re-derived training rows do not match the rows the model was fitted on"
)

# Column-name prefix -> statistic computed over each group's target values.
# Insertion order is the order in which the new columns are appended.
# Note: "q5" is the 0.5 quantile, so it carries the same values as "median".
group_stats = {
    "mean": lambda grouped: grouped.mean(),
    "max": lambda grouped: grouped.max(),
    "min": lambda grouped: grouped.min(),
    "std": lambda grouped: grouped.std(),
    "var": lambda grouped: grouped.var(),
    "median": lambda grouped: grouped.median(),
    "q25": lambda grouped: grouped.quantile(0.25),
    "q5": lambda grouped: grouped.quantile(0.5),
    "q75": lambda grouped: grouped.quantile(0.75),
}

train_target_by_group = train_rows.groupby(group)[feature]
for prefix, summarise in group_stats.items():
    # One row per group: the group keys plus the freshly named statistic column.
    df_group = (
        summarise(train_target_by_group)
        .rename(f"{prefix}_{feature}")
        .reset_index()
    )
    # Left merge keeps every submission row; a group that is absent from the
    # training rows gets NaN for the statistic.
    # X_train is deliberately left untouched so the fitted model's inputs stay as they were.
    test_file = test_file.merge(df_group, on=group, how="left")

print(X_train.shape, test_file.shape)

In [None]:
# Keep only the model's input columns, then show the resulting (rows, columns) shape
test_file = test_file.loc[:, final_features]
test_file.shape

In [None]:
# Predict the yield for every submission row
y_submission = model.predict(X=test_file)

In [None]:
# Re-read the untouched template: this fresh copy is the frame used for the submission
test_file = pd.read_csv(filepath_or_buffer="./data/Challenge_2_submission_template.csv")
test_file.head(n=5)

In [None]:
# Store the predictions in the submission column and preview the result
prediction_column = 'Predicted Rice Yield (kg/ha)'
test_file[prediction_column] = y_submission
test_file.head(n=5)

In [None]:
# Saving the submission file (left disabled; uncomment the line below to write the CSV)
# test_file.to_csv("challenge_2_submission_rice_crop_yield_prediction.csv", index=False)