In [219]:
import pandas as pd 
import numpy as np

In [220]:
data = pd.read_csv("indian_crop_data_realistic_v2.csv")
df = pd.DataFrame(data)[:90000]
df.head()
df_copy = df

In [221]:
# #manual one hot encoding
# import torch
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# soil = le.fit_transform(df['Soil_Type'])
# crop = le.fit_transform(df['Crop_Type'])
# from torch.nn import functional
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# soil_encodings = functional.one_hot(torch.tensor(soil , dtype = torch.long).to(device))
# crop_encodings = functional.one_hot(torch.tensor(crop , dtype=torch.long).to(device))
# soil_encodings = soil_encodings.cpu()
# crop_encodings = crop_encodings.cpu()
# numerical_df = df.select_dtypes(include=np.number)
# numerical_df.corr()
# #print(soil_encodings)
# df['Soil_Type'] = np.array(soil_encodings)
# df['Crop_Type'] = np.array(crop_encodings)

In [None]:
#list the strings as categorical data and this will let lightgbm  handle the categorical features
df = df_copy
cat = ['Soil_Type' , 'Crop_Type']
df[cat] = df[cat].astype('category')

In [223]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 8 columns):
 #   Column                                 Non-Null Count  Dtype   
---  ------                                 --------------  -----   
 0   Soil_Type                              90000 non-null  category
 1   Crop_Type                              90000 non-null  category
 2   Soil_Moisture_at_Planting              90000 non-null  float64 
 3   Average_Planting_Depth_in_cm           90000 non-null  float64 
 4   Average_Plant_Spacing_in_cm            90000 non-null  float64 
 5   Average_Rainfall_during_Crop_Cycle     90000 non-null  float64 
 6   Average_Temperature_during_Crop_Cycle  90000 non-null  float64 
 7   Yield_kg_per_hectare                   90000 non-null  float64 
dtypes: category(2), float64(6)
memory usage: 4.3 MB


In [224]:
df.describe()

Unnamed: 0,Soil_Moisture_at_Planting,Average_Planting_Depth_in_cm,Average_Plant_Spacing_in_cm,Average_Rainfall_during_Crop_Cycle,Average_Temperature_during_Crop_Cycle,Yield_kg_per_hectare
count,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0
mean,24.396106,6.126088,14.836724,590.605656,24.260763,1203.434587
std,10.164443,2.701629,8.02701,209.014088,7.405037,1205.739379
min,5.0,1.0,2.0,50.0,5.0,0.25
25%,17.2175,4.15,8.74,439.35,19.25,302.9
50%,24.24,5.98,13.84,591.57,24.56,875.39
75%,31.31,8.0,20.4825,741.55,29.4525,1576.3825
max,71.75,16.67,44.77,1430.9,52.53,8660.11


In [None]:

import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

np.random.seed(42)
num_samples = 1000

target_features = ['Average_Planting_Depth_in_cm']
X = df.drop(target_features, axis=1)
Y = df.loc[ : , target_features]

#splittign the data in to training and testing parts
X_train , X_test , Y_train , Y_test = train_test_split(X, Y , test_size=0.25 , random_state=123)

lgb_train = lgb.Dataset(X_train , Y_train)
lgb_test = lgb.Dataset(X_test , Y_test , reference=lgb_train)


#  the model hyperparameters
params = {
    'objective':'regression_l1',
    'metric':'rmse',
    'boosting_type':'gbdt',
    'n_estimators':50000,
    'num_leaves':80,
    'learning_rate':0.02,
    'feature_fraction':0.9,
    #'bagging_fraction':0.8,
    'bagging_freq':5 , 
    'verbose':-1
}


In [226]:
#ok so now the data set is ready to lets train the lightgbm dataset for the purpose of prediction 
print("Startign to train the model")
model = lgb.train(params,
                  lgb_train , 
                  num_boost_round=50000,
                  valid_sets=[lgb_train , lgb_test],
                  callbacks=[lgb.early_stopping(200) , lgb.log_evaluation(period=100)]
                
                  )

y_pred = model.predict(X_test , num_iteration = model.best_iteration)
rmse = np.sqrt(mean_squared_error(y_pred , Y_test))
print(f"The model training has finished and the best iteration of the model is : {model.best_iteration} and the prediction is : {y_pred} and the true val is : {Y_test}")
print(f"The root mean squared error on the test is : {rmse:.4f}")


Startign to train the model
Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 1.93445	valid_1's rmse: 1.947
[200]	training's rmse: 1.90483	valid_1's rmse: 1.93065
[300]	training's rmse: 1.89079	valid_1's rmse: 1.93299
Early stopping, best iteration is:
[164]	training's rmse: 1.90919	valid_1's rmse: 1.92951
The model training has finished and the best iteration of the model is : 164 and the prediction is : [6.47719449 4.23374926 6.54148171 ... 8.80665807 8.80474073 6.55554485] and the true val is :        Average_Planting_Depth_in_cm
58766                          6.55
27655                          6.20
73914                          6.77
10715                          5.31
57447                         13.81
...                             ...
62448                          5.90
78778                          5.55
46851                          8.25
27256                         12.90
54749                          8.74

[22500 rows x 1 columns]
The 

In [227]:
print(f"\nThe root mean squared error on the test is: {rmse:.4f}")

model.save_model('planting_depth_model_final.txt')

print("\nModel successfully saved to 'planting_depth_model_final.txt'")



The root mean squared error on the test is: 1.9295

Model successfully saved to 'planting_depth_model_final.txt'
