In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nitrogen-data/tn1_final.csv
/kaggle/input/hsihhi/cleaned_tn.csv
/kaggle/input/totalnitrogen-sentinel/Final_merged_tn_sentinel_mgl.csv


In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
from math import sqrt
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler  
import statsmodels.api as sm
import warnings
from joblib import dump
warnings.filterwarnings("ignore")

In [3]:
dataset=pd.read_csv('/kaggle/input/hsihhi/cleaned_tn.csv')
dataset.head()

Unnamed: 0,STATION,Date,Band1_Mean,Band2_Mean,Band3_Mean,Band4_Mean,Band5_Mean,Band6_Mean,Band7_Mean,Band8_Mean,Band9_Mean,Band10_Mean,Band11_Mean,Band12_Mean,Band13_Mean,TEST VALUE
0,C1,2017-08-29,1232.0,913.0,688.0,394.0,369.0,393.0,395.0,333.0,339.0,90.0,13.0,74.0,36.0,0.424
1,C1,2017-09-21,1179.0,857.0,622.0,339.0,292.0,268.0,265.0,210.0,205.5,57.0,8.0,50.0,24.0,0.473
2,C1,2018-05-14,1186.0,888.0,674.0,427.5,383.5,347.0,339.0,283.5,295.0,126.5,9.5,160.0,128.0,0.417
3,C1,2018-06-15,1257.0,984.5,794.0,595.0,555.0,532.0,534.5,466.0,500.0,204.5,9.0,365.5,311.5,0.446
4,C1,2018-09-17,1233.0,910.0,642.0,370.0,326.0,308.0,293.0,242.0,235.0,88.0,8.0,62.0,39.0,0.374


In [4]:
dataset.dropna(inplace=True)
print(dataset.count())

STATION        140
Date           140
Band1_Mean     140
Band2_Mean     140
Band3_Mean     140
Band4_Mean     140
Band5_Mean     140
Band6_Mean     140
Band7_Mean     140
Band8_Mean     140
Band9_Mean     140
Band10_Mean    140
Band11_Mean    140
Band12_Mean    140
Band13_Mean    140
TEST VALUE     140
dtype: int64


> **Support Vector Regression**

In [5]:
features = ['Band2_Mean','Band3_Mean','Band4_Mean','Band5_Mean','Band6_Mean','Band7_Mean','Band8_Mean','Band9_Mean']
label = ['TEST VALUE']

X = dataset.loc[:, features].values
y = dataset.loc[:, label].values

#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Model Developement
svr_model = SVR(kernel='rbf')

param_dist = {
    'C': [x for x in range(1, 10001, 1)],
    'epsilon': [x for x in np.arange(0.001, 1001, 0.001)]
}

random_search = RandomizedSearchCV(SVR(), param_distributions=param_dist, n_iter=200, cv=5, scoring='neg_mean_squared_error', random_state=42)
random_search.fit(X_train, y_train)

randomDf = pd.DataFrame(random_search.cv_results_)
best_svr_model = random_search.best_estimator_

y_pred = best_svr_model.predict(X_test)

test_r2 = r2_score(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
bias = best_svr_model.intercept_

print(f"Best Parameters: {random_search.best_params_}")
print(f"R2 Score: {test_r2}")
print(f"RMSE: {test_rmse}")

svr_model_filename = 'TN-svr.joblib'
dump(best_svr_model, svr_model_filename)

print(f"Best SVR model saved to {svr_model_filename}")

Best Parameters: {'epsilon': 0.097, 'C': 348}
R2 Score: -1.8488485416170755
RMSE: 0.3769725570196915
Best SVR model saved to TN-svr.joblib


> **Gaussian Regression**

In [6]:
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor

X = dataset.loc[:, features].values
y = dataset.loc[:, label].values

#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Model Developement
kernel = DotProduct() + WhiteKernel()
regressor = GaussianProcessRegressor(kernel=kernel)

regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test)


#Evaluate the Model
r2 = r2_score(y_test,predictions)
rmse = sqrt(mean_squared_error(y_test,predictions))


print(f'R2 Score: {r2}')
print(f"RMSE: {rmse}")

gp_model_filename = 'TN-gaussian.joblib'
dump(regressor, gp_model_filename)

print(f"GaussianProcessRegressor model saved to {gp_model_filename}")

R2 Score: 0.7552445120698776
RMSE: 0.11049465602843993
GaussianProcessRegressor model saved to TN-gaussian.joblib


> **XGBoost Regression**

In [7]:
from xgboost import XGBRegressor
from scipy.stats import uniform

X = dataset.loc[:, features].values
y = dataset.loc[:, label].values

#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Model Developement
xgb = XGBRegressor()
param_grid = {
    'n_estimators': range(1, 1001),
    'max_depth': range(1, 11),
    'learning_rate': uniform(0.01, 1),
    'subsample': uniform(0.1, 1),
    'colsample_bytree': uniform(0.1, 1)
}

random_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, n_iter=50, scoring='r2', cv=5, verbose=1, random_state=42)
random_search.fit(X_train, y_train)

best_xgb = random_search.best_estimator_

# Evaluate the model
y_pred = best_xgb.predict(X_test)
r2 = best_xgb.score(X_test, y_test)

print("Best hyperparameters:", random_search.best_params_)
print("R-squared score:", r2)
rmse = sqrt(mean_squared_error(y_test,predictions))
print(f"RMSE: {rmse}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best hyperparameters: {'colsample_bytree': 0.6924145688620424, 'learning_rate': 0.056450412719997727, 'max_depth': 3, 'n_estimators': 647, 'subsample': 0.2705241236872915}
R-squared score: 0.5190352366334662
RMSE: 0.11049465602843993


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

X = dataset.loc[:, features].values
y = dataset.loc[:, label].values

#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Random Forest model
rf = RandomForestRegressor()

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [1, 2, 5, 10],
    'min_samples_leaf': [x for x in range(1, 11, 1)]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train.ravel())

best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

y_pred = best_rf.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {best_params}")
print(f"R^2 Score on Test Set: {r2}")
rmse = sqrt(mean_squared_error(y_test,y_pred))
print(f"RMSE: {rmse}")

rf_model_filename = 'TN-rf.joblib'
dump(best_rf, rf_model_filename)

print(f"RandomForestRegressor model saved to {rf_model_filename}")

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
R^2 Score on Test Set: 0.6892831182618413
RMSE: 0.1244965968388707
RandomForestRegressor model saved to TN-rf.joblib


In [9]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


gradient_boosting_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=42)

# Fit the model
gradient_boosting_model.fit(X_train, y_train)

# Make predictions
y_pred = gradient_boosting_model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
print(f"R^2 Score on Test Set: {r2}")
rmse = sqrt(mean_squared_error(y_test,y_pred))
print(f"RMSE: {rmse}")
gb_model_filename = 'TN-gradient.joblib'
dump(gradient_boosting_model, gb_model_filename)

print(f"GradientBoostingRegressor model saved to {gb_model_filename}")

R^2 Score on Test Set: 0.6618293621904372
RMSE: 0.1298802163521931
GradientBoostingRegressor model saved to TN-gradient.joblib
