## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

#### Import the CSV Data as Pandas DataFrame

In [3]:
# Charger les données
df = pd.read_csv("D:/End_To_End_Data_Science_Project/ML_Project_Regression/notebook/paddydataset.csv")

#### Show Top 5 Records

In [4]:
df.head()

Unnamed: 0,Hectares,Agriblock,Variety,Soil Types,Seedrate(in Kg),LP_Mainfield(in Tonnes),Nursery,Nursery area (Cents),LP_nurseryarea(in Tonnes),DAP_20days,...,Wind Direction_D1_D30,Wind Direction_D31_D60,Wind Direction_D61_D90,Wind Direction_D91_D120,Relative Humidity_D1_D30,Relative Humidity_D31_D60,Relative Humidity_D61_D90,Relative Humidity_D91_D120,Trash(in bundles),Paddy yield(in Kg)
0,6,Cuddalore,CO_43,alluvial,150,75.0,dry,120,6,240,...,SW,W,NNW,WSW,72.0,78,88,85,540,35028
1,6,Kurinjipadi,ponmani,clay,150,75.0,wet,120,6,240,...,NW,S,SE,SSE,64.6,85,84,87,600,35412
2,6,Panruti,delux ponni,alluvial,150,75.0,dry,120,6,240,...,ENE,NE,NNE,W,85.0,96,84,79,600,36300
3,6,Kallakurichi,CO_43,clay,150,75.0,wet,120,6,240,...,W,WNW,SE,S,88.5,95,81,84,540,35016
4,6,Sankarapuram,ponmani,alluvial,150,75.0,dry,120,6,240,...,SSE,W,SW,NW,72.7,91,83,81,600,34044


In [5]:
# Suppression des doublons
df = df.drop_duplicates()

# Réinitialisation de l'index
# 'drop=True' évite que l'ancien index ne soit ajouté comme une nouvelle colonne
df = df.reset_index(drop=True)

# Vérification finale de la forme (shape) du dataset
print(f"Après suppression des doublons, la forme du dataset est : {df.shape}")

Après suppression des doublons, la forme du dataset est : (2338, 45)


#### Preparing X and Y variables

In [6]:
X = df.drop(columns=['Paddy yield(in Kg)'],axis=1)

In [7]:
X.head()

Unnamed: 0,Hectares,Agriblock,Variety,Soil Types,Seedrate(in Kg),LP_Mainfield(in Tonnes),Nursery,Nursery area (Cents),LP_nurseryarea(in Tonnes),DAP_20days,...,Inst Wind Speed_D91_D120(in Knots),Wind Direction_D1_D30,Wind Direction_D31_D60,Wind Direction_D61_D90,Wind Direction_D91_D120,Relative Humidity_D1_D30,Relative Humidity_D31_D60,Relative Humidity_D61_D90,Relative Humidity_D91_D120,Trash(in bundles)
0,6,Cuddalore,CO_43,alluvial,150,75.0,dry,120,6,240,...,10,SW,W,NNW,WSW,72.0,78,88,85,540
1,6,Kurinjipadi,ponmani,clay,150,75.0,wet,120,6,240,...,6,NW,S,SE,SSE,64.6,85,84,87,600
2,6,Panruti,delux ponni,alluvial,150,75.0,dry,120,6,240,...,12,ENE,NE,NNE,W,85.0,96,84,79,600
3,6,Kallakurichi,CO_43,clay,150,75.0,wet,120,6,240,...,6,W,WNW,SE,S,88.5,95,81,84,540
4,6,Sankarapuram,ponmani,alluvial,150,75.0,dry,120,6,240,...,12,SSE,W,SW,NW,72.7,91,83,81,600


In [8]:
# Define numerical and categorical columns
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

print("\n" + "=" * 60)
print("TYPES OF VARIABLES")
print("=" * 60)
print(f'We have {len(numeric_features)} numerical variables : {numeric_features}')
print(f'\nWe have {len(categorical_features)} categorical variables : {categorical_features}')


TYPES OF VARIABLES
We have 37 numerical variables : ['Hectares ', 'Seedrate(in Kg)', 'LP_Mainfield(in Tonnes)', 'Nursery area (Cents)', 'LP_nurseryarea(in Tonnes)', 'DAP_20days', 'Weed28D_thiobencarb', 'Urea_40Days', 'Potassh_50Days', 'Micronutrients_70Days', 'Pest_60Day(in ml)', '30DRain( in mm)', '30DAI(in mm)', '30_50DRain( in mm)', '30_50DAI(in mm)', '51_70DRain(in mm)', '51_70AI(in mm)', '71_105DRain(in mm)', '71_105DAI(in mm)', 'Min temp_D1_D30', 'Max temp_D1_D30', 'Min temp_D31_D60', 'Max temp_D31_D60', 'Min temp_D61_D90', 'Max temp_D61_D90', 'Min temp_D91_D120', 'Max temp_D91_D120', 'Inst Wind Speed_D1_D30(in Knots)', 'Inst Wind Speed_D31_D60(in Knots)', 'Inst Wind Speed_D61_D90(in Knots)', 'Inst Wind Speed_D91_D120(in Knots)', 'Relative Humidity_D1_D30', 'Relative Humidity_D31_D60', 'Relative Humidity_D61_D90', 'Relative Humidity_D91_D120', 'Trash(in bundles)', 'Paddy yield(in Kg)']

We have 8 categorical variables : ['Agriblock', 'Variety', 'Soil Types', 'Nursery', 'Wind Dir

In [9]:
print("\n" + "=" * 60)
print("CATEGORIES OF VARIABLES")
print("=" * 60)

for col in categorical_features:
    print(f"\nCategories in '{col}':")
    print(df[col].unique())


CATEGORIES OF VARIABLES

Categories in 'Agriblock':
['Cuddalore' 'Kurinjipadi' 'Panruti' 'Kallakurichi' 'Sankarapuram'
 'Chinnasalem']

Categories in 'Variety':
['CO_43' 'ponmani' 'delux ponni']

Categories in 'Soil Types':
['alluvial' 'clay']

Categories in 'Nursery':
['dry' 'wet']

Categories in 'Wind Direction_D1_D30':
['SW' 'NW' 'ENE' 'W' 'SSE' 'E']

Categories in 'Wind Direction_D31_D60':
['W' 'S' 'NE' 'WNW' 'ENE']

Categories in 'Wind Direction_D61_D90':
['NNW' 'SE' 'NNE' 'SW' 'NE']

Categories in 'Wind Direction_D91_D120':
['WSW' 'SSE' 'W' 'S' 'NW' 'NNW']


In [10]:
y = df['Paddy yield(in Kg)']

In [11]:
y

0       35028
1       35412
2       36300
3       35016
4       34044
        ...  
2333     5873
2334     5492
2335     5836
2336     5723
2337     5723
Name: Paddy yield(in Kg), Length: 2338, dtype: int64

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# --- CLASSE PERSONNALISÉE POUR LE TARGET GUIDED ENCODING ---
class TargetGuidedOrdinalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mappings = {}

    def fit(self, X, y):
        temp_df = pd.DataFrame(X).copy()
        temp_df['target'] = y
        
        for col in temp_df.columns:
            if col != 'target':
                # On calcule la moyenne de la cible pour chaque catégorie
                # Puis on trie et on attribue un rang (0, 1, 2...)
                ordered_labels = temp_df.groupby([col])['target'].mean().sort_values().index
                self.mappings[col] = {k: i for i, k in enumerate(ordered_labels, 0)}
        return self

    def transform(self, X):
        X_copy = pd.DataFrame(X).copy()
        for col, mapping in self.mappings.items():
            # Application du mapping appris sur le train
            X_copy[col] = X_copy[col].map(mapping).fillna(-1)
        return X_copy

### Prétraitement et Split

In [13]:
# Définition de X et y 
X = df.drop(columns=['Paddy yield(in Kg)'], axis=1)
y = df['Paddy yield(in Kg)']

# Identification des colonnes
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

# Création du Preprocessor
numeric_transformer = StandardScaler()
target_encoder = TargetGuidedOrdinalEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ("TargetOrdinal", target_encoder, cat_features),
        ("StandardScaler", numeric_transformer, num_features)
    ]
)

# Split AVANT la transformation pour éviter le data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Transformation des données
X_train = preprocessor.fit_transform(X_train, y_train)
X_test = preprocessor.transform(X_test)

In [14]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# --- ÉTAPE A : Récupérer les noms des colonnes après transformation ---
# L'ordre dans ColumnTransformer est respecté : d'abord les catégorielles encodées, puis les numériques
all_feature_names = list(cat_features) + list(num_features)

# On reconvertit en DataFrame pour pouvoir manipuler les noms de colonnes
X_train_df = pd.DataFrame(X_train, columns=all_feature_names)
X_test_df = pd.DataFrame(X_test, columns=all_feature_names)

# --- ÉTAPE B : Feature Selection avec Lasso ---
# L'alpha contrôle la force de la sélection (plus il est grand, plus on élimine de colonnes)
feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=42)) 
feature_sel_model.fit(X_train_df, y_train)

# --- ÉTAPE C : Identifier les variables conservées ---
selected_feat = X_train_df.columns[(feature_sel_model.get_support())]

# Statistiques
print('Total features: {}'.format((X_train_df.shape[1])))
print('Selected features: {}'.format(len(selected_feat)))
print('Features with coefficients shrank to zero: {}'.format(
    np.sum(feature_sel_model.estimator_.coef_ == 0)))

# --- ÉTAPE D : Filtrer les datasets ---
X_train_selected = X_train_df[selected_feat]
X_test_selected = X_test_df[selected_feat]

print("\nVariables sélectionnées :")
print(selected_feat.tolist())

Total features: 44
Selected features: 28
Features with coefficients shrank to zero: 6

Variables sélectionnées :
['Agriblock', 'Variety', 'Soil Types', 'Nursery', 'Wind Direction_D31_D60', 'Wind Direction_D61_D90', 'Wind Direction_D91_D120', 'Hectares ', '30DRain( in mm)', '30_50DRain( in mm)', '51_70DRain(in mm)', '71_105DRain(in mm)', 'Min temp_D1_D30', 'Max temp_D1_D30', 'Min temp_D31_D60', 'Max temp_D31_D60', 'Min temp_D61_D90', 'Max temp_D61_D90', 'Min temp_D91_D120', 'Max temp_D91_D120', 'Inst Wind Speed_D1_D30(in Knots)', 'Inst Wind Speed_D31_D60(in Knots)', 'Inst Wind Speed_D61_D90(in Knots)', 'Inst Wind Speed_D91_D120(in Knots)', 'Relative Humidity_D1_D30', 'Relative Humidity_D31_D60', 'Relative Humidity_D61_D90', 'Trash(in bundles)']


  model = cd_fast.enet_coordinate_descent(


#### Entraînement et Comparaison des Modèles

In [15]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

# On définit une liste pour stocker les résultats et pouvoir comparer à la fin
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

# --- BOUCLE D'ENTRAÎNEMENT ---
for name, model in models.items():
    # Entraînement sur les données SÉLECTIONNÉES par le Lasso
    model.fit(X_train_selected, y_train)

    # Prédictions
    y_train_pred = model.predict(X_train_selected)
    y_test_pred = model.predict(X_test_selected)
    
    # Évaluation via la fonction définie précédemment
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(f"Structure du modèle : {name}")
    model_list.append(name)

    print('--- Performance sur le Training Set ---')
    print(f"- RMSE: {model_train_rmse:.4f}")
    print(f"- R2 Score: {model_train_r2:.4f}")

    print('--- Performance sur le Test Set ---')
    print(f"- RMSE: {model_test_rmse:.4f}")
    print(f"- R2 Score: {model_test_r2:.4f}")
    
    r2_list.append(model_test_r2)
    print('='*35 + '\n')


Structure du modèle : Linear Regression
--- Performance sur le Training Set ---
- RMSE: 955.6519
- R2 Score: 0.9894
--- Performance sur le Test Set ---
- RMSE: 1024.7810
- R2 Score: 0.9877

Structure du modèle : Lasso
--- Performance sur le Training Set ---
- RMSE: 955.7159
- R2 Score: 0.9894
--- Performance sur le Test Set ---
- RMSE: 1024.5910
- R2 Score: 0.9877

Structure du modèle : Ridge
--- Performance sur le Training Set ---
- RMSE: 955.9810
- R2 Score: 0.9894
--- Performance sur le Test Set ---
- RMSE: 1025.7625
- R2 Score: 0.9877

Structure du modèle : K-Neighbors Regressor
--- Performance sur le Training Set ---
- RMSE: 1397.1853
- R2 Score: 0.9773
--- Performance sur le Test Set ---
- RMSE: 1691.9295
- R2 Score: 0.9666

Structure du modèle : Decision Tree
--- Performance sur le Training Set ---
- RMSE: 735.2112
- R2 Score: 0.9937
--- Performance sur le Test Set ---
- RMSE: 979.7394
- R2 Score: 0.9888

Structure du modèle : Random Forest Regressor
--- Performance sur le Train

### Results

In [16]:
# --- RÉSUMÉ FINAL ---
results_df = pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"], ascending=False)
print("Classement des modèles par R2 Score :")
print(results_df)

Classement des modèles par R2 Score :
                Model Name  R2_Score
8       AdaBoost Regressor  0.989346
7    CatBoosting Regressor  0.989230
5  Random Forest Regressor  0.989019
6             XGBRegressor  0.988935
4            Decision Tree  0.988788
1                    Lasso  0.987738
0        Linear Regression  0.987733
2                    Ridge  0.987710
3    K-Neighbors Regressor  0.966562
