# Train Models

## 1. Prepare environment and load libraries

In [1]:
# Make the project's src folder importable: its absolute path is appended to
# the module search path (the working directory itself is not changed)
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src')))

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.decomposition import PCA


## 2. Generate fake data

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_fake_irrigation_data(num_samples=100, noise_probability=0.1, seed=42):
    """
    Generates a synthetic dataset for irrigation recommendations using Gaussian distributions.

    Each sample draws a random crop and phenological stage, four Gaussian
    sensor readings clipped to fixed ranges, and a rule-based irrigation
    label that may be flipped for samples flagged as near a boundary.

    Args:
        num_samples: The number of samples to generate.
        noise_probability: Probability of flipping the irrigation decision
            of a sample flagged as near a boundary.
        seed: Seed for the random generator. The same seed always yields
            the same dataset. The global NumPy random state is not touched.

    Returns:
        A pandas DataFrame with the columns 'K', 'P', 'pH', 'Moisture',
        'Crop', 'Phenological_Stage' and 'Irrigate' (one row per sample).
    """
    # A private legacy generator produces the same stream as
    # np.random.seed(seed) followed by np.random.* calls, but without
    # reseeding the process-wide generator as a side effect.
    rng = np.random.RandomState(seed)
    crops = ['Corn', 'Soybean', 'Wheat']
    phenological_stages = ['V6', 'R1', 'R6', 'Flowering', 'Maturity']

    # crop -> (critical stages,
    #          (moisture, K, P, pH low, pH high) limits during critical stages,
    #          (moisture, K, P) limits during any other stage)
    # NOTE(review): K and P are clipped to minimums of 100 and 50 below, so the
    # `k < 100` and `p < 50` tests can never be true. They are kept as-is so
    # the generated labels stay unchanged; confirm whether `<=` was intended.
    irrigation_rules = {
        'Corn': (('R1', 'R6'), (30, 150, 75, 6.0, 7.0), (20, 100, 50)),
        'Soybean': (('Flowering',), (40, 180, 80, 6.2, 7.2), (25, 120, 60)),
        'Wheat': (('Maturity',), (35, 120, 70, 5.8, 6.8), (20, 100, 55)),
    }

    data = {
        'K': [],
        'P': [],
        'pH': [],
        'Moisture': [],
        'Crop': [],
        'Phenological_Stage': [],
        'Irrigate': []
    }

    for _ in range(num_samples):
        # The order of the random draws is significant: it fixes the dataset
        # produced for a given seed.
        crop = rng.choice(crops)
        stage = rng.choice(phenological_stages)

        # Gaussian sensor readings, clipped to realistic ranges because a
        # normal distribution is unbounded
        k = np.clip(rng.normal(loc=200, scale=50), 100, 350)        # Potassium (mg/kg)
        p = np.clip(rng.normal(loc=100, scale=30), 50, 180)         # Phosphorus (mg/kg)
        ph = np.clip(rng.normal(loc=6.5, scale=0.5), 5.5, 7.5)      # Soil pH
        moisture = np.clip(rng.normal(loc=40, scale=15), 10, 80)    # Soil moisture (%)

        # Crop-specific irrigation logic (pH is only considered during the
        # crop's critical stages)
        critical_stages, critical_limits, relaxed_limits = irrigation_rules[str(crop)]
        if stage in critical_stages:
            m_min, k_min, p_min, ph_low, ph_high = critical_limits
            irrigate = bool(
                moisture < m_min or k < k_min or p < p_min
                or ph < ph_low or ph > ph_high
            )
        else:
            m_min, k_min, p_min = relaxed_limits
            irrigate = bool(moisture < m_min or k < k_min or p < p_min)

        # Label noise: only samples flagged as near a boundary may be flipped.
        # The extra random draw happens only for those samples.
        near_boundary = bool(
            moisture < 25 or k < 100 or p < 50 or ph < 5.8 or ph > 7.2
        )
        if near_boundary and rng.rand() < noise_probability:
            irrigate = not irrigate

        data['K'].append(k)
        data['P'].append(p)
        data['pH'].append(ph)
        data['Moisture'].append(moisture)
        data['Crop'].append(crop)
        data['Phenological_Stage'].append(stage)
        data['Irrigate'].append(irrigate)

    return pd.DataFrame(data)


# Example usage: build a 500-sample dataset, then preview its first rows and
# the summary statistics of the numeric columns
df = generate_fake_irrigation_data(num_samples=500, noise_probability=0.10)
for summary in (df.head(), df.describe()):
    print(summary)
# Optional: persist the generated data to disk
#df.to_csv('irrigation_data_gaussian.csv', index=False)

            K           P        pH   Moisture     Crop Phenological_Stage  \
0  144.405994  109.567066  6.639521  55.157729    Wheat          Flowering   
1  170.956093   84.244906  6.214310  26.138757    Wheat           Maturity   
2  113.754108   83.131374  5.993584  44.713710  Soybean           Maturity   
3  207.123230   98.960434  7.067170  38.428817    Wheat                 R6   
4  185.330043   99.104843  6.547563  49.969815    Wheat           Maturity   

   Irrigate  
0     False  
1      True  
2      True  
3     False  
4     False  
                K           P          pH    Moisture
count  500.000000  500.000000  500.000000  500.000000
mean   201.497700  102.214548    6.495519   40.783996
std     50.712149   28.974022    0.454040   15.108114
min    100.000000   50.000000    5.500000   10.000000
25%    162.205225   81.309135    6.199047   30.002131
50%    204.954945  101.401257    6.475943   41.714489
75%    235.988889  122.906502    6.824187   50.845115
max    350.0000

## 3. Preprocess data

### 3.1. Data normalization (for numerical features)

In [4]:
from sklearn.preprocessing import StandardScaler

def normalize_features(data, columns):
    """
    Normalizes the specified numerical features in the given data.

    The listed columns are standardized with a freshly fitted
    StandardScaler. The input DataFrame is left unmodified; the
    normalization is applied to a copy.

    Args:
        data: The input data as a pandas DataFrame.
        columns: The list of columns to normalize. If empty, an unchanged
            copy of the data is returned.

    Returns:
        A new pandas DataFrame with normalized features.
    """
    # Work on a copy so the caller's DataFrame keeps its original values
    normalized = data.copy()

    # Nothing to scale; the scaler would reject an input with no columns
    if len(columns) == 0:
        return normalized

    scaler = StandardScaler()
    normalized[columns] = scaler.fit_transform(normalized[columns])
    return normalized


# Standardize the sensor readings; the remaining columns pass through unchanged.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split in section 3.3, so test-set statistics influence the scaling.
numerical_columns = ['K', 'P', 'pH', 'Moisture']
df_normalized = normalize_features(df, numerical_columns)
print(df_normalized.head())

          K         P        pH  Moisture     Crop Phenological_Stage  \
0 -1.126927  0.254017  0.317475  0.952344    Wheat          Flowering   
1 -0.602857 -0.620820 -0.619969 -0.970333    Wheat           Maturity   
2 -1.731961 -0.659290 -1.106593  0.260367  Soybean           Maturity   
3  0.111042 -0.112424  1.260294 -0.156044    Wheat                 R6   
4 -0.319132 -0.107435  0.114740  0.608615    Wheat           Maturity   

   Irrigate  
0     False  
1      True  
2      True  
3     False  
4     False  


### 3.2. One-hot encoding (for categorical features)

In [5]:
def one_hot_encode_features(data, columns):
    """
    Expands the given categorical columns into indicator (dummy) columns.

    Args:
        data: The pandas DataFrame holding the categorical columns.
        columns: Names of the columns to replace with indicator columns.

    Returns:
        A new pandas DataFrame in which each listed column is replaced by
        one indicator column per distinct value; other columns are kept.
    """
    encoded = pd.get_dummies(data, columns=columns)
    return encoded

# Replace each categorical column with its indicator columns
categorical_columns = ['Crop', 'Phenological_Stage']
df_normalized = one_hot_encode_features(data=df_normalized, columns=categorical_columns)
print(df_normalized.head())

          K         P        pH  Moisture  Irrigate  Crop_Corn  Crop_Soybean  \
0 -1.126927  0.254017  0.317475  0.952344     False      False         False   
1 -0.602857 -0.620820 -0.619969 -0.970333      True      False         False   
2 -1.731961 -0.659290 -1.106593  0.260367      True      False          True   
3  0.111042 -0.112424  1.260294 -0.156044     False      False         False   
4 -0.319132 -0.107435  0.114740  0.608615     False      False         False   

   Crop_Wheat  Phenological_Stage_Flowering  Phenological_Stage_Maturity  \
0        True                          True                        False   
1        True                         False                         True   
2       False                         False                         True   
3        True                         False                        False   
4        True                         False                         True   

   Phenological_Stage_R1  Phenological_Stage_R6  Phenological_

### 3.3. Split data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
train_data, test_data = train_test_split(df_normalized, test_size=0.2, random_state=42)

# Preview the first rows of each subset
for title, subset in (("Training Data:", train_data), ("Testing Data:", test_data)):
    print(title)
    print(subset.head())

Training Data:
            K         P        pH  Moisture  Irrigate  Crop_Corn  \
249 -0.956635 -0.225848 -1.323601  0.544288     False      False   
433  0.422233  1.659014  0.003711  0.612677     False       True   
19  -0.924709 -0.923653 -0.522680 -0.618262     False       True   
322  0.818476  0.282286 -1.710734 -1.471176      True       True   
332  1.176286  1.736840 -1.108343  0.726102     False       True   

     Crop_Soybean  Crop_Wheat  Phenological_Stage_Flowering  \
249         False        True                          True   
433         False       False                         False   
19          False       False                         False   
322         False       False                         False   
332         False       False                         False   

     Phenological_Stage_Maturity  Phenological_Stage_R1  \
249                        False                  False   
433                        False                  False   
19                  

### 3.4. Prepare data

In [7]:
# Cast every column (boolean indicators and the target included) to float
train_data, test_data = (subset.astype(float) for subset in (train_data, test_data))

# Split each subset into the feature matrix and the 'Irrigate' target
X_train, y_train = train_data.drop(columns=['Irrigate']), train_data['Irrigate']
X_test, y_test = test_data.drop(columns=['Irrigate']), test_data['Irrigate']

## 4. Train models

### 4.0 Auxiliary functions


In [8]:
def get_metrics(y_true, y_pred):
    """
    Scores predictions with accuracy, precision, recall and F1.

    Args:
        y_true: The ground truth target labels.
        y_pred: The predicted target labels.

    Returns:
        A dictionary with the keys 'accuracy', 'precision', 'recall' and
        'f1', each mapped to a plain Python float.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    # (result key, scoring function) pairs, evaluated in this order
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )

    # float() turns the scorer results into plain Python floats
    return {name: float(scorer(y_true, y_pred)) for name, scorer in scorers}

### 4.1. Neural network

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Multi-layer perceptron: two hidden layers (64 and 32 units), ReLU
# activations, Adam solver, fixed seed
model = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction are chained
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the test-set predictions
metrics = get_metrics(y_test, y_pred)
print(metrics)


{'accuracy': 0.88, 'precision': 0.84, 'recall': 0.7241379310344828, 'f1': 0.7777777777777778}


### 4.2. Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest of 100 trees with a fixed seed
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction are chained
y_rf_pred = rf_model.fit(X_train, y_train).predict(X_test)

# Score the test-set predictions
rf_metrics = get_metrics(y_test, y_rf_pred)
print(rf_metrics)

{'accuracy': 0.86, 'precision': 0.7777777777777778, 'recall': 0.7241379310344828, 'f1': 0.75}


### 4.3. Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

# Logistic regression, capped at 200 solver iterations, fixed seed
log_reg_model = LogisticRegression(
    random_state=42,
    max_iter=200,
)

# fit() returns the estimator itself, so training and prediction are chained
y_log_reg_pred = log_reg_model.fit(X_train, y_train).predict(X_test)

# Score the test-set predictions
log_reg_metrics = get_metrics(y_test, y_log_reg_pred)
print(log_reg_metrics)

{'accuracy': 0.78, 'precision': 0.6842105263157895, 'recall': 0.4482758620689655, 'f1': 0.5416666666666666}


### 4.4. K-Nearest Neighbors (KNN)

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbors classifier voting over the 5 closest training samples
knn_model = KNeighborsClassifier(n_neighbors=5)

# fit() returns the estimator itself, so training and prediction are chained
y_knn_pred = knn_model.fit(X_train, y_train).predict(X_test)

# Score the test-set predictions
knn_metrics = get_metrics(y_test, y_knn_pred)
print(knn_metrics)

{'accuracy': 0.86, 'precision': 0.8571428571428571, 'recall': 0.6206896551724138, 'f1': 0.72}


### 4.5. Support Vector Machine (SVM)

In [13]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed seed
svm_model = SVC(
    kernel='linear',
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction are chained
y_svm_pred = svm_model.fit(X_train, y_train).predict(X_test)

# Score the test-set predictions
svm_metrics = get_metrics(y_test, y_svm_pred)
print(svm_metrics)

{'accuracy': 0.77, 'precision': 0.65, 'recall': 0.4482758620689655, 'f1': 0.5306122448979592}


### 4.6. Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings
nb_model = GaussianNB()

# fit() returns the estimator itself, so training and prediction are chained
y_nb_pred = nb_model.fit(X_train, y_train).predict(X_test)

# Score the test-set predictions
nb_metrics = get_metrics(y_test, y_nb_pred)
print(nb_metrics)

{'accuracy': 0.78, 'precision': 0.7058823529411765, 'recall': 0.41379310344827586, 'f1': 0.5217391304347826}


## 5. Save models

In [15]:
import pickle

from db.ml_model_crud import create_ml_model # Assuming your CRUD functions are in db.crud

def save_trained_model(model, model_name, model_type, accuracy, precision, recall, f1_score, ml_library="scikit-learn"):
    """
    Pickles a trained model to disk and records its metadata and metrics in the database.

    The model is written to '../src/models/<name>_model.pkl' relative to the
    current working directory, where <name> is model_name lowercased with
    spaces replaced by underscores. The directory is created if missing.
    An existing file with the same name is overwritten.

    Args:
        model: The trained model object. It is pickled to disk, and its
            get_params() output (when available) is stored as a string.
        model_name: Name of the model (string).
        model_type: Type of model (e.g., "Classification", "Regression").
        accuracy: Accuracy score.
        precision: Precision score.
        recall: Recall score.
        f1_score: F1-score.
        ml_library: The machine learning library used (default: "scikit-learn").

    Returns:
        The database ID of the newly created MLModel entry, or None if an
        error occurs (the error is printed, not raised).
    """
    try:
        # Create the target directory if needed. exist_ok avoids the race in a
        # separate "exists" check, and doing it inside the try block keeps
        # filesystem errors within the documented "None on error" contract.
        model_dir = os.path.join(os.getcwd(), '../src/models')
        os.makedirs(model_dir, exist_ok=True)

        # Serialize the model to a file
        file_name = f"{model_name.replace(' ', '_').lower()}_model.pkl"
        model_path = os.path.join(model_dir, file_name)
        with open(model_path, 'wb') as file:
            pickle.dump(model, file)

        # Hyper-parameters are stored as a string for the database
        try:
            model_parameters = str(model.get_params())
        except AttributeError:
            # Handle models without get_params
            model_parameters = "Parameters not available"

        # Save the metadata to the database.
        # NOTE: if this fails, the pickle file written above stays on disk.
        new_model = create_ml_model(
            model_name=model_name,
            model_type=model_type,
            model_parameters=model_parameters,
            ml_library=ml_library,
            accuracy=accuracy,
            precision=precision,
            recall=recall,
            f1_score=f1_score,
        )

        print(f"Model '{model_name}' saved to database with ID: {new_model.id_model}")
        return new_model.id_model

    except Exception as e:
        # Notebook-level boundary: report the failure and signal it with None
        print(f"Error saving model: {e}")
        return None



# Save each trained model together with its test-set metrics.
# Positional arguments: model object, display name, model type, then the
# accuracy, precision, recall and F1 values from that model's metrics dict.
# The calls are kept as plain expression statements on purpose.
save_trained_model(model, "Neural Network", "Classification", metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f1'])
save_trained_model(rf_model, "Random Forest", "Classification", rf_metrics['accuracy'], rf_metrics['precision'], rf_metrics['recall'], rf_metrics['f1'])
save_trained_model(log_reg_model, "Logistic Regression", "Classification", log_reg_metrics['accuracy'], log_reg_metrics['precision'], log_reg_metrics['recall'], log_reg_metrics['f1'])
save_trained_model(knn_model, "K-Nearest Neighbors", "Classification", knn_metrics['accuracy'], knn_metrics['precision'], knn_metrics['recall'], knn_metrics['f1'])
save_trained_model(svm_model, "Support Vector Machine", "Classification", svm_metrics['accuracy'], svm_metrics['precision'], svm_metrics['recall'], svm_metrics['f1'])
save_trained_model(nb_model, "Naive Bayes", "Classification", nb_metrics['accuracy'], nb_metrics['precision'], nb_metrics['recall'], nb_metrics['f1'])


Model 'Neural Network' saved to database with ID: 1
Model 'Random Forest' saved to database with ID: 2
Model 'Logistic Regression' saved to database with ID: 3
Model 'K-Nearest Neighbors' saved to database with ID: 4
Model 'Support Vector Machine' saved to database with ID: 5
Model 'Naive Bayes' saved to database with ID: 6


6