AI ENVIRONMENTAL MONITORING

In [None]:
## IMPORTS
## MODEL IMPORTS 
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

## EVALUATION IMPORTS
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

## TOOLS IMPORTS
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

## LOAD DATA
# Read the four crop CSV files from the local data/ directory.
pd1 = pd.read_csv('data/crop1.csv')
pd2 = pd.read_csv('data/crop2.csv')
pd3 = pd.read_csv('data/crop3.csv')
pd4 = pd.read_csv('data/crop4.csv')
# head must be *called*: passing the bare attribute prints the bound method's
# repr (which dumps the whole frame) instead of just the first rows.
print(pd1.head(), pd2.head(), pd3.head(), pd4.head())


## GRID SEARCH PARAMETERS FOR MODELS
# Maps a model's display name to the hyperparameter values to try for it.
# Presumably each inner dict is what gets handed to grid_search() as its
# param_grid argument -- confirm against the calling code.
# NOTE(review): the file imports NearestNeighbors (an unsupervised estimator);
# the 'n_neighbors' grid looks intended for a k-NN classifier -- confirm.
param_grid = dict([
    ('Nearest Neighbors', dict(n_neighbors=[1, 3, 5, 7])),
    ('Decision Tree', dict(max_depth=[None, 3, 5, 10])),
    ('Logistic Regression', dict(C=[0.01, 0.1, 1, 10])),
    (
        'Support Vector Machine',
        dict(C=[0.01, 0.1, 1, 10], kernel=['linear', 'rbf']),
    ),
])

# DATA PREPROCESSING FUNCTION
def preprocess_data(X_train, X_test):
    """Mean-impute missing values, then standardize the features.

    Both steps are fitted on ``X_train`` only; the fitted transformers are
    then applied unchanged to ``X_test``.

    Args:
        X_train: Training feature matrix used to fit the imputer and scaler.
        X_test: Test feature matrix transformed with the fitted steps.

    Returns:
        A ``(X_train_scaled, X_test_scaled)`` tuple of transformed features.
    """
    # Chain the two transformers so a single fit_transform/transform pair
    # runs imputation followed by scaling.
    steps = Pipeline([
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ])
    train_out = steps.fit_transform(X_train)
    test_out = steps.transform(X_test)
    return train_out, test_out

#GRID SEARCH FUNCTION
def grid_search(model, param_grid, X_train, y_train):
    """Run a 5-fold, accuracy-scored grid search and return the best estimator.

    Args:
        model: Estimator to tune.
        param_grid: Hyperparameter grid passed straight to ``GridSearchCV``.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    searcher = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
    )
    # fit() returns the fitted search object itself, so it can be chained.
    return searcher.fit(X_train, y_train).best_estimator_


## PD1 DATASET
# Separate the 'label' target column from the features, then hold out 20% of
# the rows for testing (fixed seed for a reproducible split).
X1 = pd1.drop('label', axis=1)
y1 = pd1['label']
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

# preprocess_data returns new arrays rather than modifying its inputs, so the
# result must be bound to names; otherwise the imputing/scaling is thrown away.
X1_train_scaled, X1_test_scaled = preprocess_data(X1_train, X1_test)

## PD2 DATASET
# Target is the 'label' column; every other column is a feature.
y2 = pd2['label']
X2 = pd2.drop(columns='label')
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)

## PD3 DATASET
# Same 80/20 split with the same seed as the other datasets.
y3 = pd3['label']
X3 = pd3.drop(columns='label')
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.2, random_state=42
)

## PD4 DATASET
y4 = pd4['label']
X4 = pd4.drop(columns='label')
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4, y4, test_size=0.2, random_state=42
)


<bound method NDFrame.head of       farm_id       region crop_type  soil_moisture_%  soil_pH  temperature_C  \
0    FARM0001  North India     Wheat            35.95     5.99          17.79   
1    FARM0002    South USA   Soybean            19.74     7.24          30.18   
2    FARM0003    South USA     Wheat            29.32     7.16          27.37   
3    FARM0004  Central USA     Maize            17.33     6.03          33.73   
4    FARM0005  Central USA    Cotton            19.37     5.92          33.86   
..        ...          ...       ...              ...      ...            ...   
495  FARM0496  Central USA      Rice            42.85     6.70          30.85   
496  FARM0497  North India   Soybean            34.22     6.75          17.46   
497  FARM0498  North India    Cotton            15.93     5.72          17.03   
498  FARM0499  Central USA   Soybean            38.61     6.20          17.08   
499  FARM0500  North India     Wheat            30.22     7.42          20.57  