# In [None]:
import pandas as pd
import numpy as np
import sys
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Feature columns shared by both crop datasets, in the order the models are
# trained on, and the column holding the production target.
_FEATURE_COLUMNS = ['area (bigha)', 'seeds_shown', 'PH level', 'water (ml)', 'Investment/bigha']
_TARGET_COLUMN = 'Production'

# Damage rules that are identical for every crop.
_TEMPERATURE_LIMITS = (22, 40)      # accepted when low <= temperature < high
_TEMPERATURE_DAMAGE_START = 35      # damage accrues for every degree above this
_TEMPERATURE_DAMAGE_RATE = 8.1      # damage percentage per degree above the start
_PH_DAMAGE_RATE = 8                 # damage percentage per pH unit above the crop's start

# Everything that differs between the two menu options lives here, keyed by
# the menu choice the user types.
# NOTE(review): the "ideal_values" ranges are absolute numbers while the input
# validation ranges scale with the area -- confirm that this is intended.
_CROP_PROFILES = {
    "1": {  # RICE
        "csv_path": "data69.csv",  # Replace with actual file path
        "seeds_per_bigha": (8, 10),
        "water_per_bigha": (755000, 830000),
        "investment_per_bigha": (8000, 10000),
        "ph_limits": (5.0, 8.5),
        "ph_damage_start": 6.5,
        "ideal_values": {
            "area (bigha)": (5.5, 6.5),
            "seeds_shown": (25, 35),
            "water (ml)": (700000, 900000),
            "PH level": (5.3, 6.5),
            "temperature": (25, 35),
        },
    },
    "2": {  # TOMATO
        "csv_path": "tomato.csv",  # Replace with actual file path
        "seeds_per_bigha": (17, 27),
        "water_per_bigha": (4000000, 6000000),
        "investment_per_bigha": (50000, 65000),
        "ph_limits": (5.0, 7.5),
        "ph_damage_start": 6.8,
        "ideal_values": {
            "area (bigha)": (5.5, 6.5),
            "seeds_shown": (17, 27),
            "water (ml)": (4000000, 6000000),
            "PH level": (6.0, 6.8),
            "temperature": (25, 35),
        },
    },
}


def _read_float(prompt):
    """Prompt the user and return the answer as a float.

    Exits the program with a message (instead of a traceback) when the
    answer is not a number.
    """
    try:
        return float(input(prompt))
    except ValueError:
        sys.exit("Invalid numeric input.")


def _read_in_range(prompt, low, high, error_message):
    """Read a float and exit with *error_message* unless low <= value <= high."""
    value = _read_float(prompt)
    if not (low <= value <= high):
        sys.exit(error_message)
    return value


def _fit_forest(features, target):
    """Fit a random forest on an 80/20 split of the given data.

    Returns:
        (model, X_train, X_test, y_test) so callers can evaluate the model
        or explain it on the training rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model, X_train, X_test, y_test


def _print_metrics(model, X_test, y_test):
    """Print MAE, MSE, RMSE and R² of *model* on the held-out rows."""
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print("Model Evaluation Metrics:")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R² Score: {r2 * 100:.2f}%")


def _deviation_scores(user_input, ideal_values):
    """Return how far each input lies outside its ideal range, in percent.

    Args:
        user_input: mapping of feature name to the value the user entered.
        ideal_values: mapping of feature name to a ``(low, high)`` range.

    Returns:
        Mapping of feature name to the deviation percentage, measured
        relative to the bound that was violated. Values inside the range
        (bounds included) score 0. Features without an ideal range are
        omitted.
    """
    scores = {}
    for feature, value in user_input.items():
        if feature not in ideal_values:
            continue
        low, high = ideal_values[feature]
        if value < low:
            scores[feature] = abs(value - low) / low * 100
        elif value > high:
            scores[feature] = abs(value - high) / high * 100
        else:
            # Inside the ideal range: no deviation at all.
            scores[feature] = 0.0
    return scores


def _rank_features(model, X_train, user_input, ideal_values):
    """Rank the model's features by deviation-weighted SHAP importance.

    Each training column is scored as (mean absolute SHAP value) times
    (the user's deviation percentage for that feature). Columns without a
    deviation score are weighted by 1, and entries of *user_input* that are
    not training columns do not appear in the ranking.

    Returns:
        List of ``(feature, score)`` pairs, highest score first.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)
    if isinstance(shap_values, list):
        shap_values = shap_values[0]
    importance = dict(zip(X_train.columns, np.abs(shap_values).mean(axis=0)))
    deviations = _deviation_scores(user_input, ideal_values)
    scores = {
        feature: deviations.get(feature, 1) * importance.get(feature, 0)
        for feature in X_train.columns
    }
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)


def _run_crop(profile):
    """Run the full train / predict / damage / ranking session for one crop.

    Args:
        profile: one entry of ``_CROP_PROFILES``.
    """
    df = pd.read_csv(profile["csv_path"])
    features = df[_FEATURE_COLUMNS]

    # Production model: train, then report its accuracy on the test split.
    model, _, X_test, y_test = _fit_forest(features, df[_TARGET_COLUMN])
    _print_metrics(model, X_test, y_test)

    # User input; the accepted ranges scale with the entered area.
    area = _read_float("Enter area (bigha): ")
    if not area > 0:
        # A zero, negative or NaN area would make every range below meaningless.
        sys.exit("Invalid area input.")
    seed_low, seed_high = profile["seeds_per_bigha"]
    water_low, water_high = profile["water_per_bigha"]
    invest_low, invest_high = profile["investment_per_bigha"]
    min_seeds, max_seeds = area * seed_low, area * seed_high
    min_water, max_water = area * water_low, area * water_high
    min_investment, max_investment = area * invest_low, area * invest_high

    seeds = _read_in_range(
        f"Enter seeds shown ({min_seeds}kg - {max_seeds}kg): ",
        min_seeds, max_seeds, "Invalid seeds input.",
    )

    ph_low, ph_high = profile["ph_limits"]
    ph = _read_in_range("Enter current Ph Level: ", ph_low, ph_high, "Soil not suitable.")
    ph_damage_percentage = max(0, (ph - profile["ph_damage_start"]) * _PH_DAMAGE_RATE)

    water = _read_in_range(
        f"Enter water given ({min_water}ml - {max_water}ml): ",
        min_water, max_water, "Invalid water input.",
    )
    investment = _read_in_range(
        f"Enter investment ({min_investment}Rs - {max_investment}Rs): ",
        min_investment, max_investment, "Invalid investment input.",
    )

    # Predict with a DataFrame carrying the training column names; a bare
    # array would make scikit-learn warn about missing feature names.
    # NOTE(review): the total investment is fed into the 'Investment/bigha'
    # column -- confirm this matches how the dataset was recorded.
    sample = pd.DataFrame([[area, seeds, ph, water, investment]], columns=_FEATURE_COLUMNS)
    predicted_production = model.predict(sample)[0]
    print(f"Predicted Production: {predicted_production}")

    # External impact inputs
    flood_impact = _read_float("Enter Flood Impact (%): ")
    disease_impact = _read_float("Enter Disease Impact (%): ")
    temperature = _read_float("Enter current temperature: ")
    temp_low, temp_high = _TEMPERATURE_LIMITS
    if not (temp_low <= temperature < temp_high):
        sys.exit("Temperature not suitable.")
    temp_damage_percentage = max(
        0, (temperature - _TEMPERATURE_DAMAGE_START) * _TEMPERATURE_DAMAGE_RATE
    )
    other_damage_impact = _read_float("Enter Extra Damage Impact (%): ")

    total_impact = (
        flood_impact + disease_impact + temp_damage_percentage
        + other_damage_impact + ph_damage_percentage
    )
    # Keep the loss between 0% and 100% so production can never be reported
    # as negative or as larger than the prediction.
    total_impact = min(max(total_impact, 0.0), 100.0)

    net_produced = predicted_production * (1 - (total_impact / 100))
    net_wastage = predicted_production - net_produced
    print(f"Net Produced: {net_produced:.2f}kg")
    print(f"Net Wastage: {net_wastage:.2f}kg")

    # Feature ranking: a second forest is trained on the wastage implied by
    # the total impact. The target is built as a separate Series so the
    # loaded DataFrame is left unmodified.
    wastage = df[_TARGET_COLUMN] * (total_impact / 100)
    wastage_model, X_train, _, _ = _fit_forest(features, wastage)

    user_input = {
        "area (bigha)": area,
        "seeds_shown": seeds,
        "PH level": ph,
        "water (ml)": water,
        "temperature": temperature,
    }
    ranked_features = _rank_features(wastage_model, X_train, user_input, profile["ideal_values"])
    print("Feature Ranking:")
    for rank, (feature, score) in enumerate(ranked_features, start=1):
        print(f"{rank}. {feature} (Impact Score: {score:.2f})")


def main():
    """Interactive entry point: pick a crop, then run its prediction session.

    Prints the crop menu, reads the choice from standard input and runs the
    shared pipeline with the matching crop profile. An unknown choice prints
    a message and returns without doing anything else.
    """
    print("Choose an option:")
    print("1. RICE")
    print("2. TOMATO")

    choice = input("Enter your choice (1 or 2): ").strip()

    profile = _CROP_PROFILES.get(choice)
    if profile is None:
        print("Invalid choice. Please select 1 or 2.")
        return
    _run_crop(profile)

# Start the interactive session only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pickle

# Train the rice production model and persist it to disk.
df = pd.read_csv("data69.csv")

# Model inputs (X) and the production target (y).
X = df[
    [
        'area (bigha)',
        'seeds_shown',
        'PH level',
        'water (ml)',
        'Investment/bigha',
    ]
]
y = df['Production']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and training chain.
rice_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Serialize the trained model so it can be reloaded without retraining.
with open('rice_model.pkl', 'wb') as f:
    pickle.dump(rice_model, f)