In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pickle

def preprocess_data(data):
    try:
        # Drop rows with missing production values and reset the index
        data = data.dropna(subset=['Production']).reset_index(drop=True)

        # Apply logarithmic transformation to 'Area' and 'Production'
        data['Area'] = data['Area'].apply(lambda x: np.log(x + 1))
        data['Production'] = data['Production'].apply(lambda x: np.log(x + 1))

        # Apply one-hot encoding to categorical columns
        encoder = OneHotEncoder(drop='first', sparse=False)
        encoded_features = encoder.fit_transform(data[['State_Name', 'Season', 'Crop']])
        encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['State_Name', 'Season', 'Crop']))
        
        # Serialize the fitted encoder
        # encoder_path = "encoder.pkl"
        # with open(encoder_path, "wb") as encoder_file:
        #     pickle.dump(encoder, encoder_file)
            
        # Concatenate the original dataframe with the encoded dataframe and drop original categorical columns
        data = pd.concat([data, encoded_df], axis=1)
        data.drop(columns=['State_Name', 'Season', 'Crop'], inplace=True)
    
        return data
        # [Existing preprocessing code]
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        return None
    

crop_data = pd.read_csv("C:/Users/nitya/Documents/DataScience/CropProduction/crop_production.csv")

crop_data['State_Name'] = crop_data['State_Name'].str.strip()
crop_data['District_Name'] = crop_data['District_Name'].str.strip()
crop_data['Season'] = crop_data['Season'].str.strip()
crop_data['Crop'] = crop_data['Crop'].str.strip()

# Refit the encoder with the cleaned data
encoder = OneHotEncoder(drop='first', sparse=False)
encoder.fit(crop_data[['State_Name', 'Season', 'Crop']])

# Serialize the refitted encoder
encoder_path = "encoder.pkl"
with open(encoder_path, "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

preprocessed_data = preprocess_data(crop_data)

# Remove 'District_Name' and 'Production' from the columns
final_columns = preprocessed_data.columns.drop(['District_Name', 'Production']).tolist()

# Serialize the column names from the preprocessed data without 'District_Name' and 'Production'
columns_path = "columns.pkl"
with open(columns_path, "wb") as columns_file:
    pickle.dump(final_columns, columns_file)




In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def train_model(data):
    # Split data into features and target
    X = data.drop(columns=['District_Name', 'Production'])
    y = data['Production']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the Random Forest Regressor
    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_regressor.fit(X_train, y_train)

    # Evaluate the model's performance
    y_pred = rf_regressor.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Root Mean Squared Error: {rmse}")

    return rf_regressor

rf_model = train_model(preprocessed_data)


Root Mean Squared Error: 0.51983150742318


In [7]:
# import matplotlib.pyplot as plt
# import numpy as np

# # Predict on the entire preprocessed data
# y_pred_all = rf_model.predict(preprocessed_data.drop(columns=['District_Name', 'Production']))

# # Apply inverse transformation to get actual prediction values
# predicted_production_all = np.exp(y_pred_all) - 1

# # Plotting for the entire dataset
# plt.figure(figsize=(10, 6))
# plt.scatter(crop_data['Production'], predicted_production_all, alpha=0.3)
# plt.plot([crop_data['Production'].min(), crop_data['Production'].max()], 
#          [crop_data['Production'].min(), crop_data['Production'].max()], 'k--', lw=3, color='red')
# plt.xlabel('Actual Production')
# plt.ylabel('Predicted Production')
# plt.title('Actual vs. Predicted Production (Entire Dataset)')
# plt.show()



In [8]:
def predict_production(model, sample_data, training_columns=None):
    # Use the preprocess_data function to preprocess the sample data
    sample_data_encoded = preprocess_data(sample_data)
    
    # Ensure sample data has the same columns as training data
    if training_columns is not None:
        missing_cols = set(training_columns) - set(sample_data_encoded.columns)
        for col in missing_cols:
            sample_data_encoded[col] = 0
        sample_data_encoded = sample_data_encoded[training_columns]
    
    # Predict production using the trained model
    predicted_production_log = model.predict(sample_data_encoded.drop(columns=['District_Name', 'Production']))
    
    # Apply inverse transformation to get actual values
    predicted_production = np.exp(predicted_production_log) - 1
    return predicted_production

sample_data = pd.DataFrame({
    'State_Name': ['Odisha'],
    'District_Name': ['Dhenkanal'],
    'Crop_Year': [2023],
    'Season': ['Kharif'],
    'Crop': ['Rice'],
    'Area': [10],
    'Production': [0]  # Placeholder value; will be ignored in the prediction
})

predicted_value = predict_production(rf_model, sample_data, training_columns=preprocessed_data.columns)
print(predicted_value)


[8.06492331]


  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample_data_encoded[col] = 0
  sample

In [9]:
import pickle


# Serialize the model
with open("trained_model.pkl", "wb") as model_file:
    pickle.dump(rf_model, model_file)



In [10]:
# Load the trained_columns from the serialized file
with open("columns.pkl", 'rb') as columns_file:
    trained_columns = pickle.load(columns_file)

# Extract columns from the preprocessed sample data (from your Flask log)
sample_data_columns = ["District_Name", "Crop_Year", "Area", "State_Name_Andhra Pradesh", "Crop_Yam", 
                       "Crop_other fibres", "Crop_other misc. pulses", "Crop_other oilseeds"]

# Identify missing and extra columns
missing_columns = [col for col in trained_columns if col not in sample_data_columns]
extra_columns = [col for col in sample_data_columns if col not in trained_columns]

print("Missing Columns:", missing_columns)
print("Extra Columns:", extra_columns)


Missing Columns: ['Production', 'State_Name_Arunachal Pradesh', 'State_Name_Assam', 'State_Name_Bihar', 'State_Name_Chandigarh', 'State_Name_Chhattisgarh', 'State_Name_Dadra and Nagar Haveli', 'State_Name_Goa', 'State_Name_Gujarat', 'State_Name_Haryana', 'State_Name_Himachal Pradesh', 'State_Name_Jammu and Kashmir', 'State_Name_Jharkhand', 'State_Name_Karnataka', 'State_Name_Kerala', 'State_Name_Madhya Pradesh', 'State_Name_Maharashtra', 'State_Name_Manipur', 'State_Name_Meghalaya', 'State_Name_Mizoram', 'State_Name_Nagaland', 'State_Name_Odisha', 'State_Name_Puducherry', 'State_Name_Punjab', 'State_Name_Rajasthan', 'State_Name_Sikkim', 'State_Name_Tamil Nadu', 'State_Name_Telangana', 'State_Name_Tripura', 'State_Name_Uttar Pradesh', 'State_Name_Uttarakhand', 'State_Name_West Bengal', 'Season_Kharif', 'Season_Rabi', 'Season_Summer', 'Season_Whole Year', 'Season_Winter', 'Crop_Arcanut (Processed)', 'Crop_Arecanut', 'Crop_Arhar/Tur', 'Crop_Ash Gourd', 'Crop_Atcanut (Raw)', 'Crop_Bajra', 