In [1]:
import pandas as pd

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout

In [3]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [4]:
# Step 1: Data Exploration and Preprocessing
# Load the Data
data = pd.read_csv('India Agriculture Crop Production.csv')

# Data Cleaning: Handle missing data
data['Crop'].fillna(data['Crop'].mode()[0], inplace=True)
data['Season'].fillna(data['Season'].mode()[0], inplace=True)
data['Area'].fillna(data['Area'].median(), inplace=True)
data['Production'].fillna(data['Production'].median(), inplace=True)
data['Yield'].fillna(data['Yield'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Crop'].fillna(data['Crop'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Season'].fillna(data['Season'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [5]:
# Remove duplicates if any
data.drop_duplicates(inplace=True)

In [6]:
# Feature Selection/Engineering: Encoding categorical features
categorical_columns = ['State', 'District', 'Crop', 'Season']
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

In [7]:
# If you need to encode "Area Units" and "Production Units"
label_encoders['Area Units'] = LabelEncoder()
label_encoders['Production Units'] = LabelEncoder()
data['Area Units'] = label_encoders['Area Units'].fit_transform(data['Area Units'])
data['Production Units'] = label_encoders['Production Units'].fit_transform(data['Production Units'])

In [8]:
# Selecting features and target
features = data.drop(columns=['Production'])
target = data['Production']

In [9]:
# Normalization: Scale numerical features
scaler = StandardScaler()
features['Area'] = scaler.fit_transform(features[[ 'Area']])
features['Year'] = features['Year'].apply(lambda x: int(x.split('-')[0]))

In [10]:
# Step 2: Natural Language Processing (NLP) for Text Data (if applicable)
# Assuming we have text data for NLP (e.g., 'Crop' column as an example)
# Text Preprocessing and Embedding
# Skipping this step as no explicit text data is provided for NLP processing in the dataset

In [11]:
# Step 3: Model Building
# Model Architecture
model = Sequential()
model.add(Dense(128, input_dim=features.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))  # Linear activation for regression

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
# Compile the Model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mae'])

In [13]:
# Step 4: Training the Model
# Split the Data
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

In [14]:
# Fit the Model
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32, callbacks=[early_stopping])

Epoch 1/10
[1m8636/8636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - loss: 412097045331968.0000 - mae: 1247731.3750 - val_loss: 298561531019264.0000 - val_mae: 998028.1250
Epoch 2/10
[1m8636/8636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - loss: 403144588656640.0000 - mae: 1245595.3750 - val_loss: 296769858568192.0000 - val_mae: 1038055.6250
Epoch 3/10
[1m8636/8636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 3ms/step - loss: 389850792460288.0000 - mae: 1225008.3750 - val_loss: 295271082754048.0000 - val_mae: 962054.7500
Epoch 4/10
[1m8636/8636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - loss: 370683393605632.0000 - mae: 1170408.8750 - val_loss: 294061915242496.0000 - val_mae: 1069329.8750
Epoch 5/10
[1m8636/8636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - loss: 391346212831232.0000 - mae: 1245277.3750 - val_loss: 291257536479232.0000 - val_mae: 974401.5000
Epoch 6/10
[1m8636/8636[0m

In [16]:
# Step 6: Prediction
# Make Predictions on validation data
predictions = model.predict(X_val)

[1m2159/2159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 619us/step


In [19]:
# Step 7: Deployment
# Save the Model
model.save('crop_yield_prediction_model.h5')
import joblib

joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'scaler.pkl')

# The model is now ready for deployment in a production environment.



['scaler.pkl']