In [2]:
%pip install pandas scikit-learn
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv("India Agriculture Crop Production.csv")
print(data.head())

# Drop rows with missing values
data = data.dropna()

# Encode categorical variables as integer codes. A separate encoder is kept for
# each column so that names and codes can be converted in both directions later.
le_state = LabelEncoder()
le_district = LabelEncoder()
le_season = LabelEncoder()
le_crop = LabelEncoder()

data['State'] = le_state.fit_transform(data['State'])
data['District'] = le_district.fit_transform(data['District'])
data['Season'] = le_season.fit_transform(data['Season'])
data['Crop'] = le_crop.fit_transform(data['Crop'])

# Choose the train/test rows BEFORE scaling, so that the scaler's min/max are
# learned from training rows only (fitting it on every row leaks test-set
# statistics into preprocessing). Row positions are split rather than the frame
# itself; with the same test_size and random_state this is expected to pick the
# same rows as splitting X and y directly.
train_pos, test_pos = train_test_split(
    list(range(len(data))), test_size=0.2, random_state=42
)

# Normalize the 'Area' column: fit on the training rows, then transform every row.
# Test rows may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(data[['Area']].iloc[train_pos])
data['Area'] = scaler.transform(data[['Area']])

# Define features (X) and target (y)
# NOTE(review): 'Crop' is label-encoded above but is not one of the features, so a
# model fitted on X cannot tell crops apart. Any code that ranks crops by this
# model's prediction will see the same value for every crop.
X = data[['State', 'District', 'Season', 'Area']]
y = data['Yield']  # Target is the yield

# Split the data into training and testing sets using the positions chosen above
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

Note: you may need to restart the kernel to use updated packages.
                         State                  District      Crop     Year  \
0  Andaman and Nicobar Islands                  NICOBARS  Arecanut  2001-02   
1  Andaman and Nicobar Islands                  NICOBARS  Arecanut  2002-03   
2  Andaman and Nicobar Islands                  NICOBARS  Arecanut  2003-04   
3  Andaman and Nicobar Islands  NORTH AND MIDDLE ANDAMAN  Arecanut  2001-02   
4  Andaman and Nicobar Islands            SOUTH ANDAMANS  Arecanut  2002-03   

       Season    Area Area Units  Production Production Units     Yield  
0      Kharif  1254.0    Hectare      2061.0           Tonnes  1.643541  
1  Whole Year  1258.0    Hectare      2083.0           Tonnes  1.655803  
2  Whole Year  1261.0    Hectare      1525.0           Tonnes  1.209358  
3      Kharif  3100.0    Hectare      5239.0           Tonnes  1.690000  
4  Whole Year  3105.0    Hectare      5267.0           Tonnes  1.696296  


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a 100-tree random forest on the training split (seeded for repeatability)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the fitted model on the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)

Mean Squared Error: 422537.2961077155
R2 Score: 0.47479903257061706


In [None]:
def recommend_crop(state, district, season, area):
    """Return the crop with the highest model-predicted yield for one location.

    One candidate row is built for every crop code present in ``data['Crop']``,
    all sharing the given state, district, season and area. The fitted ``model``
    predicts a yield for each row and the best-scoring crop is returned.

    Parameters
    ----------
    state, district, season
        Encoded values, as produced by ``le_state``, ``le_district`` and
        ``le_season`` (not the original names).
    area
        Area in the same units as the 'Area' column that ``scaler`` was fitted
        on; it is scaled here before prediction.

    Returns
    -------
    tuple
        ``(crop_name, predicted_yield)``. On ties the first crop in
        ``data['Crop'].unique()`` order wins.

    Warns
    -----
    UserWarning
        If ``model`` was not trained with 'Crop' as a feature. In that case all
        candidate rows look identical to the model, every crop gets the same
        predicted yield, and the returned crop is not a meaningful recommendation.

    Notes
    -----
    Reads the notebook-level objects ``data``, ``scaler``, ``model`` and ``le_crop``.
    """
    import warnings

    # One candidate row per known crop; scalar values are broadcast to every row.
    crop_codes = data['Crop'].unique()
    candidates = pd.DataFrame({
        'State': state,
        'District': district,
        'Season': season,
        'Area': area,
        'Crop': crop_codes,
    })

    # Apply the same scaling that was used on the training data.
    candidates['Area'] = scaler.transform(candidates[['Area']])

    # Use exactly the columns the model was fitted on. Older scikit-learn versions
    # (or models fitted on plain arrays) do not expose feature_names_in_, so fall
    # back to the column list this notebook trains with.
    feature_cols = list(
        getattr(model, 'feature_names_in_', ['State', 'District', 'Season', 'Area'])
    )
    if 'Crop' not in feature_cols:
        warnings.warn(
            "The model was not trained with 'Crop' as a feature, so the predicted "
            "yield is identical for every crop and the recommended crop is arbitrary. "
            "Retrain the model with 'Crop' included in X.",
            UserWarning,
            stacklevel=2,
        )

    # Predict yield for all crops
    candidates['Yield'] = model.predict(candidates[feature_cols])

    # Decode crop names
    candidates['Crop'] = le_crop.inverse_transform(candidates['Crop'])

    # idxmax returns the first maximum, so ties are resolved deterministically
    # instead of depending on the sort algorithm.
    best = candidates.loc[candidates['Yield'].idxmax()]
    return best['Crop'], best['Yield']

# Example usage: translate human-readable inputs into the integer codes the model
# was trained on. Each transform returns a one-element array, unpacked here.
(state,) = le_state.transform(['Karnataka'])  # Replace with a valid state from your dataset
(district,) = le_district.transform(['BANGALORE RURAL'])  # Replace with a valid district
(season,) = le_season.transform(['Rabi'])  # Replace with a valid season
area = 10.0  # Example area in hectares

crop, yield_value = recommend_crop(state, district, season, area)
print(f"Recommended Crop: {crop}, Predicted Yield: {yield_value:.2f} tonnes/hectare")
# The columns of `data` hold integer codes at this point, so the lines below list
# codes rather than names (the fitted encoders keep the names in `.classes_`).
# print(data['State'].unique())  # Codes of the states present in the data
# print(data['District'].unique())  # Codes of the districts present in the data
# print(data['Season'].unique())  # Codes of the seasons present in the data

Recommended Crop: Arecanut, Predicted Yield: 0.85 tonnes/hectare
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 20 21 22 23 24
 25 26 27 28 29 30 32 33 34 35 31 18]


In [11]:
# Round-trip the encoded example inputs back to their labels; this should print
# 'Karnataka', 'BANGALORE RURAL' and 'Rabi', in that order.
for encoder, code in ((le_state, state), (le_district, district), (le_season, season)):
    print(encoder.inverse_transform([code]))
# print(data['Crop'].value_counts())
# # Check the distribution of crops in the dataset

# Look up the historical rows for this state/district/season and the crop 'Arecanut'.
arecanut_code = le_crop.transform(['Arecanut'])[0]
row_matches = (
    data['State'].eq(state)
    & data['District'].eq(district)
    & data['Season'].eq(season)
    & data['Crop'].eq(arecanut_code)
)
actual_data = data[row_matches]
print(actual_data[['Area', 'Yield']])  # Compare actual yield values

['Karnataka']
['BANGALORE RURAL']
['Rabi']
Empty DataFrame
Columns: [Area, Yield]
Index: []


In [13]:
# Count how many rows exist for each (State, District, Season, Crop) combination.
group_keys = ['State', 'District', 'Season', 'Crop']
rows_per_group = data.groupby(group_keys).size()
print(rows_per_group)

State  District  Season  Crop
0      38        0       0        2
                         3        2
                         5        2
                         41       2
                         47       2
                                 ..
35     531       4       51       6
                         52      12
                 5       38       2
                         41      23
                         44      13
Length: 35159, dtype: int64
