In [1]:
# Make the project folder the working directory: the relative 'data/...' paths used by
# later cells are resolved against it.
# NOTE(review): the /content/drive/MyDrive prefix looks like a Colab Google Drive mount;
# no mount call is visible in this notebook view — confirm Drive is mounted before running.
%cd /content/drive/MyDrive/Agriculture App/agriculture-predictor-planner

/content/drive/MyDrive/Agriculture App/agriculture-predictor-planner


In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Read the merged crop/soil/weather table (path is relative to the working directory)
merged_csv_path = 'data/merged/crop_soil_weather_merged.csv'
crop_data = pd.read_csv(merged_csv_path)

In [4]:
# Cast the district/state code and name columns to 'category' so they are handled as
# labels rather than quantities in later calculations; 'Year' is set to int64 explicitly.
column_dtypes = {
    'Dist Code': 'category',
    'Year': 'int64',
    'State Code': 'category',
    'State Name': 'category',
    'Dist Name': 'category',
}
crop_data = crop_data.astype(column_dtypes)


In [5]:
# Collect every per-crop yield column, recognised by the unit marker in its name
yield_columns = list(
    filter(lambda column_name: "YIELD (Kg per ha)" in column_name, crop_data.columns)
)

# Leave sugarcane out of the ranking: the author treats it as an outlier
# (presumably because its per-hectare yield dwarfs the other crops — TODO confirm).
# list.remove raises ValueError if the column is not present.
yield_columns.remove('SUGARCANE YIELD (Kg per ha)')


In [7]:
# Go from wide (one column per crop) to long format: one row per district, year and
# crop, with the former column name stored in 'Crop' and its value in 'Yield'.
# NOTE(review): districts are keyed by 'Dist Name' only; if a name occurs in more than
# one state those rows will be pooled downstream — confirm names are unique.
df_long = pd.melt(
    crop_data,
    id_vars=['Dist Name', 'Year'],
    value_vars=yield_columns,
    var_name='Crop',
    value_name='Yield',
)


In [None]:
# Compute the average yield for each District and Crop pair.
# observed=True limits the result to district/crop pairs that actually occur: when
# 'Dist Name' is categorical, leaving `observed` unset emits a FutureWarning on recent
# pandas and can add all-NaN rows for unused categories.
df_avg = (
    df_long
    .groupby(['Dist Name', 'Crop'], observed=True)['Yield']
    .mean()
    .reset_index()
)

# For each district, keep the 5 crops with the highest average yield.
# A stable sort followed by groupby().head(5) replaces groupby().apply(nlargest):
# apply() operating on the grouping column is deprecated (pandas 2.2), and once the
# grouping column is excluded the result would lose 'Dist Name' entirely.
# Rows come out ordered by district, then by descending yield; ties keep their df_avg
# order, and NaN averages sort last, so they are only picked when a district has fewer
# than five crops with a valid average.
top5_per_district = (
    df_avg
    .sort_values(['Dist Name', 'Yield'], ascending=[True, False], kind='stable')
    .groupby('Dist Name', observed=True, sort=False)
    .head(5)
)

print(top5_per_district.head(20))



In [None]:
# Discard the existing row labels in favour of a fresh 0..n-1 index
top5_per_district = top5_per_district.reset_index(drop=True)
print(top5_per_district.head(10))

In [None]:
# Drop the "YIELD (Kg per ha)" marker (and any whitespace before it) from each crop
# label, then trim leftover surrounding whitespace so only the crop name remains
crop_labels = top5_per_district["Crop"]
crop_labels = crop_labels.str.replace(r"\s*YIELD \(Kg per ha\)", "", regex=True)
top5_per_district["Crop"] = crop_labels.str.strip()
print(top5_per_district.head(10))

In [None]:
# Save the top-five table as CSV (the row index is not written)
file_save_path = 'data/final/top5crops_per_district.csv'

# to_csv does not create missing parent folders, so make sure 'data/final' exists first
Path(file_save_path).parent.mkdir(parents=True, exist_ok=True)

top5_per_district.to_csv(file_save_path, index=False)
print(f"Top five crops per district data saved to {file_save_path}")
