In [1]:
# Switch the working directory to the project folder so the relative data path
# used when loading the CSV below resolves.
# NOTE(review): the /content/drive/MyDrive prefix looks like a Google Colab Drive
# mount - confirm, and adjust the path when running anywhere else.
%cd /content/drive/MyDrive/Agriculture App/agriculture-predictor-planner

/content/drive/MyDrive/Agriculture App/agriculture-predictor-planner


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Close any figures left open by earlier runs. plt.close('all') is used instead of
# plt.clf() because clf() acts on the *current* figure and creates a new, empty one
# when none exists, which shows up as a stray blank figure on a fresh kernel.
plt.close('all')
# Apply seaborn's "whitegrid" theme to every plot drawn after this cell.
sns.set_theme(style="whitegrid")


In [None]:
# Load the merged crop / soil / weather table (path is relative to the working directory).
merged_csv_path = 'data/merged/crop_soil_weather_merged.csv'
crop_data = pd.read_csv(merged_csv_path)


In [None]:
# Lift the column display limit so wide frames are shown in full from here on.
pd.options.display.max_columns = None
# Print the full per-column summary (dtype and non-null count for every column).
crop_data.info(verbose=True)

In [None]:
# Store the identifier columns (district/state codes and names) as categoricals so
# they are not treated as numeric quantities in later calculations. 'Year' is cast
# to a plain integer so it can still be used as a grouping key and plot axis.
crop_data = crop_data.astype({'Dist Code':'category', 'Year':'int64', 'State Code':'category', 'State Name':'category', 'Dist Name':'category'})

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
crop_data.info(verbose=True)

In [None]:
# Preview the first five rows (head() defaults to n=5).
crop_data.head()

In [6]:
# Crop names used to build the "<CROP> YIELD (Kg per ha)" column names below.
crops = [
    'RICE',
    'WHEAT',
    'KHARIF SORGHUM',
    'RABI SORGHUM',
    'SORGHUM',
    'PEARL MILLET',
    'MAIZE',
    'FINGER MILLET',
    'BARLEY',
    'CHICKPEA',
    'PIGEONPEA',
    'MINOR PULSES',
    'GROUNDNUT',
    'SESAMUM',
    'RAPESEED AND MUSTARD',
    'SAFFLOWER',
    'CASTOR',
    'SUNFLOWER',
    'SOYABEAN',
    'OILSEEDS',
    'SUGARCANE',
    'COTTON',
]


1. Visualising crop yield data through histograms

In [None]:
# Lay the histograms out on a grid, one panel per crop (tune n_cols / figsize as needed).
n_cols = 2
n_rows = -(-len(crops) // n_cols)  # ceiling division: enough rows to hold every crop
hist_fig = plt.figure(figsize=(20, n_rows * 8))

# Draw one yield histogram per crop, filling the grid row by row.
for panel_no, crop in enumerate(crops, start=1):
    col_name = f"{crop} YIELD (Kg per ha)"
    panel = hist_fig.add_subplot(n_rows, n_cols, panel_no)
    # The KDE (kernel density estimate) overlay gives a smooth view of the distribution.
    sns.histplot(crop_data[col_name], bins=30, kde=True, edgecolor='black', ax=panel)
    panel.set_title(f"{crop} Yield Distribution")
    panel.set_xlabel("Yield (Kg per ha)")
    panel.set_ylabel("Frequency")

hist_fig.tight_layout()
plt.show()

2. Visualising crop yield data over the years

In [None]:
# Collect every column that holds a crop yield.
yield_columns = [name for name in crop_data.columns if "YIELD (Kg per ha)" in name]

# Mean yield of each crop per year, with 'Year' kept as a regular column.
yearly_yields = crop_data.groupby('Year', as_index=False)[yield_columns].mean()

trend_fig, trend_ax = plt.subplots(figsize=(30, 15))

# One line per crop; the legend label is the column name up to " YIELD".
for yield_col in yield_columns:
    sns.lineplot(
        data=yearly_yields,
        x='Year',
        y=yield_col,
        marker='o',
        label=yield_col.partition(" YIELD")[0],
        ax=trend_ax,
    )

trend_ax.set_title("Average Crop Yields Over the Years")
trend_ax.set_xlabel("Year")
trend_ax.set_ylabel("Yield (Kg per ha)")
trend_ax.legend(title="Crop")
trend_ax.grid(True)
plt.show()





3. Visualising yearly crop yield trends, excluding sugarcane (an outlier)

In [None]:
# Yield columns left out of this chart: sugarcane, plus the two seasonal
# (kharif / rabi) sorghum columns.
excluded_columns = {
    'SUGARCANE YIELD (Kg per ha)',
    'KHARIF SORGHUM YIELD (Kg per ha)',
    'RABI SORGHUM YIELD (Kg per ha)',
}

# Keep every crop yield column that is not in the exclusion set.
yield_columns = [
    name
    for name in crop_data.columns
    if "YIELD (Kg per ha)" in name and name not in excluded_columns
]
print(yield_columns)

# Mean yield of each remaining crop per year, with 'Year' kept as a regular column.
yearly_yields = crop_data.groupby('Year', as_index=False)[yield_columns].mean()

trend_fig, trend_ax = plt.subplots(figsize=(20, 15))
colors = sns.color_palette("tab20", n_colors=len(yield_columns))

# Pair each column with its own palette colour; the legend label is the column
# name up to " YIELD".
for yield_col, line_color in zip(yield_columns, colors):
    sns.lineplot(
        data=yearly_yields,
        x='Year',
        y=yield_col,
        marker='o',
        color=line_color,
        label=yield_col.partition(" YIELD")[0],
        ax=trend_ax,
    )

trend_ax.set_title("Average Crop Yields Over the Years (Excluding Sugarcane)")
trend_ax.set_xlabel("Year")
trend_ax.set_ylabel("Yield (Kg per ha)")
trend_ax.legend(title="Crop")
trend_ax.grid(True)
plt.show()


3. Visualising distribution of crop yields across states

In [None]:
# Identify all yield columns (those that contain "YIELD (Kg per ha)")
yield_columns = [col for col in crop_data.columns if "YIELD (Kg per ha)" in col]

# Melt the DataFrame into a long format so each row holds a state, a crop, and a yield value.
df_long = crop_data.melt(id_vars=['State Name'], value_vars=yield_columns,
                  var_name='Crop', value_name='Yield')

# Clean up the crop names by extracting, e.g., "RICE" from "RICE YIELD (Kg per ha)"
df_long['Crop'] = df_long['Crop'].str.split(" YIELD").str[0]

# Sorted list of state names; one subplot is drawn per state.
states = sorted(df_long['State Name'].unique())
num_states = len(states)

# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without it a
# single-state dataset yields one bare Axes object and the zip() below fails.
# ravel() flattens the (num_states, 1) array back to the 1-D sequence the loop expects.
fig, axes = plt.subplots(num_states, 1, figsize=(15, 8 * num_states), sharex=False, squeeze=False)
axes = axes.ravel()

# Loop through each state and create a boxplot
for ax, state in zip(axes, states):
    # Filter data for the current state
    state_data = df_long[df_long['State Name'] == state]

    # One box per crop; hue mirrors x so that the palette colours each crop
    # separately, and dodge=False keeps each box centred on its tick.
    sns.boxplot(x='Crop', y='Yield', hue='Crop', data=state_data, ax=ax, palette="Set3", dodge=False)

    # Fix the tick positions before relabelling so the rotated labels line up with the boxes.
    unique_crops = state_data['Crop'].unique()
    ax.set_xticks(range(len(unique_crops)))
    ax.set_xticklabels(unique_crops, rotation=45, ha='right')

    ax.set_title(f"Crop Yield Distribution in {state}", fontweight="bold", fontsize=16, fontname='DejaVu Sans')
    ax.set_ylabel("Yield (Kg per ha)", fontweight="bold")
    ax.set_xlabel("Crop", fontweight="bold")

plt.tight_layout()
plt.show()


4. State-wise fertiliser consumption

In [None]:
# Fertiliser consumption columns to compare.
fert_columns = [ 'NITROGEN CONSUMPTION (tons)', 'PHOSPHATE CONSUMPTION (tons)', 'POTASH CONSUMPTION (tons)']

# Melt the DataFrame into a long format so each row holds a state, a fertiliser, and its consumption.
df_long = crop_data.melt(id_vars=['State Name'], value_vars=fert_columns,
                  var_name='Fertiliser', value_name='Consumption')

# Clean up the fertiliser names by dropping the " CONSUMPTION (tons)" suffix
df_long['Fertiliser'] = df_long['Fertiliser'].str.split(" CONSUMPTION").str[0]

# Sorted list of state names; one subplot is drawn per state.
states = sorted(df_long['State Name'].unique())
num_states = len(states)

# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without it a
# single-state dataset yields one bare Axes object and the zip() below fails.
# ravel() flattens the (num_states, 1) array back to the 1-D sequence the loop expects.
# sharey=True puts every state on the same consumption scale.
fig, axes = plt.subplots(num_states, 1, figsize=(8, 5 * num_states), sharex=False, sharey=True, squeeze=False)
axes = axes.ravel()

# Loop through each state and create a boxplot
for ax, state in zip(axes, states):
    # Filter data for the current state
    state_data = df_long[df_long['State Name'] == state]

    # One box per fertiliser; hue mirrors x so that the palette colours each
    # fertiliser separately, and dodge=False keeps each box centred on its tick.
    sns.boxplot(x='Fertiliser', y='Consumption', hue='Fertiliser', data=state_data, ax=ax, palette="Set3", dodge=False)

    # Fix the tick positions before relabelling so the rotated labels line up with the boxes.
    ax.set_xticks(range(len(fert_columns)))
    ax.set_xticklabels(df_long['Fertiliser'].unique(), rotation=45, ha='right')

    ax.set_title(f"Fertiliser Consumption in {state}", fontweight="bold", fontsize=16, fontname='Liberation Mono')
    ax.set_ylabel("Consumption (tons)", fontweight="bold")
    ax.set_xlabel("Fertiliser", fontweight="bold")

plt.tight_layout()
plt.show()




5. Creating interactive yearly yield chart

In [None]:
# Build the long-form frame this chart needs directly from crop_data. The df_long
# left behind by the fertiliser cell only has 'State Name', 'Fertiliser' and
# 'Consumption', so it cannot supply the 'Year' / 'Yield' / 'Crop' columns used here.
all_yield_columns = [col for col in crop_data.columns if "YIELD (Kg per ha)" in col]

yearly_yield_long = (
    crop_data
    .melt(id_vars=['Year'], value_vars=all_yield_columns, var_name='Crop', value_name='Yield')
    # Shorten "RICE YIELD (Kg per ha)" to "RICE" for the legend.
    .assign(Crop=lambda frame: frame['Crop'].str.split(" YIELD").str[0])
    # Average over all rows of a year so each crop is drawn as a single line.
    .groupby(['Year', 'Crop'], as_index=False)['Yield']
    .mean()
)

# Plotly draws its own figure, so no matplotlib figure or seaborn palette is created here.
fig = px.line(yearly_yield_long, x="Year", y="Yield", color="Crop", markers=True, title="Interactive Crop Yields Over Years")
fig.show()


In [None]:
# Interactive yearly-yield chart restricted to a handful of crops.
# NOTE: this rebinds `crops`, `yield_columns` and `df_long`, replacing the values
# assigned by earlier cells.
crops = ['RICE', 'WHEAT', 'MAIZE', 'CHICKPEA',  'GROUNDNUT', 'SESAMUM', 'COTTON']
yield_columns = [crop + " YIELD (Kg per ha)" for crop in crops]

# Long format: one row per (Year, Crop, Yield) observation.
df_long = crop_data.melt(id_vars="Year", value_vars=yield_columns, var_name="Crop", value_name="Yield")
# Shorten "RICE YIELD (Kg per ha)" to "RICE".
df_long["Crop"] = df_long["Crop"].str.split(" YIELD").str[0]

# Mean yield per crop and year, so each crop is drawn as a single line.
df_agg = df_long.groupby(["Year", "Crop"], as_index=False)["Yield"].mean()

fig = (
    px.line(df_agg, x="Year", y="Yield", color="Crop", title="Interactive Crop Yields Over Years")
    # Layout: axis/legend titles, a clean white template, and a single hover box
    # listing every crop at the hovered year.
    .update_layout(
        xaxis_title="Year",
        yaxis_title="Yield (Kg per ha)",
        legend_title="Crop",
        template="plotly_white",
        hovermode="x unified",
    )
    # Traces: slightly thicker, slightly transparent lines so overlaps stay visible.
    .update_traces(line=dict(width=2), opacity=0.8)
)

fig.show()




In [29]:
import matplotlib.font_manager as fm

# Gather the distinct names of the fonts matplotlib has registered, then show
# them in alphabetical order.
font_names = set()
for font_entry in fm.fontManager.ttflist:
    font_names.add(font_entry.name)

available_fonts = sorted(font_names)
print(available_fonts)

['DejaVu Sans', 'DejaVu Sans Display', 'DejaVu Sans Mono', 'DejaVu Serif', 'DejaVu Serif Display', 'Humor Sans', 'Liberation Mono', 'Liberation Sans', 'Liberation Sans Narrow', 'Liberation Serif', 'STIXGeneral', 'STIXNonUnicode', 'STIXSizeFiveSym', 'STIXSizeFourSym', 'STIXSizeOneSym', 'STIXSizeThreeSym', 'STIXSizeTwoSym', 'cmb10', 'cmex10', 'cmmi10', 'cmr10', 'cmss10', 'cmsy10', 'cmtt10']


In [16]:
# Distinct district names, kept both as the pandas result and as a plain list.
district_values = crop_data['Dist Name'].unique()
unique_districts = district_values.tolist()

# Both counts are printed to confirm the list conversion kept every entry.
print(len(unique_districts))
print(len(district_values))

302
302


In [None]:

# Interactively collect one numeric soil type per (state, district) pair.
# NOTE: this cell blocks on input() once for every district, so it cannot run unattended.
soil_data_list = []

# Iterate over each unique state. The state column in crop_data is 'State Name'
# (the name used by the dtype conversion and the state-wise plots).
for state in crop_data['State Name'].unique():
    # Districts that appear in rows belonging to the current state
    districts = crop_data.loc[crop_data['State Name'] == state, 'Dist Name'].unique()
    print(f"\nState: {state}")

    for district in districts:
        # Keep prompting until the answer parses as a float.
        while True:
            try:
                soil_type = float(input(f"Enter numeric soil type for district '{district}' in state '{state}': "))
                break
            except ValueError:
                print("Invalid input. Please enter a numeric value.")

        # Record the entry; the list of dicts becomes a DataFrame in one step below.
        soil_data_list.append({
            'State': state,
            'District': district,
            'Soil Type': soil_type
        })

# Convert the list of dictionaries to a Pandas DataFrame
soil_data_df = pd.DataFrame(soil_data_list)

# Display the resulting DataFrame
print("\nSoil Type Data Table:")
print(soil_data_df)



In [None]:
# Write the collected soil types to a CSV in the working directory (row index
# omitted) so they can be reused later.
soil_csv_path = "soil_type_by_district.csv"
soil_data_df.to_csv(soil_csv_path, index=False)
print(f"\nSoil type data saved to '{soil_csv_path}'.")
