In [None]:
import numpy as np
import pandas as pd


In [None]:
# Load the raw crop-production table from a CSV file in the working directory.
CSV_PATH = 'crop_production.csv'
df = pd.read_csv(CSV_PATH)

In [None]:
# Remove every space character from the column labels so that later cells can
# refer to columns by space-free names such as 'Crop_Year'.
space_free_labels = {label: label.replace(' ', '') for label in df.columns}
df = df.rename(columns=space_free_labels)
df

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Show descriptive statistics for the frame's columns.
df.describe()

In [None]:
# Check the state names for mismatches: report every distinct value of
# df["State"] that does not appear in `state_names`.
# NOTE(review): `state_names` is not defined in any cell visible above this
# one, so this raises NameError on a fresh kernel — confirm where it is built.
unmatched_states = [name for name in df["State"].unique() if name not in state_names]
for name in unmatched_states:
    print("State name not found:", name)

In [None]:
# Display the reference collection of state names used by the mismatch check.
# NOTE(review): its definition is not visible in this notebook section — confirm it exists before this cell.
state_names

In order to address the multiple issues with the state names, the following steps will be taken:

1.  Replace "and" with "&" in the state names.
2.  Convert "CHANDIGARH" to title case ("Chandigarh") for consistency with the other state names.
3.  Correct the misspelled state name "Laddak" to "Ladakh".
4.  Merge "Dadra and Nagar Haveli" and "Daman and Diu" into a single region.

In [None]:
# Before "Dadra and Nagar Haveli" and "Daman and Diu" are merged into one
# territory, print a sample of each one's rows so their districts can be
# checked for overlap (30 rows for the first, 5 for the second).
territory_samples = (
    ("Daman and Diu", 30),
    ("Dadra and Nagar Haveli", 5),
)
for territory, n_rows in territory_samples:
    print(df.loc[df['State'] == territory].head(n_rows))

In [None]:
# Normalise the state / territory names.
#
# Step 1 rewrites ' and ' as ' & ' (e.g. 'X and Y' -> 'X & Y').
# Step 2 applies exact-match corrections: shortened names, a spelling fix,
#        a casing fix, and the merge of the two territories into one name.
#
# The merged territory name itself contains ' and '. Rows that already carry
# it are therefore excluded from step 1; otherwise re-running this cell would
# turn it into 'Dadra & Nagar Haveli & Daman & Diu', which no mapping below
# handles. With the exclusion the cell is idempotent.
MERGED_TERRITORY = 'Dadra and Nagar Haveli and Daman and Diu'

state_replacements = {
    'Andaman & Nicobar Island': 'Andaman & Nicobar',
    'Dadra & Nagar Haveli': MERGED_TERRITORY,
    'Daman & Diu': MERGED_TERRITORY,
    'THE DADRA AND NAGAR HAVELI': MERGED_TERRITORY,
    'Laddak': 'Ladakh',
    'CHANDIGARH': 'Chandigarh',
}

# regex=False: ' and ' is a literal substring; being explicit avoids relying
# on the pandas-version-dependent default of `str.replace`.
not_yet_merged = df['State'] != MERGED_TERRITORY
df.loc[not_yet_merged, 'State'] = (
    df.loc[not_yet_merged, 'State'].str.replace(' and ', ' & ', regex=False)
)
df['State'] = df['State'].replace(state_replacements)

In [None]:
# Repeat the mismatch check after the clean-up; any name printed here is still
# absent from `state_names`.
for unknown_state in (s for s in df["State"].unique() if s not in state_names):
    print("State name not found:", unknown_state)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Show the rows whose Crop value is missing.
df.loc[df['Crop'].isna()]

In [None]:
# Fraction of rows (0-1) whose Crop value is missing.
n_missing_crop = df['Crop'].isna().sum()
n_missing_crop / len(df)

In [None]:
# Drop the rows whose Crop value is missing. This modifies `df` itself
# (inplace=True), so the cell's effect persists for every later cell.
df.dropna(subset=['Crop'], inplace=True)

In [None]:
# Percentage of rows whose Production value is missing.
n_missing_production = df['Production'].isna().sum()
n_rows = len(df)
(n_missing_production / n_rows) * 100

In [None]:
# Impute missing Production values with the column median.
#
# The filled column is assigned back to `df` instead of calling
# `df['Production'].fillna(..., inplace=True)`. That form is a chained
# in-place operation on a column selection: pandas 2.x emits a FutureWarning
# for it, and with Copy-on-Write enabled it leaves `df` unchanged.
production_median = df['Production'].median()
df['Production'] = df['Production'].fillna(production_median)

In [None]:
# Re-count the missing values per column after the drop / fill steps above.
df.isnull().sum()

In [None]:
# Display the current column labels.
df.columns

In [None]:
# Column groups used by the plotting loops below.
# `cat`: columns plotted as categorical histograms.
cat = [
    'State',
    'District',
    'Crop',
    'Season',
]
# `num`: columns plotted as numeric histograms and box plots.
num = [
    'Crop_Year',
    'Area',
    'Production',
    'Yield',
]

In [None]:
# Initialise plotly's notebook mode for this session.
# NOTE(review): `iplot` is imported but not used in the visible cells.
from plotly.offline import  init_notebook_mode, iplot
init_notebook_mode(connected = True)

In [None]:
import plotly.express as px
# One count histogram per categorical column, coloured by category value.
for column in cat:
    histogram = px.histogram(df, x=column, color=column, title=column)
    histogram.update_layout(
        height=600,
        xaxis_title='',
        yaxis_title='Count',
        showlegend=False,
    )
    histogram.show()

Let's also visualize the number of districts in each state. This gives an overview of how the districts are distributed across the states and helps identify which states have many districts and which have few.

In [None]:
# Count the distinct districts recorded for each state and plot them as bars.
districts_per_state = df.groupby('State')['District'].nunique()
state_counts = districts_per_state.reset_index()

fig = px.bar(
    state_counts,
    x='State',
    y='District',
    color='State',
    title='District Counts by State',
    labels={'State': 'State', 'District': 'District Count'},
)
fig.update_layout(showlegend=False, height=600)
fig.show()


In [None]:
# For every numeric column draw a histogram, then a horizontal box plot.
shared_layout = dict(height=400, showlegend=False, xaxis_title='')
for column in num:
    histogram = px.histogram(df, x=column)
    histogram.update_layout(title=column, title_font_size=20, **shared_layout)
    histogram.show()

    box_plot = px.box(df, x=column, orientation='h')
    box_plot.update_layout(**shared_layout)
    box_plot.show()


In [None]:
# Number of records for every (State, Crop) pair.
crop_counts = df.groupby(['State', 'Crop']).size().reset_index(name='Count')

# For each state keep the one row with the largest Count (first row on ties).
# The row labels are looked up with a grouped idxmax and selected via .loc,
# not with `groupby('State').apply(...)`. The apply form depends on the
# grouping column being passed into the applied function, which pandas 2.2
# deprecates; once it is excluded, `reset_index(drop=True)` would discard the
# State column that the choropleth below needs.
top_row_labels = crop_counts.groupby('State')['Count'].idxmax()
most_common_crop = crop_counts.loc[top_row_labels].reset_index(drop=True)

In [None]:
# Display the per-state table of the most frequently recorded crop.
most_common_crop

In [None]:
# Choropleth coloured by the most frequently recorded crop of each state.
# States are matched to GeoJSON features through the 'properties.ST_NM' key.
# NOTE(review): `geojson_url` is not defined in the visible cells — confirm it
# is set before this cell runs.
crop_map_layout = dict(
    title_text='Most Commonly Grown Crops in India by State',
    title_x=0.5,
    title_y=0.9,
    title_font=dict(size=24),
    margin={'l': 0, 'r': 0, 't': 30, 'b': 0},
    height=600,
    width=1000,
    autosize=True,
)

fig = px.choropleth(
    most_common_crop,
    geojson=geojson_url,
    featureidkey='properties.ST_NM',
    locations='State',
    color='Crop',
)
fig.update_geos(visible=False, fitbounds="locations")
fig.update_layout(**crop_map_layout)
fig.show()

In [None]:
# Total production for every (Crop, Season) pair, as a flat table.
production_by_crop_season = df.groupby(['Crop', 'Season'])['Production'].sum()
season_production = production_by_crop_season.reset_index()

In [None]:
# Distinct season labels, in order of first appearance in `season_production`.
season_list = season_production['Season'].unique()

In [None]:
# For every season, plot the ten crops with the highest total production.
for season in season_list:
    season_rows = season_production[season_production['Season'] == season]
    top_crops = season_rows.sort_values('Production', ascending=False).head(10)
    fig = px.bar(
        top_crops,
        x='Crop',
        y='Production',
        title='Highest Crop Production in {}'.format(season),
    )
    fig.show()

In [None]:
# Total production for every (State, Season) pair, as a flat table.
production_by_state_season = df.groupby(['State', 'Season'])['Production'].sum()
state_production = production_by_state_season.reset_index()

In [None]:
# Choropleth of total production per state for the 'Whole Year ' rows only.
# The Season literal keeps its trailing space, as it is written elsewhere in
# this notebook.
# NOTE(review): `geojson_url` is not defined in the visible cells — confirm it
# is set before this cell runs.
whole_year_rows = state_production[state_production['Season'] == 'Whole Year ']

fig = px.choropleth(
    whole_year_rows,
    geojson=geojson_url,
    featureidkey='properties.ST_NM',
    locations='State',
    color='Production',
    color_continuous_scale='YlOrRd',
)
fig.update_geos(visible=False, fitbounds="locations")

whole_year_layout = {
    'title_text': 'production throughout the whole year by state',
    'title_x': 0.5,
    'title_y': 0.9,
    'title_font': dict(size=24),
    'margin': {'l': 0, 'r': 0, 't': 30, 'b': 0},
    'height': 600,
    'width': 1000,
    'autosize': True,
}
fig.update_layout(**whole_year_layout)

# Render the interactive map.
fig.show()


In [None]:
# Animated choropleth: one frame per Season, states coloured by total production.
# NOTE(review): `geojson_url` is not defined in the visible cells — confirm it
# is set before this cell runs.
choropleth_args = dict(
    geojson=geojson_url,
    featureidkey='properties.ST_NM',
    locations='State',
    animation_frame='Season',
    color='Production',
    color_continuous_scale='YlOrRd',
)
fig = px.choropleth(state_production, **choropleth_args)

fig.update_geos(visible=False, fitbounds="locations")
fig.update_layout(
    title_text='Production for each season by State',
    title_x=0.5,
    title_y=0.9,
    title_font=dict(size=24),
    # Larger top margin than the other maps in this notebook (120 vs 30).
    margin={'l': 0, 'r': 0, 't': 120, 'b': 0},
    height=500,
    width=800,
    autosize=True,
)

# Render the interactive map.
fig.show()

In [None]:
# Share of total production contributed by each season, leaving out the
# 'Whole Year ' rows.
# NOTE(review): this rebinds `season_production`, which an earlier cell built
# per (Crop, Season). Re-running that earlier per-season loop after this cell
# would see the Season-only totals instead.
is_single_season = df['Season'] != 'Whole Year '
crop_seasons = df[is_single_season]
season_production = crop_seasons.groupby('Season')['Production'].sum().reset_index()

fig = px.pie(
    season_production,
    values='Production',
    names='Season',
    title='Percentage of Production for Crops in Each Season',
)
fig.show()


In [None]:
# Total production for every (State, Crop_Year) pair, ordered by year so the
# animation frames built from it run chronologically.
state_production_per_year = (
    df.groupby(['State', 'Crop_Year'])['Production']
    .sum()
    .reset_index()
    .sort_values('Crop_Year')
)

In [None]:
# Animated choropleth: one frame per Crop_Year, states coloured by total production.
# NOTE(review): `geojson_url` is not defined in the visible cells — confirm it
# is set before this cell runs.
yearly_map_layout = dict(
    title_text='Crop Production Over the Years by State',
    title_x=0.5,
    title_y=0.9,
    title_font=dict(size=24),
    margin={'l': 0, 'r': 0, 't': 30, 'b': 0},
    height=600,
    width=800,
    autosize=True,
)

fig = px.choropleth(
    state_production_per_year,
    geojson=geojson_url,
    featureidkey='properties.ST_NM',
    locations='State',
    animation_frame='Crop_Year',
    color='Production',
    color_continuous_scale='YlOrRd',
)
fig.update_geos(visible=False, fitbounds="locations")
fig.update_layout(**yearly_map_layout)

# Render the interactive, animated map.
fig.show()


In [None]:
# One line per state: total production against crop year.
fig = px.line(
    state_production_per_year,
    x='Crop_Year',
    y='Production',
    color='State',
    title='Production Changes in Each State Over the Years',
)
fig.show()


In [None]:
# Total production per crop year as a bar chart, with the peak year annotated.
yearly_production = df.groupby('Crop_Year')['Production'].sum().reset_index()

# Row label of the year with the highest total; both annotation coordinates
# are read from that row.
peak_row = yearly_production['Production'].idxmax()
max_production_year = yearly_production.loc[peak_row, 'Crop_Year']
peak_production = yearly_production.loc[peak_row, 'Production']

fig = px.bar(
    yearly_production,
    x='Crop_Year',
    y='Production',
    title='Overall Production by Year',
)
fig.update_layout(xaxis_title='Year', yaxis_title='Production')
fig.add_annotation(
    x=max_production_year,
    y=peak_production,
    text='Max Production Year',
    showarrow=True,
    arrowhead=1,
)
fig.show()
