In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import folium
from folium import plugins
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Plot appearance: apply the matplotlib style sheet first, then install seaborn's
# "husl" palette as the colour cycle (in that order, as the style sheet sets its own).
plt.style.use('seaborn-v0_8')
sns.set_palette(sns.color_palette("husl"))

: 

In [3]:
# Location of the rice dataset, relative to this notebook.
RICE_DATA_CSV = '../Datasets/rice_data.csv'

# Raw records exactly as stored in the CSV.
df = pd.read_csv(RICE_DATA_CSV)

In [None]:
# Latitude/longitude (decimal degrees) of one representative point per state, used to
# place a single marker for each state on the maps further down.
state_coordinates = {
    'Andhra Pradesh': [15.9129, 79.7400],
    'Assam': [26.2006, 92.9376],
    'Bihar': [25.0961, 85.3131],
    'Chhattisgarh': [21.2787, 81.8661],
    'Gujarat': [23.0225, 72.5714],
    'Haryana': [29.0588, 76.0856],
    'Himachal Pradesh': [31.1048, 77.1734],
    'Jharkhand': [23.6102, 85.2799],
    'Karnataka': [15.3173, 75.7139],
    'Kerala': [10.8505, 76.2711],
    'Madhya Pradesh': [22.9734, 78.6569],
    'Maharashtra': [19.7515, 75.7139],
    'Manipur': [24.6637, 93.9063],
    'Meghalaya': [25.4670, 91.3662],
    'Mizoram': [23.1645, 92.9376],
    'Nagaland': [26.1584, 94.5624],
    'Odisha': [20.9517, 85.0985],
    'Punjab': [31.1471, 75.3412],
    'Rajasthan': [27.0238, 74.2179],
    'Sikkim': [27.5330, 88.5122],
    'Tamil Nadu': [11.1271, 78.6569],
    'Telangana': [18.1124, 79.0193],
    'Tripura': [23.9408, 91.9882],
    'Uttar Pradesh': [26.8467, 80.9462],
    'Uttarakhand': [30.0668, 79.0193],
    'West Bengal': [22.9868, 87.8550]
}

# One row per state. The 'Coordinates' list column is kept next to the split
# 'Latitude' / 'Longitude' columns so the frame keeps the same four columns as before.
state_coords_df = pd.DataFrame(
    [(state, [lat, lon], lat, lon) for state, (lat, lon) in state_coordinates.items()],
    columns=['State Name', 'Coordinates', 'Latitude', 'Longitude']
)

# Attach coordinates to every record.
# - The CSV's leftover index column is kept out of the geo frame; errors='ignore'
#   makes this safe whether or not it has already been removed from `df`.
# - validate= fails loudly if the lookup table could ever duplicate data rows.
df_geo = df.drop(columns=['Unnamed: 0'], errors='ignore').merge(
    state_coords_df[['State Name', 'Latitude', 'Longitude']],
    on='State Name', how='left', validate='many_to_one'
)

# Coverage is reported per *state* as well as per row, and any state whose spelling
# is missing from the lookup table is named: a left merge gives those rows NaN
# coordinates without any error.
states_in_data = df_geo['State Name'].dropna().unique()
unmatched_states = sorted(str(state) for state in set(states_in_data) - set(state_coordinates))

print("Geographical data preparation completed!")
print(f"Data shape: {df_geo.shape}")
print(f"States with coordinates: {len(states_in_data) - len(unmatched_states)} / {len(states_in_data)}")
print(f"Rows with coordinates: {df_geo['Latitude'].notna().sum()} / {len(df_geo)}")
if unmatched_states:
    print(f"States without coordinates: {', '.join(unmatched_states)}")


In [4]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [5]:
# First three rows as loaded (this runs before the 'Unnamed: 0' column is dropped).
df.head(3)

In [6]:
# Remove the index column left behind in the CSV. Rebinding the result (rather than
# inplace=True) together with errors='ignore' lets this cell be re-run on an already
# cleaned frame without raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# First five rows after the 'Unnamed: 0' column has been dropped.
df.head()

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# 1. Univariate Analysis


In [9]:
# State Name: number of records per state, with vertical tick labels for legibility.
ax = sns.countplot(data=df, x='State Name')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [10]:
# Number of rows per state, most frequent first.
df['State Name'].value_counts()

In [11]:
# RICE AREA (1000 ha): histogram with a kernel-density overlay.
sns.histplot(data=df, x='RICE AREA (1000 ha)', kde=True)

In [12]:
# RICE AREA (1000 ha): horizontal box plot to expose spread and outliers.
sns.boxplot(data=df, x='RICE AREA (1000 ha)')

In [13]:
# Sample skewness of rice area. A positive value means a long right tail, which is
# what the original note ("right skewed") records for this column.
df['RICE AREA (1000 ha)'].skew()

In [14]:
# RICE PRODUCTION (1000 tons): histogram with a kernel-density overlay.
sns.histplot(data=df, x='RICE PRODUCTION (1000 tons)', kde=True)

In [15]:
# RICE PRODUCTION (1000 tons): horizontal box plot to expose spread and outliers.
sns.boxplot(data=df, x='RICE PRODUCTION (1000 tons)')

In [16]:
# Sample skewness of rice production. A positive value means a long right tail, which
# is what the original note ("right skewed") records for this column.
df['RICE PRODUCTION (1000 tons)'].skew()

In [17]:
# RICE YIELD (Kg per ha): histogram with a kernel-density overlay.
sns.histplot(data=df, x='RICE YIELD (Kg per ha)', kde=True)

In [18]:
# RICE YIELD (Kg per ha): horizontal box plot to expose spread and outliers.
sns.boxplot(data=df, x='RICE YIELD (Kg per ha)')

In [19]:
# Sample skewness of rice yield. The original note says "left skewed", which would
# correspond to a negative value here.
# NOTE(review): confirm the sign against the cell output; it cannot be checked from the code.
df['RICE YIELD (Kg per ha)'].skew()

# 2. Bivariate Analysis


In [None]:
# Per-state summary of the rice metrics, later joined with the state coordinates.
# Each source column maps to the statistics wanted for it; the output columns are
# named "<source column>_<statistic>".
state_aggregations = {
    'RICE AREA (1000 ha)': ('sum', 'mean'),
    'RICE PRODUCTION (1000 tons)': ('sum', 'mean'),
    'RICE YIELD (Kg per ha)': ('mean', 'std'),
    'Year': ('count',),
}
named_aggregations = {
    f'{column}_{statistic}': (column, statistic)
    for column, statistics in state_aggregations.items()
    for statistic in statistics
}

state_metrics = (
    df_geo
    .groupby('State Name')
    .agg(**named_aggregations)
    .round(2)
    .reset_index()
)

# Left join so states without a coordinate entry stay in the table (with NaN lat/lon).
coordinate_lookup = state_coords_df[['State Name', 'Latitude', 'Longitude']]
state_metrics_geo = state_metrics.merge(coordinate_lookup, on='State Name', how='left')

print("State-wise aggregated metrics:")
print(state_metrics_geo.head())


In [None]:
# Four state-level maps in a 2x2 grid, one metric per panel.
#
# Why markers instead of filled regions: locationmode='country names' only resolves
# country names, so Indian state names are never matched and choropleth traces built
# that way draw nothing. A filled state map needs a state-boundary GeoJSON (passed via
# `geojson=` / `featureidkey=`), which this notebook does not have, so each state is
# drawn as a colour-coded marker at the coordinates already joined onto
# `state_metrics_geo`.
map_panels = [
    # (row, col, metric column, colorscale, colorbar title, hover line)
    (1, 1, 'RICE YIELD (Kg per ha)_mean', 'Viridis', "Yield (Kg/ha)",
     'Avg Yield: %{customdata:.0f} Kg/ha'),
    (1, 2, 'RICE AREA (1000 ha)_sum', 'Blues', "Area (1000 ha)",
     'Total Area: %{customdata:.0f} 1000 ha'),
    (2, 1, 'RICE PRODUCTION (1000 tons)_sum', 'Reds', "Production (1000 tons)",
     'Total Production: %{customdata:.0f} 1000 tons'),
    # NOTE(review): 'Year_count' is the number of non-null 'Year' rows per state. It only
    # equals a district count if each district contributes exactly one row - confirm.
    (2, 2, 'Year_count', 'Oranges', "Districts Count",
     'Districts: %{customdata}'),
]

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Average Rice Yield (Kg/ha)', 'Total Rice Area (1000 ha)',
                    'Total Rice Production (1000 tons)', 'Number of Districts'),
    specs=[[{"type": "scattergeo"}, {"type": "scattergeo"}],
           [{"type": "scattergeo"}, {"type": "scattergeo"}]]
)

# Every trace carries its own colorbar; without explicit positions all four are drawn
# on top of each other at the right edge. Anchor each one beside its own panel
# (paper coordinates, keyed by subplot column / row).
colorbar_x = {1: 0.46, 2: 1.02}
colorbar_y = {1: 0.79, 2: 0.21}

for row, col, metric, colorscale, colorbar_title, hover_line in map_panels:
    fig.add_trace(
        go.Scattergeo(
            lon=state_metrics_geo['Longitude'],
            lat=state_metrics_geo['Latitude'],
            text=state_metrics_geo['State Name'],
            customdata=state_metrics_geo[metric],  # raw value for the hover text
            mode='markers',
            marker=dict(
                size=14,
                color=state_metrics_geo[metric],
                colorscale=colorscale,
                showscale=True,
                colorbar=dict(title=colorbar_title, len=0.42,
                              x=colorbar_x[col], y=colorbar_y[row]),
                line=dict(width=1, color='white')
            ),
            hovertemplate='<b>%{text}</b><br>' + hover_line + '<extra></extra>'
        ),
        row=row, col=col
    )

fig.update_layout(
    title_text="Rice Production Analysis - State-wise Distribution",
    title_x=0.5,
    height=800,
    showlegend=False
)

# update_geos styles all four geo subplots; a `geo=` entry in update_layout would only
# reach the first one (the others are geo2, geo3 and geo4).
fig.update_geos(
    scope='asia',
    showland=True,
    landcolor='lightgray',
    showocean=True,
    oceancolor='lightblue',
    center=dict(lat=20, lon=80),
    projection_scale=2,
    lonaxis=dict(range=[65, 95]),
    lataxis=dict(range=[5, 35])
)

fig.show()


In [None]:
# Bubble map: bubble diameter encodes total production, colour encodes mean yield.
production_total = state_metrics_geo['RICE PRODUCTION (1000 tons)_sum']
yield_mean = state_metrics_geo['RICE YIELD (Kg per ha)_mean']

# Scattergeo markers have no `sizemax` property (passing one raises ValueError); the
# upper bound is set through `sizeref` instead. In 'diameter' mode the rendered size
# is size / sizeref, so this makes the largest bubble MAX_BUBBLE_PX pixels wide.
MAX_BUBBLE_PX = 50
MIN_BUBBLE_PX = 5
largest_production = production_total.max()
# Fall back to 1 when there is nothing to scale by (all-zero or missing totals).
bubble_sizeref = largest_production / MAX_BUBBLE_PX if largest_production > 0 else 1

fig = go.Figure()

fig.add_trace(go.Scattergeo(
    lon=state_metrics_geo['Longitude'],
    lat=state_metrics_geo['Latitude'],
    text=state_metrics_geo['State Name'],
    # The hover text reads production from customdata so it shows the real total
    # rather than a value derived from the marker size.
    customdata=production_total,
    mode='markers',
    marker=dict(
        size=production_total,
        sizemode='diameter',
        sizeref=bubble_sizeref,
        sizemin=MIN_BUBBLE_PX,
        color=yield_mean,
        colorscale='Viridis',
        colorbar=dict(title="Average Yield (Kg/ha)"),
        line=dict(width=2, color='white'),
        opacity=0.8
    ),
    hovertemplate='<b>%{text}</b><br>' +
                  'Production: %{customdata:.0f} 1000 tons<br>' +
                  'Yield: %{marker.color:.0f} Kg/ha<br>' +
                  '<extra></extra>',
    name='Rice Production'
))

fig.update_layout(
    title='Rice Production & Yield Analysis - Interactive Bubble Map',
    geo=dict(
        scope='asia',
        showland=True,
        landcolor='lightgray',
        showocean=True,
        oceancolor='lightblue',
        showlakes=True,
        lakecolor='lightblue',
        showrivers=True,
        rivercolor='lightblue',
        center=dict(lat=20, lon=80),
        projection_scale=2,
        lonaxis=dict(range=[65, 95]),
        lataxis=dict(range=[5, 35])
    ),
    height=600
)

fig.show()


In [None]:
# Heatmaps of yield, production and area by state and year, plus a correlation matrix.

def _plot_state_year_heatmap(value_col, aggfunc, ax, cmap, cbar_label, title):
    """Draw a State x Year heatmap of one metric and return the underlying pivot table.

    Parameters
    ----------
    value_col : str
        Column of `df_geo` to aggregate.
    aggfunc : str
        Aggregation applied per (state, year) cell, e.g. 'mean' or 'sum'.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    cmap : str
        Colormap name.
    cbar_label : str
        Label of the colorbar.
    title : str
        Axes title.

    Returns
    -------
    pandas.DataFrame
        The pivot table (index: state, columns: year). State/year combinations with
        no rows stay NaN, so seaborn leaves them blank; filling them with 0 would draw
        them as real zero values and stretch the colour scale down to zero.
    """
    pivot = df_geo.pivot_table(
        values=value_col,
        index='State Name',
        columns='Year',
        aggfunc=aggfunc
    )
    sns.heatmap(pivot,
                annot=True,
                fmt='.0f',
                cmap=cmap,
                ax=ax,
                cbar_kws={'label': cbar_label})
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Year')
    ax.set_ylabel('State')
    return pivot


fig, axes = plt.subplots(2, 2, figsize=(20, 16))

yield_heatmap_data = _plot_state_year_heatmap(
    'RICE YIELD (Kg per ha)', 'mean', axes[0, 0], 'YlOrRd',
    'Yield (Kg/ha)', 'Rice Yield Heatmap by State and Year')

production_heatmap_data = _plot_state_year_heatmap(
    'RICE PRODUCTION (1000 tons)', 'sum', axes[0, 1], 'Blues',
    'Production (1000 tons)', 'Rice Production Heatmap by State and Year')

area_heatmap_data = _plot_state_year_heatmap(
    'RICE AREA (1000 ha)', 'sum', axes[1, 0], 'Greens',
    'Area (1000 ha)', 'Rice Area Heatmap by State and Year')

# Pairwise correlation between the three rice metrics and the year.
correlation_data = df_geo[['RICE AREA (1000 ha)', 'RICE PRODUCTION (1000 tons)',
                           'RICE YIELD (Kg per ha)', 'Year']].corr()
sns.heatmap(correlation_data,
            annot=True,
            fmt='.3f',
            cmap='RdBu_r',
            center=0,  # centre the diverging colormap on zero correlation
            ax=axes[1, 1],
            cbar_kws={'label': 'Correlation Coefficient'})
axes[1, 1].set_title('Correlation Heatmap of Rice Metrics', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# Interactive Folium map: one circle per state, sized by total production and coloured
# by average yield.
m = folium.Map(
    location=[20.5937, 78.9629],
    zoom_start=4,
    min_zoom=3,
    max_zoom=10,
    tiles='OpenStreetMap'
)

# CircleMarker radii are in pixels. Production totals are therefore scaled into a fixed
# pixel range relative to the largest producer; dividing the raw total by a constant
# puts no upper bound on the circle size.
MIN_RADIUS_PX = 5
MAX_RADIUS_PX = 30
max_production = state_metrics_geo['RICE PRODUCTION (1000 tons)_sum'].max()

# Yield thresholds (Kg/ha) separating the three colour classes shown in the legend.
HIGH_YIELD_KG_HA = 2500
MEDIUM_YIELD_KG_HA = 2000

# States without coordinates cannot be placed on the map and are skipped.
states_on_map = state_metrics_geo.dropna(subset=['Latitude', 'Longitude'])

for _, row in states_on_map.iterrows():
    mean_yield = row['RICE YIELD (Kg per ha)_mean']
    production = row['RICE PRODUCTION (1000 tons)_sum']

    # NOTE(review): the "Districts" figure is 'Year_count', i.e. the number of rows for
    # the state - confirm that this really is a district count for this dataset.
    popup_text = f"""
        <b>{row['State Name']}</b><br>
        <b>Average Yield:</b> {mean_yield:.0f} Kg/ha<br>
        <b>Total Area:</b> {row['RICE AREA (1000 ha)_sum']:.0f} 1000 ha<br>
        <b>Total Production:</b> {production:.0f} 1000 tons<br>
        <b>Districts:</b> {row['Year_count']}<br>
        <b>Yield Std Dev:</b> {row['RICE YIELD (Kg per ha)_std']:.0f} Kg/ha
        """

    if mean_yield > HIGH_YIELD_KG_HA:
        color = 'green'
    elif mean_yield > MEDIUM_YIELD_KG_HA:
        color = 'orange'
    else:
        color = 'red'

    # Share of the largest state's production, mapped linearly onto the radius range.
    production_share = production / max_production if max_production > 0 else 0
    radius = MIN_RADIUS_PX + (MAX_RADIUS_PX - MIN_RADIUS_PX) * production_share

    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=radius,
        popup=folium.Popup(popup_text, max_width=300),
        color='black',
        weight=2,
        fillColor=color,
        fillOpacity=0.7,
        tooltip=f"{row['State Name']}: {mean_yield:.0f} Kg/ha"
    ).add_to(m)

# Fixed-position legend overlay. The box height is left to the content (a fixed 90px
# is too short for four lines), and the comparison signs are written as HTML entities.
legend_html = '''
<div style="position: fixed; 
     bottom: 50px; left: 50px; width: 200px; height: auto; 
     background-color: white; border:2px solid grey; z-index:9999; 
     font-size:14px; padding: 10px">
<p><b>Yield Legend:</b></p>
<p><i class="fa fa-circle" style="color:green"></i> High Yield (&gt;2500 Kg/ha)</p>
<p><i class="fa fa-circle" style="color:orange"></i> Medium Yield (2000-2500 Kg/ha)</p>
<p><i class="fa fa-circle" style="color:red"></i> Low Yield (&lt;2000 Kg/ha)</p>
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))

print("Interactive Folium Map created!")
print("Note: This map shows state-wise rice production data with:")
print("- Circle size represents total production")
print("- Color represents average yield")
print("- Click on markers for detailed information")
m


In [None]:
# Animated map of the mean rice yield per state, one frame per year.
#
# - The records are first aggregated to one row per (state, year); the raw frame holds
#   several rows for the same state and year.
# - Rows are sorted by year so the animation frames play in chronological order.
# - States are drawn as markers at their coordinates: locationmode='country names'
#   only resolves country names, so a choropleth keyed on Indian state names stays
#   empty unless a state-boundary GeoJSON is supplied.
yield_col = 'RICE YIELD (Kg per ha)'

state_year_yield = (
    df_geo
    .dropna(subset=['Latitude', 'Longitude'])
    .groupby(['State Name', 'Year'], as_index=False)
    .agg({yield_col: 'mean', 'Latitude': 'first', 'Longitude': 'first'})
    .dropna(subset=[yield_col])
    .sort_values('Year')
)

# Year range for the title is read from the data instead of being hard-coded.
first_year = df_geo['Year'].min()
last_year = df_geo['Year'].max()

fig = px.scatter_geo(
    state_year_yield,
    lat='Latitude',
    lon='Longitude',
    hover_name='State Name',
    color=yield_col,
    animation_frame='Year',
    color_continuous_scale="Viridis",
    # One fixed colour range for all frames so colours are comparable across years.
    range_color=(state_year_yield[yield_col].min(), state_year_yield[yield_col].max()),
    title=f"Rice Yield Evolution Across Indian States ({first_year:.0f}-{last_year:.0f})",
    labels={yield_col: 'Yield (Kg/ha)'}
)

# Enlarge the markers in the initial view and in every animation frame.
STATE_MARKER_PX = 14
fig.update_traces(marker=dict(size=STATE_MARKER_PX))
for frame in fig.frames:
    for trace in frame.data:
        trace.marker.size = STATE_MARKER_PX

fig.update_layout(
    geo=dict(
        scope='asia',
        center=dict(lat=20, lon=80),
        projection_scale=2,
        lonaxis=dict(range=[65, 95]),
        lataxis=dict(range=[5, 35])
    ),
    height=600
)

fig.show()

# Yield trend over time for the states with the largest total production.
# Five states are ranked, but only the first four are plotted (one per panel of the 2x2 grid).
production_by_state = df_geo.groupby('State Name')['RICE PRODUCTION (1000 tons)'].sum()
top_states = production_by_state.nlargest(5).index

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

# Per-year aggregation applied to each state's records.
yearly_aggregations = {
    'RICE YIELD (Kg per ha)': 'mean',
    'RICE PRODUCTION (1000 tons)': 'sum',
    'RICE AREA (1000 ha)': 'sum'
}

for ax, state in zip(axes, top_states[:4]):
    is_state = df_geo['State Name'] == state
    state_data = df_geo[is_state].groupby('Year').agg(yearly_aggregations).reset_index()

    ax.plot(state_data['Year'], state_data['RICE YIELD (Kg per ha)'],
            marker='o', linewidth=2, markersize=6, label='Yield')
    ax.set_title(f'{state} - Yield Trend', fontweight='bold')
    ax.set_xlabel('Year')
    ax.set_ylabel('Yield (Kg/ha)')
    ax.grid(True, alpha=0.3)
    ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Text summary of the state-level analysis: rankings, regional comparison, yield
# variability and production efficiency.

def _print_top_states(metric_col, value_format, n=5):
    """Print the `n` states with the largest `metric_col` and return those rows.

    Parameters
    ----------
    metric_col : str
        Column of `state_metrics_geo` to rank by.
    value_format : str
        `str.format` template for the value, e.g. "{:.0f} Kg/ha".
    n : int, default 5
        Number of states to list.

    Returns
    -------
    pandas.DataFrame
        The `n` top rows of `state_metrics_geo`, largest first.
    """
    top_rows = state_metrics_geo.nlargest(n, metric_col)
    for _, row in top_rows.iterrows():
        print(f"   {row['State Name']}: {value_format.format(row[metric_col])}")
    return top_rows


print("="*80)
print("GEOGRAPHICAL ANALYSIS SUMMARY")
print("="*80)

print("\n🏆 TOP PERFORMING STATES:")
print("-" * 40)

print("\n📈 Highest Average Yield (Kg/ha):")
top_yield_states = _print_top_states('RICE YIELD (Kg per ha)_mean', "{:.0f} Kg/ha")

print("\n🌾 Highest Total Production (1000 tons):")
top_production_states = _print_top_states('RICE PRODUCTION (1000 tons)_sum', "{:.0f} 1000 tons")

print("\n🗺️  Largest Cultivation Area (1000 ha):")
top_area_states = _print_top_states('RICE AREA (1000 ha)_sum', "{:.0f} 1000 ha")

print("\n🌍 GEOGRAPHICAL PATTERNS:")
print("-" * 40)

# Every state belongs to exactly one region, so regional totals do not double-count:
# 'Madhya Pradesh' is listed under Central only, and the north-eastern states that
# have coordinates but previously had no region get a region of their own.
north_states = ['Punjab', 'Haryana', 'Himachal Pradesh', 'Uttar Pradesh', 'Uttarakhand']
south_states = ['Tamil Nadu', 'Karnataka', 'Kerala', 'Andhra Pradesh', 'Telangana']
east_states = ['West Bengal', 'Odisha', 'Bihar', 'Jharkhand', 'Assam']
west_states = ['Maharashtra', 'Gujarat', 'Rajasthan']
central_states = ['Chhattisgarh', 'Madhya Pradesh']
northeast_states = ['Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Sikkim', 'Tripura']

regions = {
    'North': north_states,
    'South': south_states,
    'East': east_states,
    'West': west_states,
    'Central': central_states,
    'Northeast': northeast_states
}

for region, states in regions.items():
    region_data = state_metrics_geo[state_metrics_geo['State Name'].isin(states)]
    if region_data.empty:
        continue  # none of this region's states appear in the data
    # Unweighted mean of the per-state mean yields; production is summed.
    avg_yield = region_data['RICE YIELD (Kg per ha)_mean'].mean()
    total_production = region_data['RICE PRODUCTION (1000 tons)_sum'].sum()
    print(f"   {region} India: Avg Yield = {avg_yield:.0f} Kg/ha, Total Production = {total_production:.0f} 1000 tons")

print("\n📊 YIELD VARIABILITY ANALYSIS:")
print("-" * 40)
print("States with highest yield variability (std dev):")
high_variability = _print_top_states('RICE YIELD (Kg per ha)_std', "{:.0f} Kg/ha std dev")

# Production (1000 tons) over area (1000 ha) gives tons per hectare. A zero total area
# is treated as missing so it yields NaN instead of an infinite ratio that would top
# the ranking.
total_area = state_metrics_geo['RICE AREA (1000 ha)_sum']
state_metrics_geo['Efficiency'] = (
    state_metrics_geo['RICE PRODUCTION (1000 tons)_sum'] / total_area.where(total_area > 0)
)
print("\n⚡ MOST EFFICIENT STATES (Production/Area ratio):")
most_efficient = _print_top_states('Efficiency', "{:.2f} tons/ha")

print("\n" + "="*80)
print("GEOGRAPHICAL ANALYSIS COMPLETED!")
print("="*80)


In [None]:
# Installation hints and a recap of the geo-visualisation features, printed as a single
# message. Each entry becomes one output line; entries starting with "\n" are preceded
# by a blank line.
package_notes = [
    "📦 Required packages for geo-visualizations:",
    "-" * 50,
    "pip install plotly",
    "pip install folium",
    "pip install --upgrade matplotlib seaborn",
    "\n💡 Note: If you encounter any import errors, install the missing packages using the commands above.",
    "\n🎯 Geo-visualization features added:",
    "✅ Interactive choropleth maps",
    "✅ Scatter plot maps with bubble sizes",
    "✅ Geographical heatmaps",
    "✅ Interactive Folium maps",
    "✅ Temporal geographical analysis",
    "✅ Comprehensive geographical insights",
    "\n🚀 Your EDA notebook now includes comprehensive geographical analysis!",
]
print("\n".join(package_notes))


In [20]:
# State Name vs RICE AREA (1000 ha): bar height is seaborn's default estimate (the
# mean) of the area values per state; tick labels are turned vertical for legibility.
ax = sns.barplot(data=df, x='State Name', y='RICE AREA (1000 ha)')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()