# 1_EDA_master.ipynb

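Master exploratory data analysis (EDA) notebook for `merged_final.csv`. It imports shared helpers from `utils.py`; run the cells top to bottom on a fresh kernel.

All figures and summary tables are saved to `eda_outputs/`.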

In [None]:
# Imports & config
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import read_csv_fallback, normalize_columns, ensure_month_features, ensure_season_features, detect_target_column, basic_checks

# Plot styling
sns.set(style='whitegrid')

# Project paths: everything is resolved relative to the current working directory.
ROOT = Path.cwd()
CSV = ROOT.joinpath('merged_final.csv')
OUT = ROOT.joinpath('eda_outputs')
OUT.mkdir(exist_ok=True)  # no error if the output folder already exists

# Load: read the merged CSV, then normalise its column names.
# (Both helpers live in utils.py; their exact behaviour is not visible in this notebook.)
df = normalize_columns(read_csv_fallback(CSV))
print('Loaded', CSV, 'shape:', df.shape)

# Quick sanity report produced by the shared helper.
print('\nBasic checks:')
print(basic_checks(df))


## Prepare month & season features (non-destructive)

In [None]:
# Add the month features first, then the season features (both helpers come from utils.py).
df = ensure_season_features(ensure_month_features(df))

# Preview up to ten distinct non-null values of each derived numeric column.
for label, column in (('Month_num sample:', 'Month_num'), ('Season_num sample:', 'Season_num')):
    print(label, df[column].dropna().unique()[:10])


## Target detection & summary

In [None]:
# Ask the shared helper which column holds the target variable.
target = detect_target_column(df)

# Every later cell indexes df[target], so stop here if nothing was detected.
if target is None:
    raise KeyError('Target (soil moisture) not found')

print('Target column:', target)

# Summary statistics of the target column.
target_summary = df[target].describe()
print(target_summary)


## Distributions & skewness

In [None]:
import os  # not used in this cell; kept in case other cells rely on it


def _save_histogram(values, title, filename):
    """Draw a 60-bin histogram of `values`, save it under OUT as `filename`, then show it."""
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(values, bins=60)
    ax.set_title(title)
    fig.savefig(OUT / filename, dpi=150)
    plt.show()


# Distribution of the raw target (missing values removed)
_save_histogram(df[target].dropna(), 'Target distribution', 'target_distribution.png')

print('Skewness:', df[target].skew())

# Log1p preview: negative values are clipped to 0 before the transform
log_target = np.log1p(df[target].clip(lower=0)).dropna()
_save_histogram(log_target, 'Log1p target distribution', 'target_log1p.png')


## Season-wise, Month-wise, State-wise analysis

In [None]:
# Season boxplot
# Preferred left-to-right order of the season labels on the x-axis.
order = ['Monsoon','Post-monsoon','Winter','Summer']
season_labels = list(df['Season'].dropna().unique())

# Keep only the preferred labels that actually occur in the data ...
existing = [s for s in order if s in season_labels]
# ... and append any label spelled differently from the preferred list, so a
# season present in the data is never silently dropped from the plot.
existing += sorted((s for s in season_labels if s not in order), key=str)

fig, ax = plt.subplots(figsize=(8,4))
# `existing or None`: with no usable label, fall back to seaborn's default order
# instead of passing an empty list.
sns.boxplot(data=df, x='Season', y=target, order=existing or None, ax=ax)
ax.set_title('Season-wise target')
fig.savefig(OUT/'season_boxplot.png', dpi=150)
plt.show()

# Month trend by year
if 'Year' in df.columns:
    # Mean target per (month, year) pair, flattened back to ordinary columns.
    month_mean = df.groupby(['Month_num','Year'])[target].mean().reset_index()

    # Year is numeric, so seaborn would otherwise colour it with a continuous
    # ramp and may abbreviate the legend. A list palette with one colour per
    # year plus legend='full' gives every year its own colour and legend entry.
    n_years = month_mean['Year'].nunique()
    year_palette = sns.color_palette('tab10' if n_years <= 10 else 'husl', n_colors=n_years)

    fig, ax = plt.subplots(figsize=(10,5))
    sns.lineplot(
        data=month_mean,
        x='Month_num',
        y=target,
        hue='Year',
        marker='o',
        palette=year_palette,
        legend='full',
        ax=ax,
    )
    ax.set_xticks(range(1,13))  # one tick per calendar month
    ax.set_title('Monthly mean by year')
    fig.savefig(OUT/'monthly_by_year.png', dpi=150)
    plt.show()

# State mean
# Mean target per state, sorted ascending so the bars read low -> high.
# `state_mean` is reused by the summary-saving cell at the end of the notebook.
state_mean = df.groupby('state_name')[target].mean().sort_values()

fig, ax = plt.subplots(figsize=(12,5))
state_mean.plot(kind='bar', ax=ax)
ax.set_title('Mean by state')
ax.set_ylabel(f'Mean {target}')
# The rotated state-name tick labels can extend past the default bottom margin
# and be cut off in the saved PNG; tight_layout makes room for them.
fig.tight_layout()
fig.savefig(OUT/'state_mean.png', dpi=150)
plt.show()


## Correlation & pairplot (sample)

In [None]:
# Pairwise correlations between all numeric columns
num_df = df.select_dtypes(include=[np.number])
corr_matrix = num_df.corr()

# Annotated heatmap (two decimals per cell) on a diverging colour map
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation matrix')
fig.savefig(OUT/'correlation_matrix.png', dpi=150)
plt.show()

# pairplot sample
# Use whichever of the calendar columns exist, plus the target.
cols_for_pair = [c for c in ['Year','Month_num','Day', target] if c in df.columns]
if len(cols_for_pair) >= 2:
    # Drop incomplete rows first, then cap the sample size by the rows that
    # are actually left. Capping by len(df) instead makes .sample() raise
    # ValueError whenever NaNs leave fewer rows than requested.
    pair_df = df[cols_for_pair].dropna()
    n_pair = min(2000, len(pair_df))
    if n_pair > 0:
        # Fixed seed so a re-run reproduces the same figure.
        sns.pairplot(pair_df.sample(n_pair, random_state=42))
        # pairplot creates its own figure, which is the current one here.
        plt.savefig(OUT/'pairplot_sample.png', dpi=150)
        plt.show()
    else:
        print('Skipping pairplot: no complete rows for', cols_for_pair)


## Time-series overview (global)

In [None]:
# Parse the raw date column when present; unparseable entries become NaT.
if 'date' in df.columns:
    df['date_parsed'] = pd.to_datetime(df['date'], errors='coerce')

# Plot only if there is a parsed-date column with at least one valid timestamp.
has_dates = 'date_parsed' in df.columns and df['date_parsed'].notna().any()

if not has_dates:
    print('No date_parsed available')
else:
    # Mean of the target per calendar month ('MS' = month-start bins), over all rows.
    ts = (
        df.set_index('date_parsed')
        .sort_index()[target]
        .resample('MS')
        .mean()
    )
    fig, ax = plt.subplots(figsize=(12,4))
    ts.plot(ax=ax)
    ax.set_title('Monthly avg over time')
    fig.savefig(OUT/'timeseries_monthly_avg.png', dpi=150)
    plt.show()


## Save small summaries

In [None]:
# Persist the per-state means (computed in the state-wise analysis cell) as a flat table.
state_summary = state_mean.reset_index()
state_summary.to_csv(OUT/'state_mean_summary.csv', index=False)

# Persist the per-month means when the month feature is available.
if 'Month_num' in df.columns:
    month_summary = df.groupby('Month_num')[target].mean().reset_index()
    month_summary.to_csv(OUT/'month_mean_summary.csv', index=False)

print('Saved summaries to', OUT)
