In [2]:
import numpy as np
import pandas as pd

In [12]:
# Load the crop production records and shorten 'District_Name' to 'District'
# in a single chained expression.
crop_df = (
    pd.read_csv("./dataset/data.csv")
    .rename(columns={'District_Name': 'District'})
)
crop_df

Unnamed: 0,State_Name,District,Crop_Year,Season,Crop,Area,Production
0,Andhra Pradesh,ANANTAPUR,1997,Kharif,Arhar/Tur,21400,2600.0
1,Andhra Pradesh,ANANTAPUR,1997,Kharif,Bajra,1400,500.0
2,Andhra Pradesh,ANANTAPUR,1997,Kharif,Castor seed,1000,100.0
3,Andhra Pradesh,ANANTAPUR,1997,Kharif,Cotton(lint),7300,9400.0
4,Andhra Pradesh,ANANTAPUR,1997,Kharif,Dry chillies,3700,7100.0
...,...,...,...,...,...,...,...
9623,Andhra Pradesh,WEST GODAVARI,2014,Rabi,Tobacco,28046,89107.0
9624,Andhra Pradesh,WEST GODAVARI,2014,Rabi,Tomato,224,3035.0
9625,Andhra Pradesh,WEST GODAVARI,2014,Rabi,Urad,5418,4833.0
9626,Andhra Pradesh,WEST GODAVARI,2014,Whole Year,Coconut,21729,718991000.0


In [13]:
# Load the weather observations (Date, Rainfall_mm, Temp_Max_C, Temp_Min_C,
# District) from the dataset folder.
weather_df = pd.read_csv(filepath_or_buffer="./dataset/weather.csv")
weather_df

Unnamed: 0,Date,Rainfall_mm,Temp_Max_C,Temp_Min_C,District
0,1997-01-01,0.0,26.7,14.6,ANANTAPUR
1,1997-01-02,0.0,27.8,15.6,ANANTAPUR
2,1997-01-03,0.0,28.7,15.2,ANANTAPUR
3,1997-01-04,0.0,28.9,17.0,ANANTAPUR
4,1997-01-05,0.0,28.1,17.0,ANANTAPUR
...,...,...,...,...,...
85457,2014-12-27,0.0,28.5,18.0,WEST GODAVARI
85458,2014-12-28,0.0,28.2,20.3,WEST GODAVARI
85459,2014-12-29,18.2,24.9,19.5,WEST GODAVARI
85460,2014-12-30,13.4,27.5,20.4,WEST GODAVARI


In [14]:
# Convert the 'Date' column to datetime values so that date components can be
# extracted with the .dt accessor.
weather_df = weather_df.assign(Date=pd.to_datetime(weather_df['Date']))

In [15]:
# Derive the calendar year and month number of every observation from 'Date'.
weather_df = weather_df.assign(
    Year=lambda frame: frame['Date'].dt.year,
    Month=lambda frame: frame['Date'].dt.month,
)

In [16]:
# Maps a month number to one of the agricultural season labels used in India.
def get_season(month):
    """Return the season label for a month number.

    Args:
        month: Month number (1-12 for valid calendar months).

    Returns:
        'Kharif' when month is between 6 and 10 inclusive,
        'Rabi' when month is 11 or greater, or 3 or less,
        'Summer' for anything else (months 4 and 5 among valid months).
    """
    is_kharif_month = 6 <= month <= 10
    is_rabi_month = month <= 3 or month >= 11

    if is_kharif_month:
        return 'Kharif'
    return 'Rabi' if is_rabi_month else 'Summer'

# Label every observation with its season by mapping get_season over 'Month'.
weather_df['Season'] = weather_df['Month'].map(get_season)

In [17]:
# Per-column reducers: rainfall is summed, the two temperature columns are averaged.
aggregations = dict(
    Rainfall_mm='sum',
    Temp_Max_C='mean',
    Temp_Min_C='mean',
)

# Collapse the observations to one row per (District, Year, Season).
# NOTE(review): get_season labels months 11-12 and 1-3 as 'Rabi', and 'Year' is
# the calendar year of each date, so a (Year, 'Rabi') group combines
# January-March with November-December of the same calendar year. Confirm this
# matches how the crop data defines the Rabi season for a given year.
seasonal_weather_df = (
    weather_df
    .groupby(['District', 'Year', 'Season'], as_index=False)
    .agg(aggregations)
)

print("--- Aggregated Seasonal Weather Data ---", seasonal_weather_df.head(), sep="\n")

--- Aggregated Seasonal Weather Data ---
    District  Year  Season  Rainfall_mm  Temp_Max_C  Temp_Min_C
0  ANANTAPUR  1997  Kharif        229.6   33.111765   23.916340
1  ANANTAPUR  1997    Rabi         76.9   31.596689   20.336424
2  ANANTAPUR  1997  Summer         19.2   37.562295   25.431148
3  ANANTAPUR  1998  Kharif        522.2   31.715033   23.929412
4  ANANTAPUR  1998    Rabi         31.8   32.294040   20.737748


In [21]:
# Attach the seasonal weather aggregates to every crop record.

# Rename 'Crop_Year' in the crop data to 'Year' to match the weather data.
crop_df = crop_df.rename(columns={'Crop_Year': 'Year'})

# Normalise the join keys: upper-case district names on both sides and strip
# leading/trailing whitespace from the crop season labels.
crop_df['District'] = crop_df['District'].str.upper()
crop_df['Season'] = crop_df['Season'].str.strip()
seasonal_weather_df['District'] = seasonal_weather_df['District'].str.upper()

merge_keys = ['District', 'Year', 'Season']

# validate='many_to_one' makes pandas raise a MergeError if a key combination
# occurs more than once in the weather table; without it such duplicates would
# silently multiply crop rows.
final_df = pd.merge(
    crop_df,
    seasonal_weather_df,
    on=merge_keys,
    how='inner',
    validate='many_to_one',
).drop(columns='State_Name')

# The inner join discards every crop row that has no weather match (for
# example season labels such as 'Whole Year', which get_season never
# produces). Report the loss instead of dropping those rows silently.
dropped_rows = len(crop_df) - len(final_df)
if dropped_rows:
    unmatched_seasons = sorted(
        set(crop_df['Season'].dropna()) - set(seasonal_weather_df['Season'])
    )
    print(
        f"Dropped {dropped_rows} of {len(crop_df)} crop rows with no matching "
        f"weather record; crop seasons absent from the weather data: "
        f"{unmatched_seasons}"
    )

final_df

Unnamed: 0,District,Year,Season,Crop,Area,Production,Rainfall_mm,Temp_Max_C,Temp_Min_C
0,ANANTAPUR,1997,Kharif,Arhar/Tur,21400,2600.0,229.6,33.111765,23.916340
1,ANANTAPUR,1997,Kharif,Bajra,1400,500.0,229.6,33.111765,23.916340
2,ANANTAPUR,1997,Kharif,Castor seed,1000,100.0,229.6,33.111765,23.916340
3,ANANTAPUR,1997,Kharif,Cotton(lint),7300,9400.0,229.6,33.111765,23.916340
4,ANANTAPUR,1997,Kharif,Dry chillies,3700,7100.0,229.6,33.111765,23.916340
...,...,...,...,...,...,...,...,...,...
7743,WEST GODAVARI,2014,Rabi,Sesamum,727,137.0,67.7,30.025166,20.817881
7744,WEST GODAVARI,2014,Rabi,Sunflower,335,323.0,67.7,30.025166,20.817881
7745,WEST GODAVARI,2014,Rabi,Tobacco,28046,89107.0,67.7,30.025166,20.817881
7746,WEST GODAVARI,2014,Rabi,Tomato,224,3035.0,67.7,30.025166,20.817881


In [22]:
# Write the merged table to disk without the index column, then print a short
# preview followed by the row count.
final_df.to_csv(path_or_buf='./dataset/final_data.csv', index=False)

print(
    "\n--- Final Merged Dataset ---",
    final_df.head(),
    f"\nSuccessfully created the final merged dataset with {len(final_df)} rows!",
    sep="\n",
)


--- Final Merged Dataset ---
    District  Year  Season          Crop   Area  Production  Rainfall_mm  \
0  ANANTAPUR  1997  Kharif     Arhar/Tur  21400      2600.0        229.6   
1  ANANTAPUR  1997  Kharif         Bajra   1400       500.0        229.6   
2  ANANTAPUR  1997  Kharif   Castor seed   1000       100.0        229.6   
3  ANANTAPUR  1997  Kharif  Cotton(lint)   7300      9400.0        229.6   
4  ANANTAPUR  1997  Kharif  Dry chillies   3700      7100.0        229.6   

   Temp_Max_C  Temp_Min_C  
0   33.111765    23.91634  
1   33.111765    23.91634  
2   33.111765    23.91634  
3   33.111765    23.91634  
4   33.111765    23.91634  

Successfully created the final merged dataset with 7748 rows!
