In [5]:
import pandas as pd

**Loading and Exploring the dataset**

In [11]:
file_path = 'data/raw/agri_app_base_dataset.xlsx'
data_weather_soil = pd.read_excel(file_path)

# Structural summary, first ten rows, and (rows, columns) of the loaded frame.
# info() writes its report to stdout and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
data_weather_soil.info()
print(data_weather_soil.head(10))
print(data_weather_soil.shape)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12803 entries, 0 to 12802
Columns: 107 entries, Dist Code to Autumn OCT-DEC WINDSPEED (Meter per second)
dtypes: float64(102), int64(3), object(2)
memory usage: 10.5+ MB
None
   Dist Code  Year  State Code    State Name Dist Name  RICE AREA (1000 ha)  \
0          1  1990          14  Chhattisgarh      Durg           397.899994   
1          1  1991          14  Chhattisgarh      Durg           393.200012   
2          1  1992          14  Chhattisgarh      Durg           398.399994   
3          1  1993          14  Chhattisgarh      Durg           410.200012   
4          1  1994          14  Chhattisgarh      Durg           430.100006   
5          1  1995          14  Chhattisgarh      Durg           424.000000   
6          1  1996          14  Chhattisgarh      Durg           407.100006   
7          1  1997          14  Chhattisgarh      Durg           432.799988   
8          1  1998          14  Chhattisgarh      Durg          

In [12]:
# Print the frame's index range, column count, dtype breakdown and memory usage.
data_weather_soil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12803 entries, 0 to 12802
Columns: 107 entries, Dist Code to Autumn OCT-DEC WINDSPEED (Meter per second)
dtypes: float64(102), int64(3), object(2)
memory usage: 10.5+ MB


In [18]:
# Report how many null entries each column holds.
# The counts describe the frame as it is when this cell executes.
print("\nMissing Values:", data_weather_soil.isna().sum(), sep="\n")



Missing Values:
Dist Code                                      0
Year                                           0
State Code                                     0
State Name                                     0
Dist Name                                      0
                                              ..
DEC WINDSPEED (Meter per second)               0
Winter JAN-FEB WINDSPEED (Meter per second)    0
Summer MAR-MAY WINDSPEED (Meter per second)    0
Rainy JUN-SEP WINDSPEED (Meter per second)     0
Autumn OCT-DEC WINDSPEED (Meter per second)    0
Length: 107, dtype: int64


In [14]:
# Per-column null counts for the whole frame.
missing_values = data_weather_soil.isna().sum()

# Keep only the columns where at least one value is null.
missing_columns = missing_values.loc[missing_values.gt(0)]
print("Columns with Missing Values:", missing_columns, sep="\n")

Columns with Missing Values:
RICE AREA (1000 ha)                             52
RICE PRODUCTION (1000 tons)                     50
RICE YIELD (Kg per ha)                          52
PEARL MILLET AREA (1000 ha)                    688
PEARL MILLET PRODUCTION (1000 tons)            684
                                              ... 
DEC WINDSPEED (Meter per second)               697
Winter JAN-FEB WINDSPEED (Meter per second)    697
Summer MAR-MAY WINDSPEED (Meter per second)    697
Rainy JUN-SEP WINDSPEED (Meter per second)     697
Autumn OCT-DEC WINDSPEED (Meter per second)    697
Length: 102, dtype: int64


In [15]:
# Column labels grouped by dtype: int64/float64 are treated as numerical,
# object as categorical.
numerical_cols = data_weather_soil.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data_weather_soil.select_dtypes(include=['object']).columns

# Within each group, keep the labels that also appear among the columns with
# missing values (original column order is preserved by the boolean mask).
missing_numerical_cols = numerical_cols[numerical_cols.isin(missing_columns.index)].tolist()
missing_categorical_cols = categorical_cols[categorical_cols.isin(missing_columns.index)].tolist()

print("\nNumerical Columns with Missing Values:", missing_numerical_cols, sep="\n")
print("\nCategorical Columns with Missing Values:", missing_categorical_cols, sep="\n")


Numerical Columns with Missing Values:
['RICE AREA (1000 ha)', 'RICE PRODUCTION (1000 tons)', 'RICE YIELD (Kg per ha)', 'PEARL MILLET AREA (1000 ha)', 'PEARL MILLET PRODUCTION (1000 tons)', 'PEARL MILLET YIELD (Kg per ha)', 'CHICKPEA AREA (1000 ha)', 'CHICKPEA PRODUCTION (1000 tons)', 'CHICKPEA YIELD (Kg per ha)', 'GROUNDNUT AREA (1000 ha)', 'GROUNDNUT PRODUCTION (1000 tons)', 'GROUNDNUT YIELD (Kg per ha)', 'SUGARCANE AREA (1000 ha)', 'SUGARCANE PRODUCTION (1000 tons)', 'SUGARCANE YIELD (Kg per ha)', 'GROSS CROPPED AREA (1000 ha)', 'NITROGEN CONSUMPTION (tons)', 'PHOSPHATE CONSUMPTION (tons)', 'POTASH CONSUMPTION (tons)', 'TOTAL FERTILISER CONSUMPTION (tons)', 'TOTAL AGRICULTURAL LABOUR POPULATION (1000 Number)', 'GROSS IRRIGATED AREA (1000 ha)', 'JANUARY MAXIMUM TEMPERATURE (Centigrate)', 'FEBRUARY MAXIMUM TEMPERATURE (Centigrate)', 'MARCH MAXIMUM TEMPERATURE (Centigrate)', 'APRIL MAXIMUM TEMPERATURE (Centigrate)', 'MAY MAXIMUM TEMPERATURE (Centigrate)', 'JUNE MAXIMUM TEMPERATURE (Ce

In [16]:
# Fill missing numerical values with the column mean.
# The source of both the values and the mean is data_weather_soil itself (the
# frame loaded in this notebook), so the cell runs on a fresh kernel and never
# depends on some other frame left over in memory.
for col in missing_numerical_cols:
    # Compute the mean once, before filling; mean() ignores nulls by default.
    col_mean = data_weather_soil[col].mean()
    data_weather_soil[col] = data_weather_soil[col].fillna(col_mean)
    print(f"Filled missing values in numerical column '{col}' with mean: {col_mean}")


# Check for any remaining missing values (e.g. in non-numerical columns).
remaining_missing = data_weather_soil.isnull().sum().sum()
if remaining_missing == 0:
    print("\nAll missing values have been handled.")
else:
    print(f"\nThere are still {remaining_missing} missing values remaining in the dataset.")



Filled missing values in numerical column 'RICE AREA (1000 ha)' with mean: 84.99034503151151
Filled missing values in numerical column 'RICE PRODUCTION (1000 tons)' with mean: 176.83728531729054
Filled missing values in numerical column 'RICE YIELD (Kg per ha)' with mean: 1808.7009646302251
Filled missing values in numerical column 'PEARL MILLET AREA (1000 ha)' with mean: 19.946713179049016
Filled missing values in numerical column 'PEARL MILLET PRODUCTION (1000 tons)' with mean: 17.231978708262144
Filled missing values in numerical column 'PEARL MILLET YIELD (Kg per ha)' with mean: 619.3305266633647
Filled missing values in numerical column 'CHICKPEA AREA (1000 ha)' with mean: 14.84228212201273
Filled missing values in numerical column 'CHICKPEA PRODUCTION (1000 tons)' with mean: 12.173475906547562
Filled missing values in numerical column 'CHICKPEA YIELD (Kg per ha)' with mean: 673.0615288023859
Filled missing values in numerical column 'GROUNDNUT AREA (1000 ha)' with mean: 13.867819

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Kept as the cell's last bare expression so the notebook renders the resulting table.
data_weather_soil.describe()

Unnamed: 0,Dist Code,Year,State Code,RICE AREA (1000 ha),RICE PRODUCTION (1000 tons),RICE YIELD (Kg per ha),PEARL MILLET AREA (1000 ha),PEARL MILLET PRODUCTION (1000 tons),PEARL MILLET YIELD (Kg per ha),CHICKPEA AREA (1000 ha),...,JULY WINDSPEED (Meter per second),AUG WINDSPEED (Meter per second),SEPT WINDSPEED (Meter per second),OCT WINDSPEED (Meter per second),NOV WINDSPEED (Meter per second),DEC WINDSPEED (Meter per second),Winter JAN-FEB WINDSPEED (Meter per second),Summer MAR-MAY WINDSPEED (Meter per second),Rainy JUN-SEP WINDSPEED (Meter per second),Autumn OCT-DEC WINDSPEED (Meter per second)
count,12803.0,12803.0,12803.0,12803.0,12803.0,12803.0,12803.0,12803.0,12803.0,12803.0,...,12803.0,12803.0,12803.0,12803.0,12803.0,12803.0,12803.0,12803.0,12803.0,12803.0
mean,496.803171,2003.263298,9.536046,84.990345,176.837285,1808.700965,19.946713,17.231979,619.330527,14.842282,...,2.025649,1.658908,1.373259,0.975569,0.864937,0.879663,1.176386,1.688524,1.80176,0.906723
std,434.593477,7.378508,5.065391,100.211179,246.699351,1046.462245,68.690323,51.835255,687.685599,33.226472,...,0.756553,0.664336,0.561136,0.402484,0.456622,0.494653,0.422095,0.387907,0.619899,0.428017
min,1.0,1990.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.58,0.343,0.321,0.21,0.013,0.127,0.267,0.681,0.5525,0.171
25%,126.0,1997.0,6.0,11.065,14.965,1061.0,0.0,0.0,0.0,0.05,...,1.435,1.163,0.946,0.6905,0.548,0.544,0.8725,1.412667,1.316,0.604333
50%,507.0,2004.0,10.0,57.689999,95.650002,1757.0,0.1,0.09,516.0,1.0,...,1.88,1.528,1.28,0.897,0.746,0.748,1.0905,1.659667,1.72175,0.793333
75%,823.0,2010.0,13.0,120.25,237.0,2456.0,9.6,8.38,1005.5,12.25,...,2.451,2.011,1.669,1.144,1.0315,1.043,1.3785,1.910667,2.157125,1.073667
max,2060.0,2015.0,20.0,918.599976,2145.290039,9750.0,1032.51001,826.820007,6316.0,545.289978,...,5.608,4.666,4.092,3.403,3.727,4.269,3.8835,3.693333,4.643,3.343667


**Exploring the multi-crop produce dataset**

In [None]:
file_path = 'data/raw/district_crop_produce.csv'
data_crop_produce = pd.read_csv(file_path)

# Structural summary, first ten rows, and (rows, columns) of the loaded frame.
# info() writes its report to stdout and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
data_crop_produce.info()
print(data_crop_produce.head(10))
print(data_crop_produce.shape)
