In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **Loading and Exploring the soil and weather dataset**

In [None]:
file_path = 'data/raw/agri_app_base_dataset.xlsx'
data_weather_soil = pd.read_excel(file_path)

# Overview of the raw frame: schema, first ten rows, and (rows, columns).
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data_weather_soil.info()
print(data_weather_soil.head(10))
print(data_weather_soil.shape)


In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame
data_weather_soil.info()

**Handling missing values**

In [18]:
# Report how many null entries each column contains
print("\nMissing Values:", data_weather_soil.isna().sum(), sep="\n")



Missing Values:
Dist Code                                      0
Year                                           0
State Code                                     0
State Name                                     0
Dist Name                                      0
                                              ..
DEC WINDSPEED (Meter per second)               0
Winter JAN-FEB WINDSPEED (Meter per second)    0
Summer MAR-MAY WINDSPEED (Meter per second)    0
Rainy JUN-SEP WINDSPEED (Meter per second)     0
Autumn OCT-DEC WINDSPEED (Meter per second)    0
Length: 107, dtype: int64


In [14]:
# Per-column count of null entries
missing_values = data_weather_soil.isna().sum()

# Keep only the columns in which at least one value is absent
missing_columns = missing_values.loc[missing_values.gt(0)]
print("Columns with Missing Values:", missing_columns, sep="\n")

Columns with Missing Values:
RICE AREA (1000 ha)                             52
RICE PRODUCTION (1000 tons)                     50
RICE YIELD (Kg per ha)                          52
PEARL MILLET AREA (1000 ha)                    688
PEARL MILLET PRODUCTION (1000 tons)            684
                                              ... 
DEC WINDSPEED (Meter per second)               697
Winter JAN-FEB WINDSPEED (Meter per second)    697
Summer MAR-MAY WINDSPEED (Meter per second)    697
Rainy JUN-SEP WINDSPEED (Meter per second)     697
Autumn OCT-DEC WINDSPEED (Meter per second)    697
Length: 102, dtype: int64


In [None]:
# Partition the column labels by dtype: 64-bit numerics vs. object (text) columns
numerical_cols = data_weather_soil.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data_weather_soil.select_dtypes(include=['object']).columns

# Labels of every column that has at least one missing value
incomplete_labels = set(missing_columns.index)

# Intersect each dtype group with the incomplete columns, preserving column order
missing_numerical_cols = [name for name in numerical_cols if name in incomplete_labels]
missing_categorical_cols = [name for name in categorical_cols if name in incomplete_labels]

for heading, names in (
    ("\nNumerical Columns with Missing Values:", missing_numerical_cols),
    ("\nCategorical Columns with Missing Values:", missing_categorical_cols),
):
    print(heading)
    print(names)

In [None]:
# Fill missing numerical values with the column mean.
# The mean is computed once per column, before filling, so the value printed
# is exactly the value used for imputation.
for col in missing_numerical_cols:
    col_mean = data_weather_soil[col].mean()
    data_weather_soil[col] = data_weather_soil[col].fillna(col_mean)
    print(f"Filled missing values in numerical column '{col}' with mean: {col_mean}")


# Check for any remaining missing values.
# Only the numerical columns listed above are imputed, so nulls in other
# columns (or in a column whose mean is NaN because it is entirely empty)
# are still counted here.
remaining_missing = data_weather_soil.isnull().sum().sum()
if remaining_missing == 0:
    print("\nAll missing values have been handled.")
else:
    print(f"\nThere are still {remaining_missing} missing values remaining in the dataset.")



**Handling outliers**

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
soil_weather_summary = data_weather_soil.describe()
print("Summary statistics of numerical columns:", soil_weather_summary, sep="\n")

In [None]:
# Draw one boxplot per 64-bit numeric column to eyeball potential outliers
soil_numeric_columns = data_weather_soil.select_dtypes(include=['float64', 'int64']).columns
for column_name in soil_numeric_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=data_weather_soil[column_name], ax=ax)
    ax.set_title(f'Boxplot for {column_name}')
    plt.show()

In [None]:
# Detect and handle outliers using the IQR method
def remove_outliers_iqr(data, column, multiplier=1.5):
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of ``data[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    column : label
        Name of the numeric column the fences are computed on.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of rows inside the fences, with their original index labels.

    Notes
    -----
    Rows whose ``column`` value is NaN fail both comparisons and are therefore
    dropped as well.
    """
    q1, q3 = data[column].quantile([0.25, 0.75])  # first and third quartile
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # Series.between is inclusive on both ends by default
    return data[data[column].between(lower_bound, upper_bound)]

# Apply IQR-based outlier removal to the numerical measurement columns.
# Identifier-like columns may have a numeric dtype but are not measurements:
# filtering on them would drop rows because of their district/state code or
# year rather than because of an anomalous reading, so they are skipped.
id_like_cols = {'Dist Code', 'Year', 'State Code'}
outlier_cols = [
    col
    for col in data_weather_soil.select_dtypes(include=['float64', 'int64']).columns
    if col not in id_like_cols
]

# The filter is cumulative: each column's fences are computed on the rows that
# survived the previous columns. The cell rebinds data_weather_soil, so
# re-running it trims the data further.
for col in outlier_cols:
    original_size = data_weather_soil.shape[0]
    data_weather_soil = remove_outliers_iqr(data_weather_soil, col)
    new_size = data_weather_soil.shape[0]
    print(f"Removed {original_size - new_size} outliers from {col}.")


In [26]:
# Persist the cleaned soil/weather frame as an Excel workbook, without the row index
cleaned_path = 'data/cleaned/cleaned_soil_weather_data.xlsx'
data_weather_soil.to_excel(excel_writer=cleaned_path, index=False)
print(f"Cleaned soil and weather data saved to {cleaned_path}")


Cleaned soil and weather data saved to data/cleaned/cleaned_soil_weather_data.xlsx


# **Loading & Exploring the district crop produce dataset**

In [None]:
file_path = 'data/raw/district_crop_produce.csv'
data_crop_produce = pd.read_csv(file_path)

# Overview of the raw frame: schema, first ten rows, and (rows, columns).
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data_crop_produce.info()
print(data_crop_produce.head(10))
print(data_crop_produce.shape)


In [None]:
# Report how many null entries each crop-produce column contains
print("\nMissing Values:", data_crop_produce.isna().sum(), sep="\n")


In [None]:
# Descriptive statistics as produced by DataFrame.describe()
crop_summary = data_crop_produce.describe()
print("Summary statistics of numerical columns:", crop_summary, sep="\n")

In [None]:
# Draw one boxplot per 64-bit numeric column to eyeball potential outliers
crop_numeric_columns = data_crop_produce.select_dtypes(include=['float64', 'int64']).columns
for column_name in crop_numeric_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=data_crop_produce[column_name], ax=ax)
    ax.set_title(f'Boxplot for {column_name}')
    plt.show()

In [None]:
# Run every 64-bit numeric column of the crop data through the IQR filter, one
# after another; each pass works on the rows left over from the previous pass.
# NOTE(review): this includes any numeric identifier columns the file may have —
# confirm against the CSV schema whether those should be filtered.
crop_outlier_columns = data_crop_produce.select_dtypes(include=['float64', 'int64']).columns
for col_name in crop_outlier_columns:
    rows_before = len(data_crop_produce)
    data_crop_produce = remove_outliers_iqr(data_crop_produce, col_name)
    rows_after = len(data_crop_produce)
    print(f"Removed {rows_before - rows_after} outliers from {col_name}.")


In [None]:
# Persist the cleaned crop produce frame (the frame cleaned in this section)
# as an Excel workbook, without the row index.
# NOTE(review): the file name says "weather" and contains a double underscore;
# it is kept unchanged so anything already reading this path keeps working.
# Rename it together with its consumers.
cleaned_path = 'data/cleaned/cleaned__weather_data.xlsx'
data_crop_produce.to_excel(cleaned_path, index=False)
print(f"Cleaned crop produce data saved to {cleaned_path}")
