In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load and Inspect the Data
# Load the dataset
df = pd.read_csv('climate_action_data.csv')

# Preview the dataset
print(df.head())
print(df.info())
print(df.describe())
print(df.shape) # Rows and columns in the dataset

In [None]:
#  2. Clean the Dataset
# Replace 'error' strings with NaN
df.replace('error', np.nan, inplace=True)

# Remove duplicates
df.drop_duplicates(inplace=True)

# Convert data types (try numeric conversion where possible)
df = df.apply(pd.to_numeric, errors='ignore')

# Check for missing values
print(df.isnull().sum())

# Handle missing values
# Fill missing numerical values with median (more robust than mean)
for col in df.select_dtypes(include=[np.number]).columns:
    df[col].fillna(df[col].median(), inplace=True)

# Drop rows with too many missing values (if any remain)
df.dropna(thresh=len(df.columns) - 2, inplace=True)  # keep rows with at least 8/10 fields

# Final check
print(df.info())


In [None]:
# Exploratory Data Analysis (EDA)
print(df.describe())


In [None]:
# Histograms for numeric variables
df.select_dtypes(include=[np.number]).hist(bins=20, figsize=(15, 10))
plt.tight_layout()
plt.show()



In [None]:
# Correlation heatmap

# Identify non-numeric columns that shouldn't be converted (e.g., sensor ID, crop type)
non_numeric_cols = ['sensor_id', 'crop_type']  # adjust based on your actual column names

# Convert only numeric-like columns to numeric, coercing errors into NaN
for col in df.columns:
    if col not in non_numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Variables influencing fertilizer recommendations
# Check for variables that might influence fertilizer recommendations
if 'Fertilizer_Recommended(kg/ha)' in df.columns:
    corr_with_fertilizer = df.corr()['Fertilizer_Recommended(kg/ha)'].sort_values(ascending=False)
    print("Top correlated features with Fertilizer_Recommended(kg/ha):")
    print(corr_with_fertilizer)


In [None]:
print(df[['Crop_Type','Soil_Moisture(%)']].head(10))
print(df['Soil_Moisture(%)'].isnull().sum())  # Number of missing values
print(df['Crop_Type'].isnull().sum())
print(df['Soil_Moisture(%)'].unique())



In [41]:
# Crop type with highest average soil moisture
if 'Crop_Type' in df.columns and 'Soil_Moisture(%)' in df.columns:
    top_crop = df.groupby('Crop_Type')['Soil_Moisture(%)'].mean().sort_values(ascending=False).idxmax()
    print(f"Crop type with highest average soil moisture: {top_crop}")


ValueError: attempt to get argmax of an empty sequence

In [34]:
# c. Irrigation recommendations for crops with temp > 30°C
if 'Temperature(C)' in df.columns and 'Soil_Moisture(%)' in df.columns and 'Crop_Type' in df.columns:
    hot_crops = df[df['Temperature(C)'] > 30]
    irrigation_recommendations = hot_crops.groupby('Crop_Type')['Soil_Moisture(%)'].mean().sort_values()

    print("Irrigation recommendation for high-temp crops (temp > 30°C):")
    print(irrigation_recommendations)

#     Recommendation:

# Crops experiencing average temps >30°C with low soil moisture need increased irrigation frequency.

# Consider drip irrigation systems for better efficiency.


Irrigation recommendation for high-temp crops (temp > 30°C):
Series([], Name: Soil_Moisture(%), dtype: float64)


In [42]:
# 5. Export Cleaned Dataset
df.to_csv('cleaned_precision_agriculture_data.csv', index=False)
