In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Opening Csv File from local Directory

In [3]:
# Load the dataset - using relative path that works on your Mac
df = pd.read_csv(r'/Users/lavanyalnair/Desktop/study/GSSOC-25/AI-agriculture-yield-production/Datasets/Crops_data.csv')
import os # Optional Step
print("Current Notebook Directory:", os.getcwd())
print("Files in current directory:", os.listdir())


Current Notebook Directory: /Users/manjushwarkhairkar/AI-agriculture-yield-production/Notebooks
Files in current directory: ['04_Evaluation_Deployment.ipynb', '03_modelling.ipynb', '02_eda.ipynb', '.ipynb_checkpoints', '01_Data_preprocessing.ipynb', '02_eda_2.ipynb']


In [10]:
df = pd.read_csv('../Datasets/rice_data_outlier_removed.csv')
print("Done")

Done


## Making of rice_data_outlier_removed.csv

In [None]:
sns.boxplot(x=df['RICE YIELD (Kg per ha)'])
plt.title("Boxplot of Rice Yield")
plt.show()

In [None]:
q1 = df['RICE YIELD (Kg per ha)'].quantile(0.25)
q3 = df['RICE YIELD (Kg per ha)'].quantile(0.75)
IQR = q3-q1
min_range = q1 - (1.5*IQR)
max_range = q3 + (1.5*IQR)
new_df = df[(df['RICE YIELD (Kg per ha)']<=max_range) & (df['RICE YIELD (Kg per ha)']>min_range)]

In [None]:
sns.boxplot(x=new_df['RICE YIELD (Kg per ha)'])
plt.title("Updated Boxplot of Rice Yield")
plt.show()

In [None]:
# Save the processed rice data using relative path
new_df.to_csv(r'C:\Users\devin\AI-agriculture-yield-prediction\Datasets\rice_data_outlier_removed.csv', index=False)

## Basic Preprocessing Steps

## 1. How big is the data?

In [9]:
df.shape

(2469, 8)

## 2. How does the data look like?

In [12]:
df.sample(3)

Unnamed: 0.1,Unnamed: 0,Year,State Name,Dist Name,RICE AREA (1000 ha),RICE PRODUCTION (1000 tons),RICE YIELD (Kg per ha),State_en
610,611,2013,Karnataka,Bidar,3.57,5.54,1551.82,8
1520,1535,2013,Uttar Pradesh,Budaun,95.23,211.3,2218.83,17
1552,1567,2013,Uttar Pradesh,Rampur,136.97,306.95,2241.0,17


## 3. What is the data type of cols?

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2469 entries, 0 to 2468
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   2469 non-null   int64  
 1   Year                         2469 non-null   int64  
 2   State Name                   2469 non-null   object 
 3   Dist Name                    2469 non-null   object 
 4   RICE AREA (1000 ha)          2469 non-null   float64
 5   RICE PRODUCTION (1000 tons)  2469 non-null   float64
 6   RICE YIELD (Kg per ha)       2469 non-null   float64
 7   State_en                     2469 non-null   int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 154.4+ KB


## 4. Are there any missing values?

In [14]:
df.isnull().sum()

Unnamed: 0                     0
Year                           0
State Name                     0
Dist Name                      0
RICE AREA (1000 ha)            0
RICE PRODUCTION (1000 tons)    0
RICE YIELD (Kg per ha)         0
State_en                       0
dtype: int64

## 5. How does the data look mathematically?

In [15]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Year,RICE AREA (1000 ha),RICE PRODUCTION (1000 tons),RICE YIELD (Kg per ha),State_en
count,2469.0,2469.0,2469.0,2469.0,2469.0,2469.0
mean,1244.662211,2013.495342,136.422572,342.785715,2062.814403,10.764682
std,718.015201,2.294663,166.189858,450.308119,1101.125888,5.403128
min,0.0,2010.0,0.0,0.0,0.0,0.0
25%,618.0,2011.0,8.76,15.05,1340.97,7.0
50%,1248.0,2013.0,79.57,176.0,2174.74,11.0
75%,1866.0,2016.0,207.74,495.69,2730.82,16.0
max,2483.0,2017.0,1154.23,3215.01,4816.27,19.0


## 6. Are there duplicate values?

In [16]:
df.duplicated().sum()

np.int64(0)

## 7. Is there any correlation present?

In [22]:
df.select_dtypes(include='int').dropna().corr()


Unnamed: 0.1,Unnamed: 0,Year,State_en
Unnamed: 0,1.0,0.004611,0.178891
Year,0.004611,1.0,-0.000137
State_en,0.178891,-0.000137,1.0


## 8. Pandas Profiling Code

In [None]:
from ydata_profiling import ProfileReport

prof = ProfileReport(df)
prof.to_file("pandas_profiling")