## Data Processing

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the dataset into a DataFrame; the relative path is resolved against the current working directory.
df = pd.read_csv("farmer_advisor_dataset.csv")

In [4]:
df.head()  # preview the first five rows (the default for head())

Unnamed: 0,Farm_ID,Soil_pH,Soil_Moisture,Temperature_C,Rainfall_mm,Crop_Type,Fertilizer_Usage_kg,Pesticide_Usage_kg,Crop_Yield_ton,Sustainability_Score
0,1,7.073643,49.145359,26.668157,227.890912,Wheat,131.692844,2.958215,1.57692,51.913649
1,2,6.236931,21.496115,29.325342,244.017493,Soybean,136.370492,19.20477,3.824686,47.159077
2,3,5.922335,19.469042,17.666414,141.110521,Corn,99.72521,11.041066,1.133198,50.148418
3,4,6.84512,27.974234,17.188722,156.785663,Wheat,194.832396,8.806271,8.87054,89.764557
4,5,6.934171,33.637679,23.603899,77.859362,Corn,57.271267,3.747553,8.779317,51.033941


In [5]:
df.shape  # (number of rows, number of columns) of the dataset

(10000, 10)

In [6]:
df.info()  # per-column dtype and non-null count, plus overall memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Farm_ID               10000 non-null  int64  
 1   Soil_pH               10000 non-null  float64
 2   Soil_Moisture         10000 non-null  float64
 3   Temperature_C         10000 non-null  float64
 4   Rainfall_mm           10000 non-null  float64
 5   Crop_Type             10000 non-null  object 
 6   Fertilizer_Usage_kg   10000 non-null  float64
 7   Pesticide_Usage_kg    10000 non-null  float64
 8   Crop_Yield_ton        10000 non-null  float64
 9   Sustainability_Score  10000 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 781.4+ KB


In [7]:
df.isnull().sum()  # number of missing values in each column

Farm_ID                 0
Soil_pH                 0
Soil_Moisture           0
Temperature_C           0
Rainfall_mm             0
Crop_Type               0
Fertilizer_Usage_kg     0
Pesticide_Usage_kg      0
Crop_Yield_ton          0
Sustainability_Score    0
dtype: int64

In [8]:
df.duplicated().sum()  # number of rows that exactly repeat an earlier row

np.int64(0)

In [9]:
df.describe()  # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,Farm_ID,Soil_pH,Soil_Moisture,Temperature_C,Rainfall_mm,Fertilizer_Usage_kg,Pesticide_Usage_kg,Crop_Yield_ton,Sustainability_Score
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,6.499494,29.988655,25.027475,174.969854,125.212701,10.521074,5.489634,50.2132
std,2886.89568,0.574181,11.493376,5.769509,72.860989,43.132645,5.535558,2.608809,28.667146
min,1.0,5.500021,10.002907,15.000186,50.031967,50.007543,1.00137,1.000323,0.003672
25%,2500.75,6.003992,20.027802,20.078612,111.786631,87.945625,5.675684,3.218402,25.974568
50%,5000.5,6.49538,29.862527,24.955117,174.468002,125.188012,10.619785,5.490626,50.23421
75%,7500.25,6.993481,40.052369,30.053313,237.812507,162.619398,15.330758,7.740585,74.938267
max,10000.0,7.499762,49.994713,34.999673,299.986192,199.991631,19.999099,9.999638,99.994545


In [10]:
df.describe(include='object')  # count, unique, top, and freq for the object-dtype (categorical) columns

Unnamed: 0,Crop_Type
count,10000
unique,4
top,Soybean
freq,2559


In [11]:
# One-hot encode the categorical data. With no `columns` argument, pd.get_dummies
# converts every object/string/category column; df.info() reports Crop_Type as the
# only object column, so that is the one being encoded here.
# drop_first=True emits k-1 indicator columns per encoded column (the first level is dropped).
# NOTE(review): this rebinds `df`, so the original Crop_Type column is not available afterwards.
# NOTE(review): Farm_ID is left in the frame unchanged — presumably just an identifier;
# confirm before it is used as a model feature.
df = pd.get_dummies(df, drop_first=True)


In [12]:
def iqr_outlier_bounds(values, k=1.5):
    """Compute the quartiles and IQR-based outlier fences of a numeric Series.

    Parameters
    ----------
    values : pandas.Series
        Numeric data to summarise.
    k : float, default 1.5
        Multiplier applied to the interquartile range when placing the fences.

    Returns
    -------
    tuple
        ``(q1, q3, lower, upper)`` where ``lower = q1 - k * (q3 - q1)`` and
        ``upper = q3 + k * (q3 - q1)``.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    return q1, q3, q1 - k * spread, q3 + k * spread


# Pull the column once instead of re-indexing the frame for every step
score = df['Sustainability_Score']

# Quartiles, IQR and outlier boundaries (names kept at notebook level for reuse)
Q1, Q3, lower_bound, upper_bound = iqr_outlier_bounds(score)
IQR = Q3 - Q1
print('lower_bound:', lower_bound)
print('upper_bound:', upper_bound)

# Outliers are the rows lying strictly outside the [lower_bound, upper_bound] interval
outliers = df[(score < lower_bound) | (score > upper_bound)]
print("Number of outliers in 'Sustainability_Score':", len(outliers))

lower_bound: -47.470981043895364
upper_bound: 148.38381590794924
Number of outliers in 'Sustainability_Score': 0
