### **DATA CLEANING AND FEATURE ENGINEERING**

**Default Setup**

In [35]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# PROJECT_ROOT must be set: later cells use paths relative to it
# (e.g. data/raw/...). os.getenv returns None when the variable is missing,
# which would make os.chdir fail with an unhelpful TypeError, so fail early
# with a clear message instead.
project_root = os.getenv("PROJECT_ROOT")
if not project_root:
    raise RuntimeError(
        "PROJECT_ROOT is not set; define it in the environment or in a .env file."
    )
os.chdir(project_root)

# Verify the change
print("Current Working Directory:", os.getcwd())

Current Working Directory: C:\Users\USER\Desktop\cropYield


**Importing the Required Modules**

In [36]:
from src.data.dataset_loader import load_data
from src.data.data_cleaning import clean_data
from src.data.feature_engineering import engineer_features
from src.data.data_splitting import split_data

In [37]:
# Location of the raw dataset, relative to the current working directory
data_path = os.path.join("data", "raw", "crop_yield_data.csv")

**Data Loading**

In [38]:
# Read the raw CSV through the project's loader
crop_data = load_data(data_path)

# Preview both ends of the frame: the first rows, then the last rows
for label, preview in (("Head data", crop_data.head()), ("Tail data", crop_data.tail())):
    print(label)
    print(preview)

Head data
   rainfall_mm  soil_quality_index  farm_size_hectares  sunlight_hours  \
0         1626                   9                 636              11   
1         1959                   9                  73              11   
2         1360                   1                 352               5   
3         1794                   2                 948               7   
4         1630                   5                 884               5   

   fertilizer_kg  crop_yield  
0           1006         404  
1            112         115  
2            702         231  
3            299         537  
4           2733         554  
Tail data
      rainfall_mm  soil_quality_index  farm_size_hectares  sunlight_hours  \
2995         1483                   9                  78              12   
2996          804                   9                 481               4   
2997          870                   1                 113               7   
2998         1352                   6    

In [39]:
# Report the dataset dimensions (rows, then columns)
n_rows, n_cols = crop_data.shape
print(f"Total Rows {n_rows}\nTotal Columns: {n_cols}")


Total Rows 3000
Total Columns: 6


**Inferences**
****
- **The dataset has 3000 rows (instances) and 6 columns (features).**

In [40]:
# Structural overview: column names, non-null counts, dtypes and memory usage
crop_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   rainfall_mm         3000 non-null   int64
 1   soil_quality_index  3000 non-null   int64
 2   farm_size_hectares  3000 non-null   int64
 3   sunlight_hours      3000 non-null   int64
 4   fertilizer_kg       3000 non-null   int64
 5   crop_yield          3000 non-null   int64
dtypes: int64(6)
memory usage: 140.8 KB


**Inferences**
****
- **All features (columns) in the data are numerical (`int64`).**

In [41]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each column
crop_data.describe()

Unnamed: 0,rainfall_mm,soil_quality_index,farm_size_hectares,sunlight_hours,fertilizer_kg,crop_yield
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,1263.095,5.506667,498.801,7.995333,1549.450333,328.099
std,432.371756,2.855172,287.122742,2.621501,814.326919,145.036503
min,500.0,1.0,10.0,4.0,100.0,46.0
25%,896.0,3.0,242.0,6.0,869.75,199.0
50%,1277.0,6.0,505.0,8.0,1542.0,332.0
75%,1636.0,8.0,741.0,10.0,2225.0,455.0
max,2000.0,10.0,1000.0,12.0,3000.0,628.0


In [42]:
# Count the missing (NaN) values in each column
crop_data.isna().sum()

rainfall_mm           0
soil_quality_index    0
farm_size_hectares    0
sunlight_hours        0
fertilizer_kg         0
crop_yield            0
dtype: int64

**Inferences**
****
- **There are no missing values.**
****
- **We will still perform data cleaning to remove outliers, if any exist.**

In [43]:
# Run the project's cleaning step on the raw frame.
# NOTE(review): intended to remove outliers — confirm the exact rules in clean_data.
crop_data_cleaned= clean_data(crop_data)
# Show the shape after cleaning so it can be compared with the original row count
crop_data_cleaned.shape

(3000, 6)

In [44]:
# Save the cleaned data.
# The frame written must be the output of clean_data (crop_data_cleaned),
# not the raw crop_data, otherwise the "cleaned" file is just a copy of the raw data.
# Build the path to the CSV file
data_path = os.path.join('data', 'processed', 'cleaned_crop_yield_data.csv')
crop_data_cleaned.to_csv(data_path, index=False)

**Inferences**
****
- **No outliers appear to have been removed: the dataset size remains 3000 rows (instances) after cleaning.**

**Feature Engineering**

In [45]:
# Apply the project's feature-engineering step to the cleaned frame.
# NOTE(review): intended to scale the data — confirm the transformations in engineer_features.
final_crop_data = engineer_features(crop_data_cleaned)

In [46]:
# Save the feature-engineered (processed) data.
# data_path is reassigned here and is read again by the data-splitting cell below.
data_path = os.path.join('data', 'processed', 'processed_crop_yield_data.csv')
final_crop_data.to_csv(data_path, index=False)

**Data Splitting**

In [47]:
# Split the processed dataset into train and test CSV files.
# data_path still refers to the processed CSV assigned in the previous cell.
split_data(
    input_data_path=data_path,
    output_train_path='data/train/train.csv',
    output_test_path='data/test/test.csv',
    target_column='crop_yield',
    seed=42,  # fixed seed — presumably for a reproducible split; confirm in split_data
    test_size=0.2,  # presumably the fraction held out for testing; confirm in split_data
)

Training data saved to data/train/train.csv
Testing data saved to data/test/test.csv
