# Data Preprocessing

In [1]:
import pathlib
import pandas as pd

In [2]:
import sys

## Load Dataset

In [3]:
# Relative path object for the current working directory.
path = pathlib.Path('.')

In [4]:
# Locate the CSV relative to the working directory: take the third ancestor
# of the absolute working directory, then descend into its data/ folder.
data_path = (
    path.absolute().parents[2]
    / 'data/2410_iPAGE_SoilData-241029.csv'
)

In [6]:
# Read the soil dataset into a DataFrame. If the CSV is missing, print a short
# message naming the path that was tried instead of a long pandas traceback.
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f'File not found, ensure file path "{data_path}" exists')
    # Stop here: without the file, `df` is never defined for the cells below.
    sys.exit(1)

## Explore Dataset

In [7]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,Area,Data Collection Year,soil group,Land class,knit (surface),pH,SOC (%),Nitrogen N (%),Potassium K (meq/100),Phosphorus P (ug/g),Sulfur S (ug/g),Boron B (ug/g),Zinc Zn (ug/g)
0,Mithpukur,2005,belab,high ground,Clay loam,5.0,1.27,0.08,0.15,19.6,37.7,0.26,0.86
1,Mithpukur,2005,belab,high ground,Clay loam,4.9,1.47,0.09,0.25,4.1,32.0,0.25,0.75
2,Mithpukur,2005,belab,high ground,Clay loam,4.6,1.07,0.05,0.09,13.3,13.5,0.27,0.95
3,Mithpukur,2005,belab,high ground,Clay loam,5.2,1.51,0.06,0.3,20.2,30.0,0.28,1.0
4,Mithpukur,2005,belab,high ground,Clay loam,5.3,1.08,0.11,0.17,20.5,27.8,0.3,1.04


In [8]:
# Summarise column names, non-null counts and dtypes to spot type problems.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Area                   618 non-null    object 
 1   Data Collection Year   618 non-null    int64  
 2   soil group             618 non-null    object 
 3   Land class             618 non-null    object 
 4   knit (surface)         461 non-null    object 
 5   pH                     618 non-null    object 
 6   SOC (%)                618 non-null    float64
 7   Nitrogen N (%)         618 non-null    object 
 8   Potassium K (meq/100)  618 non-null    object 
 9   Phosphorus P (ug/g)    618 non-null    object 
 10  Sulfur S (ug/g)        618 non-null    object 
 11  Boron B (ug/g)         618 non-null    object 
 12  Zinc Zn (ug/g)         618 non-null    object 
dtypes: float64(1), int64(1), object(11)
memory usage: 62.9+ KB


In [9]:
# Number of missing values in each column.
df.isna().sum()

Area                       0
Data Collection Year       0
soil group                 0
Land class                 0
knit (surface)           157
pH                         0
SOC (%)                    0
Nitrogen N (%)             0
Potassium K (meq/100)      0
Phosphorus P (ug/g)        0
Sulfur S (ug/g)            0
Boron B (ug/g)             0
Zinc Zn (ug/g)             0
dtype: int64

### Data Quality Issues Discovered

- Incorrect data types: the columns `pH`, `Nitrogen N (%)`, `Potassium K (meq/100)`, `Phosphorus P (ug/g)`, `Sulfur S (ug/g)`, `Boron B (ug/g)`, and `Zinc Zn (ug/g)` are stored as `object` instead of a numeric type
- Column `knit (surface)` has 157 missing values

### Data Fix

In [18]:
# Names of the measurement columns that should hold numeric values.
numerical_cols = [
    'pH',
    'SOC (%)',
    'Nitrogen N (%)',
    'Potassium K (meq/100)',
    'Phosphorus P (ug/g)',
    'Sulfur S (ug/g)',
    'Boron B (ug/g)',
    'Zinc Zn (ug/g)',
]


In [29]:
# Convert the measurement columns to a numeric dtype. errors='coerce' replaces
# every entry that cannot be parsed with NaN, so count those entries per column
# before overwriting; otherwise the data loss would go unnoticed.
numeric_values = df[numerical_cols].apply(pd.to_numeric, errors='coerce')

# A value was lost when it was present before the conversion but is NaN after.
lost_in_coercion = (numeric_values.isna() & df[numerical_cols].notna()).sum()

df[numerical_cols] = numeric_values

# Show the per-column count of values that became NaN. Re-running this cell
# after the conversion reports zeros, because the columns are already numeric.
lost_in_coercion

In [16]:
# Count rows that repeat an earlier row exactly (first occurrence not counted).
df.duplicated(keep='first').sum()

np.int64(0)