# Data Preprocessing

## Import relevant libraries

In [4]:
import pandas as pd

## Load the data

In [6]:
# Load the data from the provided CSV files
patients_data = pd.read_csv('patients.csv')

# Display the first few rows of the dataset to understand its structure
patients_data.head()

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult
0,1001,Patient_1,20,Diabetes,167
1,1002,Patient_2,74,Diabetes,153
2,1003,Patient_3,40,Hypertension,196
3,1004,Patient_4,23,,157
4,1005,Patient_5,35,Diabetes,115


## Data Cleaning

### 1.1 Fill missing values in the Diagnosis column with "Unknown"

In [9]:
patients_data['Diagnosis'].fillna('Unknown', inplace=True)
patients_data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  patients_data['Diagnosis'].fillna('Unknown', inplace=True)


Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult
0,1001,Patient_1,20,Diabetes,167
1,1002,Patient_2,74,Diabetes,153
2,1003,Patient_3,40,Hypertension,196
3,1004,Patient_4,23,Unknown,157
4,1005,Patient_5,35,Diabetes,115


### 1.2 Remove duplicate rows

In [11]:
patients_data.drop_duplicates()

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult
0,1001,Patient_1,20,Diabetes,167
1,1002,Patient_2,74,Diabetes,153
2,1003,Patient_3,40,Hypertension,196
3,1004,Patient_4,23,Unknown,157
4,1005,Patient_5,35,Diabetes,115
...,...,...,...,...,...
95,1096,Patient_96,40,Diabetes,96
96,1097,Patient_97,60,Diabetes,192
97,1098,Patient_98,45,Diabetes,88
98,1099,Patient_99,61,Common Cold,81


### 1.3 Validate data types and handle any inconsistencies

In [13]:
# Ensure Age is a positive integer
patients_data['Age'] = patients_data['Age'].astype(int)
patients_data = patients_data[patients_data['Age'] > 0]

In [14]:
# Ensure LabResult is a positive integer
patients_data['LabResult'] = patients_data['LabResult'].astype(int)
patients_data = patients_data[patients_data['LabResult'] > 0]

In [15]:
# Encoding for the categorical variables
diagnosis_encoding = {diagnosis: idx for idx, diagnosis in enumerate(patients_data['Diagnosis'].unique())}
patients_data['Diagnosis_Encoded'] = patients_data['Diagnosis'].map(diagnosis_encoding)

In [16]:
patients_data.head()

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult,Diagnosis_Encoded
0,1001,Patient_1,20,Diabetes,167,0
1,1002,Patient_2,74,Diabetes,153,0
2,1003,Patient_3,40,Hypertension,196,1
3,1004,Patient_4,23,Unknown,157,2
4,1005,Patient_5,35,Diabetes,115,0
