In [1]:
import pandas as pd

# Read the patient records from the CSV file into a DataFrame
patients_data = pd.read_csv('patients.csv')

# Preview the first five rows to see which columns and values are present
patients_data.head(5)


Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult
0,1001,Patient_1,20,Diabetes,167
1,1002,Patient_2,74,Diabetes,153
2,1003,Patient_3,40,Hypertension,196
3,1004,Patient_4,23,,157
4,1005,Patient_5,35,Diabetes,115


In [2]:
# 1. Data Cleaning:
# The cleaning steps run as one method chain that rebinds `patients_data`,
# so no step assigns into a column selection or a filtered slice.

patients_data = (
    patients_data
    # 1.1 Fill missing values in the Diagnosis column with "Unknown".
    # The filled column is assigned back explicitly: calling
    # fillna(..., inplace=True) on a selected column is chained assignment,
    # which pandas 2.x warns about and which no longer updates the parent
    # frame once Copy-on-Write is enabled.
    .assign(Diagnosis=lambda df: df['Diagnosis'].fillna('Unknown'))
    # 1.2 Remove duplicate rows
    .drop_duplicates()
    # 1.3 Validate data types and handle any inconsistencies
    # Ensure Age is a positive integer. astype(int) raises if the column
    # holds missing or non-numeric values, so malformed input fails loudly
    # here instead of being dropped silently.
    .astype({'Age': int})
    .loc[lambda df: df['Age'] > 0]
    # Ensure LabResult is a positive integer (same approach as Age)
    .astype({'LabResult': int})
    .loc[lambda df: df['LabResult'] > 0]
    # Detach the filtered result so the column assignment below writes to an
    # independent frame and cannot trigger SettingWithCopyWarning.
    .copy()
)

# 1.4 Encoding for the categorical variables
# Each distinct diagnosis gets an integer code in order of first appearance
# in the cleaned Diagnosis column.
diagnosis_encoding = {diagnosis: idx for idx, diagnosis in enumerate(patients_data['Diagnosis'].unique())}
patients_data['Diagnosis_Encoded'] = patients_data['Diagnosis'].map(diagnosis_encoding)

patients_data.head()


Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult,Diagnosis_Encoded
0,1001,Patient_1,20,Diabetes,167,0
1,1002,Patient_2,74,Diabetes,153,0
2,1003,Patient_3,40,Hypertension,196,1
3,1004,Patient_4,23,Unknown,157,2
4,1005,Patient_5,35,Diabetes,115,0
