Assignment #1: Data Exploration, Cleaning, Normalization, Preprocessing, and Analysis

Submitted By: Hamesh Raj

Email: hm.raisingani@gmail.com

Date: 19-02-2024

### Part 1: Data Exploration and Cleaning

In [7]:

# 1. Read the "04_Data.csv" file using pandas
import pandas as pd

# The path is relative, so it is resolved against the notebook's working directory.
csv_path = "04_Data.csv"
data = pd.read_csv(csv_path)

In [8]:
# 2. Identify and analyze the data types of each column.
# `dtypes` is a Series mapping each column name to its pandas dtype.
data_types = data.dtypes
# One print call with a newline separator: heading first, then the dtype listing.
print("Data Types:", data_types, sep="\n")

Data Types:
Patient ID         object
Age                 int64
BMI               float64
Diagnosis          object
Blood Pressure     object
dtype: object


In [9]:

# 3. Find the number of missing values in each column.
# isna() flags every missing cell; summing the boolean frame counts them per column.
missing_values = data.isna().sum()
# The heading keeps its leading blank line; the counts follow on the next line.
print("\nMissing Values:", missing_values, sep="\n")


Missing Values:
Patient ID         0
Age                0
BMI               14
Diagnosis          9
Blood Pressure     9
dtype: int64


In [10]:
# Display the BMI column (all rows) to see where the missing values fall.
data.loc[:, "BMI"]

0     23.416480
1     30.539825
2     31.654859
3           NaN
4           NaN
5           NaN
6     20.154990
7     21.675982
8     31.285261
9     33.454012
10          NaN
11          NaN
12          NaN
13    31.369513
14          NaN
15          NaN
16          NaN
17    26.752736
18    27.885888
19    29.278378
20    23.506374
21          NaN
22    29.850859
23          NaN
24          NaN
25          NaN
26    23.628654
27    22.254053
28    28.550565
29          NaN
Name: BMI, dtype: float64

In [11]:
# 4. Clean the missing values or fill with mean:
# Series.mean() skips NaN by default, so this is the mean of the observed BMI values only.
mean_bmi = data['BMI'].mean()

# Replace missing BMI values with the mean BMI (excluding NaN).
# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on `data['BMI']`: that chained in-place form acts on
# an intermediate object, emits a FutureWarning in pandas 2.x, and leaves `data`
# unchanged once Copy-on-Write is enabled.
data['BMI'] = data['BMI'].fillna(mean_bmi)
print(data['BMI'])

0     23.416480
1     30.539825
2     31.654859
3     27.203652
4     27.203652
5     27.203652
6     20.154990
7     21.675982
8     31.285261
9     33.454012
10    27.203652
11    27.203652
12    27.203652
13    31.369513
14    27.203652
15    27.203652
16    27.203652
17    26.752736
18    27.885888
19    29.278378
20    23.506374
21    27.203652
22    29.850859
23    27.203652
24    27.203652
25    27.203652
26    23.628654
27    22.254053
28    28.550565
29    27.203652
Name: BMI, dtype: float64


In [12]:
# Display the Blood Pressure column (all rows) to see where the missing readings fall.
data.loc[:, "Blood Pressure"]

0     119/66
1     103/62
2      98/70
3     117/87
4        NaN
5     139/81
6     115/60
7     137/65
8     123/71
9        NaN
10    111/84
11    117/75
12       NaN
13    102/69
14     92/75
15       NaN
16    131/82
17       NaN
18    110/81
19       NaN
20    100/89
21       NaN
22    112/82
23     92/60
24     90/69
25    113/70
26    107/74
27       NaN
28       NaN
29    137/75
Name: Blood Pressure, dtype: object

In [13]:
# Impute missing blood pressure values using forward fill.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the result
# is assigned back to the column rather than using chained inplace=True, which
# operates on an intermediate object and does not update `data` under
# Copy-on-Write.
# NOTE: forward fill copies the previous row's reading; a missing value in the
# very first row has no predecessor and would remain NaN.
data['Blood Pressure'] = data['Blood Pressure'].ffill()
data['Blood Pressure']

  data['Blood Pressure'].fillna(method='ffill', inplace=True)


0     119/66
1     103/62
2      98/70
3     117/87
4     117/87
5     139/81
6     115/60
7     137/65
8     123/71
9     123/71
10    111/84
11    117/75
12    117/75
13    102/69
14     92/75
15     92/75
16    131/82
17    131/82
18    110/81
19    110/81
20    100/89
21    100/89
22    112/82
23     92/60
24     90/69
25    113/70
26    107/74
27    107/74
28    107/74
29    137/75
Name: Blood Pressure, dtype: object

In [14]:
# Handle missing diagnoses by dropping rows.
# Rebind `data` to the filtered frame instead of mutating it with inplace=True;
# the original row labels are kept (the index is not reset), so gaps in the
# index mark the rows that were removed.
data = data.dropna(subset=['Diagnosis'])
data['Diagnosis']

2     Hypertension
3     Hypertension
4     Hypertension
5     Hypertension
6         Diabetes
10        Diabetes
11    Hypertension
12    Hypertension
13    Hypertension
14        Diabetes
16    Hypertension
17    Hypertension
18    Hypertension
19    Hypertension
20        Diabetes
21        Diabetes
24    Hypertension
25        Diabetes
26        Diabetes
28    Hypertension
29        Diabetes
Name: Diagnosis, dtype: object

### Part 2: Data Preprocessing and Analysis

In [15]:
# 1. Convert the "Blood Pressure" column into two separate numerical columns for systolic and diastolic pressure using string manipulation techniques.
# Split each "systolic/diastolic" string on the slash into a two-column frame of text parts.
bp_parts = data['Blood Pressure'].str.split('/', expand=True)
# Cast the text parts to integers and store them under descriptive column names.
data[['Systolic_BP', 'Diastolic_BP']] = bp_parts.astype(int)

data.head()

Unnamed: 0,Patient ID,Age,BMI,Diagnosis,Blood Pressure,Systolic_BP,Diastolic_BP
2,P0003,68,31.654859,Hypertension,98/70,98,70
3,P0004,22,27.203652,Hypertension,117/87,117,87
4,P0005,42,27.203652,Hypertension,117/87,117,87
5,P0006,29,27.203652,Hypertension,139/81,139,81
6,P0007,21,20.15499,Diabetes,115/60,115,60


In [16]:
# 2. Apply data normalization (min-max scaling) to "BMI" column.
# Use the vectorized Series.min()/max(): they skip NaN, whereas the Python
# builtins min()/max() iterate element by element and give an order-dependent
# result if a NaN is present.
min_bmi = data['BMI'].min()
max_bmi = data['BMI'].max()
bmi_range = max_bmi - min_bmi

if bmi_range == 0:
    # All BMI values are identical: scale them to 0.0 instead of computing 0/0 (NaN).
    data['Normalized_BMI'] = 0.0
else:
    # Standard min-max scaling: the smallest BMI maps to 0 and the largest to 1.
    data['Normalized_BMI'] = (data['BMI'] - min_bmi) / bmi_range
data['Normalized_BMI']

2     1.000000
3     0.612934
4     0.612934
5     0.612934
6     0.000000
10    0.612934
11    0.612934
12    0.612934
13    0.975187
14    0.612934
16    0.612934
17    0.573724
18    0.672260
19    0.793347
20    0.291428
21    0.612934
24    0.612934
25    0.612934
26    0.302061
28    0.730058
29    0.612934
Name: Normalized_BMI, dtype: float64

In [17]:
# 3. Create bins for the "Age" column and use one-hot encoding to transform it into categorical features.
bins = [20, 30, 40, 50, 60, 70, 80, 90, 100]
labels = ['20-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
# pd.cut uses right-closed intervals, i.e. (20, 30], (30, 40], ... which matches
# the labels except for the first edge: by default Age == 20 falls outside
# (20, 30] and becomes NaN. include_lowest=True closes the first interval on
# the left so that 20 lands in '20-30' as the label promises.
# Ages below 20 or above 100 are still outside every bin and get NaN, which
# get_dummies encodes as all-False across the Age_Bin_* columns.
data['Age_Bin'] = pd.cut(data['Age'], bins=bins, labels=labels, include_lowest=True)
data = pd.get_dummies(data, columns=['Age_Bin'])
data.head()

Unnamed: 0,Patient ID,Age,BMI,Diagnosis,Blood Pressure,Systolic_BP,Diastolic_BP,Normalized_BMI,Age_Bin_20-30,Age_Bin_31-40,Age_Bin_41-50,Age_Bin_51-60,Age_Bin_61-70,Age_Bin_71-80,Age_Bin_81-90,Age_Bin_91-100
2,P0003,68,31.654859,Hypertension,98/70,98,70,1.0,False,False,False,False,True,False,False,False
3,P0004,22,27.203652,Hypertension,117/87,117,87,0.612934,True,False,False,False,False,False,False,False
4,P0005,42,27.203652,Hypertension,117/87,117,87,0.612934,False,False,True,False,False,False,False,False
5,P0006,29,27.203652,Hypertension,139/81,139,81,0.612934,True,False,False,False,False,False,False,False
6,P0007,21,20.15499,Diabetes,115/60,115,60,0.0,True,False,False,False,False,False,False,False


In [18]:

# 4. Perform one-hot encoding on the "Diagnosis" column.
# get_dummies replaces each listed column with one indicator column per distinct value.
diagnosis_columns = ['Diagnosis']
data = pd.get_dummies(data=data, columns=diagnosis_columns)
data.head()

Unnamed: 0,Patient ID,Age,BMI,Blood Pressure,Systolic_BP,Diastolic_BP,Normalized_BMI,Age_Bin_20-30,Age_Bin_31-40,Age_Bin_41-50,Age_Bin_51-60,Age_Bin_61-70,Age_Bin_71-80,Age_Bin_81-90,Age_Bin_91-100,Diagnosis_Diabetes,Diagnosis_Hypertension
2,P0003,68,31.654859,98/70,98,70,1.0,False,False,False,False,True,False,False,False,False,True
3,P0004,22,27.203652,117/87,117,87,0.612934,True,False,False,False,False,False,False,False,False,True
4,P0005,42,27.203652,117/87,117,87,0.612934,False,False,True,False,False,False,False,False,False,True
5,P0006,29,27.203652,139/81,139,81,0.612934,True,False,False,False,False,False,False,False,False,True
6,P0007,21,20.15499,115/60,115,60,0.0,True,False,False,False,False,False,False,False,True,False
