# Step 1: Data Processing

---

1. [Initial Preparation](#initial-preparation)
2. [Baseline Model](#baseline-model)
3. Data Cleaning
4. Data Transformation

In [1]:
# Imports and environment setup

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns

from utils import header
from utils import show_distribution
from utils import show_aggregate_distribution

plt.style.use('ggplot')

## 1. Initial Preparation<a id="initial-preparation">

As discussed in [Step 1. EDA](./Step1.EDA.ipynb)

In [2]:
# Load datasets
df_train = pd.read_csv("../data/Paitients_Files_Train.csv")
df_test = pd.read_csv("../data/Paitients_Files_Test.csv")

# Dropping unused columns
df_train = df_train.drop(columns=['ID', 'Insurance'])  # Unused col
df_test = df_test.drop(columns=['ID', 'Insurance'])    # Unused col

# Fix incorrect column name spelling, because I am very particular about grammar and spelling :)
df_train = df_train.rename(columns={"Sepssis": "Sepsis"})

# Turn categorical values in the target Sepsis column into numerical values so that our model can work with the data
df_train["Sepsis"] = df_train["Sepsis"].map({"Negative": 0.0, "Positive": 1.0})

In [4]:
print(f"{header(9, 'SHAPE')}\n{df_train.shape}")
print(f"\n{header(19, 'NULL COUNT')}\n{df_train.isna().sum()}")
print(f"\n{header(39, 'COLUMNS OVERVIEW')}")
print(df_train.info())
df_train.head()

╔═══════╗
║ SHAPE ║
╚═══════╝
(599, 9)

╔═════════════════╗
║   NULL COUNT    ║
╚═════════════════╝
PRG       0
PL        0
PR        0
SK        0
TS        0
M11       0
BD2       0
Age       0
Sepsis    0
dtype: int64

╔═════════════════════════════════════╗
║          COLUMNS OVERVIEW           ║
╚═════════════════════════════════════╝
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PRG     599 non-null    int64  
 1   PL      599 non-null    int64  
 2   PR      599 non-null    int64  
 3   SK      599 non-null    int64  
 4   TS      599 non-null    int64  
 5   M11     599 non-null    float64
 6   BD2     599 non-null    float64
 7   Age     599 non-null    int64  
 8   Sepsis  599 non-null    float64
dtypes: float64(3), int64(6)
memory usage: 42.2 KB
None


Unnamed: 0,PRG,PL,PR,SK,TS,M11,BD2,Age,Sepsis
0,6,148,72,35,0,33.6,0.627,50,1.0
1,1,85,66,29,0,26.6,0.351,31,0.0
2,8,183,64,0,0,23.3,0.672,32,1.0
3,1,89,66,23,94,28.1,0.167,21,0.0
4,0,137,40,35,168,43.1,2.288,33,1.0


In [5]:
print(f"{header(9, 'SHAPE')}\n{df_test.shape}")
print(f"\n{header(19, 'NULL COUNT')}\n{df_test.isna().sum()}")
print(f"\n{header(39, 'COLUMNS OVERVIEW')}")
print(df_test.info())
df_train.head()

╔═══════╗
║ SHAPE ║
╚═══════╝
(169, 8)

╔═════════════════╗
║   NULL COUNT    ║
╚═════════════════╝
PRG    0
PL     0
PR     0
SK     0
TS     0
M11    0
BD2    0
Age    0
dtype: int64

╔═════════════════════════════════════╗
║          COLUMNS OVERVIEW           ║
╚═════════════════════════════════════╝
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PRG     169 non-null    int64  
 1   PL      169 non-null    int64  
 2   PR      169 non-null    int64  
 3   SK      169 non-null    int64  
 4   TS      169 non-null    int64  
 5   M11     169 non-null    float64
 6   BD2     169 non-null    float64
 7   Age     169 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 10.7 KB
None


Unnamed: 0,PRG,PL,PR,SK,TS,M11,BD2,Age,Sepsis
0,6,148,72,35,0,33.6,0.627,50,1.0
1,1,85,66,29,0,26.6,0.351,31,0.0
2,8,183,64,0,0,23.3,0.672,32,1.0
3,1,89,66,23,94,28.1,0.167,21,0.0
4,0,137,40,35,168,43.1,2.288,33,1.0


In [None]:
print(f"Missing values count: {df_test.isna().sum().sum() + df_train.isna().sum().sum()}")

## 2. Baseline Model<a id="baseline-model"></a>