# Data Cleansing

In [25]:
%%javascript
IPython.notebook.clear_all_output();

<IPython.core.display.Javascript object>

# Import Dataset

In [26]:
import pandas as pd

dataset = pd.read_csv("MRH_preprocess.csv")
dataset.head()

Unnamed: 0,s_no,age,male,female,white,black,hypertension_diagnosis_age,dyslipidemia,chf,arrythmia,...,psg_highest_heart_rate_observed,psg_asystole,psg_longest_pause_duration,psg_narrow_complex_tachycardia,psg_highest_heart_rate_observed_nct,psg_wide_complex_tachycardia,psg_highest_heart_rate_observed_wct,psg_atrial_fibrillation,outcome_death,Unnamed: 635
0,1,64.299632,0,1,0,1,40.0,1,0,0,...,,0.0,,0.0,,0.0,,0.0,,
1,2,40.449724,1,0,0,1,25.0,1,0,0,...,,,,,,,,,,
2,3,54.325437,0,1,0,1,34.0,1,0,0,...,,0.0,,0.0,,0.0,,0.0,,
3,4,47.143907,0,1,1,0,36.0,0,0,0,...,,0.0,,0.0,,0.0,,0.0,,
4,5,38.700316,0,1,0,1,30.0,0,0,0,...,,0.0,,0.0,,0.0,,0.0,,


In [27]:
# keep only some colums for illustration
colums = ['hypertension_diagnosis_age', 'psg_tst_total_lms_index', 'thiazide_ma', 'abpm_overall_sbp_sd']

dataset = dataset[colums]
dataset.head(10)

Unnamed: 0,hypertension_diagnosis_age,psg_tst_total_lms_index,thiazide_ma,abpm_overall_sbp_sd
0,40.0,0.0,Chlorthalidone 12.5mg,15.2
1,25.0,,,18.0
2,34.0,0.0,Chlorthalidone 25mg,17.2
3,36.0,0.3,,28.2
4,30.0,0.0,Chlorthalidone 25mg,24.6
5,40.0,0.0,,22.0
6,42.0,59.6,Chlorthalidone 12.5mg,15.4
7,30.0,0.0,,13.2
8,45.0,0.0,Chlorthalidone 25mg,23.6
9,26.0,24.6,Chlorthalidone 25mg,17.0


In [28]:
# you can add a column easily
dataset['age/10'] = dataset['hypertension_diagnosis_age'] / 10
dataset.head()

Unnamed: 0,hypertension_diagnosis_age,psg_tst_total_lms_index,thiazide_ma,abpm_overall_sbp_sd,age/10
0,40.0,0.0,Chlorthalidone 12.5mg,15.2,4.0
1,25.0,,,18.0,2.5
2,34.0,0.0,Chlorthalidone 25mg,17.2,3.4
3,36.0,0.3,,28.2,3.6
4,30.0,0.0,Chlorthalidone 25mg,24.6,3.0


In [29]:
# and easily delete a column
dataset.drop(columns=['hypertension_diagnosis_age'], inplace=True)
dataset.head()

Unnamed: 0,psg_tst_total_lms_index,thiazide_ma,abpm_overall_sbp_sd,age/10
0,0.0,Chlorthalidone 12.5mg,15.2,4.0
1,,,18.0,2.5
2,0.0,Chlorthalidone 25mg,17.2,3.4
3,0.3,,28.2,3.6
4,0.0,Chlorthalidone 25mg,24.6,3.0


## Count Missing Data

In [30]:
dataset.count()

psg_tst_total_lms_index    181
thiazide_ma                166
abpm_overall_sbp_sd        218
age/10                     210
dtype: int64

In [31]:
miss = len(dataset) - dataset.count()
miss.head(10)

psg_tst_total_lms_index    56
thiazide_ma                71
abpm_overall_sbp_sd        19
age/10                     27
dtype: int64

## Data Cleansing

In [32]:
# see column types
dataset.dtypes

psg_tst_total_lms_index     object
thiazide_ma                 object
abpm_overall_sbp_sd         object
age/10                     float64
dtype: object

In [33]:
pd.to_numeric(dataset['psg_tst_total_lms_index'], errors='raise')

ValueError: Unable to parse string "28..3" at position 55

In [34]:
dataset['psg_tst_total_lms_index'][50:60]

50     38.9
51     35.8
52      6.4
53      0.1
54      0.3
55    28..3
56      0.2
57       42
58      4.3
59        0
Name: psg_tst_total_lms_index, dtype: object

In [35]:
dataset['psg_tst_total_lms_index'][55] = 28.3
dataset['psg_tst_total_lms_index'] = pd.to_numeric(dataset['psg_tst_total_lms_index'], errors='raise')
dataset.dtypes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


psg_tst_total_lms_index    float64
thiazide_ma                 object
abpm_overall_sbp_sd         object
age/10                     float64
dtype: object

In [36]:
pd.to_numeric(dataset['abpm_overall_sbp_sd'], errors='raise')

ValueError: Unable to parse string "19:03" at position 98

In [37]:
dataset['abpm_overall_sbp_sd'][95:100]

95     18.3
96     10.8
97     10.7
98    19:03
99     12.6
Name: abpm_overall_sbp_sd, dtype: object

In [38]:
dataset['abpm_overall_sbp_sd'][98] = 19.03
dataset['abpm_overall_sbp_sd'] = pd.to_numeric(dataset['abpm_overall_sbp_sd'], errors='raise')
dataset.dtypes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


psg_tst_total_lms_index    float64
thiazide_ma                 object
abpm_overall_sbp_sd        float64
age/10                     float64
dtype: object

In [39]:
dataset['thiazide_ma'].value_counts()

Chlorthalidone 25mg                   105
Chlorthalidone 12.5mg                  29
HCTZ 25mg                              12
HCTZ 12.5mg                             7
HCTZ 50mg                               3
Chlothalidone 25mg                      1
HCTZ 6.25mg                             1
Chlorthalidone 12.5 25mg                1
Chlorhalidone 25mg                      1
Chlorthalidone 12.5mg x 3days/week      1
HCTZ 37.5mg                             1
Hydrochlorothiazide 12.5mg              1
HCTZ 25mg Chlorthalidone                1
HCTZ 12.5mg BID                         1
Chlorthalidone 50mg                     1
Name: thiazide_ma, dtype: int64

In [40]:
dataset['thiazide_ma'].replace('Chlorthalidone 25mg', 1, inplace=True)
dataset['thiazide_ma'].replace('Chlorthalidone 12.5mg', 1, inplace=True)
dataset['thiazide_ma'].replace('HCTZ 25mg', 2, inplace=True)
dataset['thiazide_ma'].replace('HCTZ 12.5mg', 2, inplace=True)
dataset['thiazide_ma'].replace('HCTZ 50mg', 2, inplace=True)
dataset['thiazide_ma'].replace('Chlorthalidone 25/12.5mg', 1, inplace=True)
dataset['thiazide_ma'].replace('Chlothalidone 25mg', 1, inplace=True)
dataset['thiazide_ma'].replace('Chlorthalidone 50mg', 1, inplace=True)
dataset['thiazide_ma'].replace('Chlorthalidone 12.5 25mg', 1, inplace=True)
dataset['thiazide_ma'].replace('Chlorthalidone 12.5mg x 3days/week', 1, inplace=True)
dataset['thiazide_ma'].replace('Hydrochlorothiazide 12.5mg', 2, inplace=True)
dataset['thiazide_ma'].replace('HCTZ 6.25mg', 2, inplace=True)
dataset['thiazide_ma'].replace('HCTZ 12.5mg BID', 2, inplace=True)
dataset['thiazide_ma'].replace('Chlorhalidone 25mg', 1, inplace=True)
dataset['thiazide_ma'].replace('HCTZ 37.5mg', 2, inplace=True)
dataset['thiazide_ma'].replace('HCTZ 25mg Chlorthalidone', 3, inplace=True)
dataset['thiazide_ma'].value_counts()

1.0    139
2.0     26
3.0      1
Name: thiazide_ma, dtype: int64

In [41]:
dataset

Unnamed: 0,psg_tst_total_lms_index,thiazide_ma,abpm_overall_sbp_sd,age/10
0,0.0,1.0,15.2,4.0
1,,,18.0,2.5
2,0.0,1.0,17.2,3.4
3,0.3,,28.2,3.6
4,0.0,1.0,24.6,3.0
...,...,...,...,...
232,,,,5.9
233,,2.0,14.5,2.5
234,,2.0,,4.5
235,29.5,,24.6,2.2


## You may save your result for later use

In [42]:
dataset.to_csv('cleansed.csv', index=False)

In [None]:
# The End