# Hands-on exercises

## Data preparation 

In [None]:
%pylab inline

rcParams['figure.dpi'] = 300
rcParams['figure.facecolor'] = 'w'

import pandas as pd

data = pd.read_csv('../input/Health_Data.csv')

df = data[data.train]
del df['train']

In [None]:
# Display the training frame (pandas truncates long frames in the rich output).
df

## EDA

In [None]:
# (number of rows, number of columns) of the training frame
df.shape

In [None]:
# Column dtypes as loaded from the CSV
df.dtypes

#### Convert `fasting_blood_sugar` to `str`

In [6]:
# Element-wise conversion of the column to str.
# (`Series.to_string()` would instead render the whole Series, index included,
# as one printable block of text.)
# Note: missing values become the literal string 'nan' after astype(str).
df_str = df.fasting_blood_sugar.astype(str)

#### Missing values
- create a bar plot with the number of missing values

In [None]:

# Count the missing entries in every column, then draw one bar per column.
missing_per_column = df.isna().sum()
missing_per_column.plot.bar(figsize=(5, 5))
title('Number of missing values (np.NaN or None)')
grid()

#### Histograms

In [None]:
# One histogram per column, each rendered as its own figure.
for column_name, column_values in df.items():
    column_values.hist()
    title(column_name)
    show()


In [None]:
# Column names, for reference in the per-feature sections below
df.columns

In [None]:
# Box plot of the frame's columns; the x tick labels are rotated so the
# column names do not overlap.
figsize(5, 5)
box_axes = df.plot.box()
xticks(rotation=90)

---

## Going through the data feature by feature

### Age

- identify the outlier and replace it by the median value
- replace all missing values by the median age 
- convert age to integer

In [None]:
# Look at the age distribution before any cleaning is applied.
age_values = df['age']
age_values.hist()
title('age')
show()

In [None]:
# Identify the outlier: ages above 80 are treated as invalid. Ages equal to 0
# are marked as missing as well, matching the earlier two-step approach that
# wrote 0 into the outlier rows and then replaced every 0 with NaN.
invalid_age = (df.age > 80) | (df.age == 0)

# Mark the invalid entries as missing in a single step.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df['age'] = df.age.mask(invalid_age, np.nan)

In [None]:
# Impute the missing ages with the median of the observed ages.
median_age = df.age.median()
df['age'] = df['age'].fillna(median_age)

# Re-plot to check the distribution after imputation.
df['age'].hist()

In [None]:
# dtypes before the age column is cast to integer
df.dtypes

To convert age to integer:

In [19]:
# Cast age to integer. astype(int) raises if the column still contains NaN,
# so this has to run after the missing ages have been filled.
df["age"] = df["age"].astype(int)

In [None]:
# Check the dtypes again after the integer cast of age
df.dtypes

Plotting with seaborn

In [None]:
import seaborn as sns
figsize(5,5)
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation is the documented replacement for the same kind of figure.
sns.histplot(df['age'], kde=True, stat='density')

### Sex
- standardize the values in the sex column 
- encode the values with 0: female and 1: male
- compare the age distributions for males and females

## chest_pain_type

- Create a one-hot encoding for the `chest_pain_type` column
- remove the old column and add the new column

In [None]:
# Distribution of the chest-pain categories.
chest_pain = df['chest_pain_type']
chest_pain.hist(figsize=(5, 4))
title('chest_pain_type')

### resting_blood_pressure
NO - identify possible outliers and replace them with the median value of the remaining data
- create a regression imputation using only the age column with `sklearn.linear_model.LinearRegression`
- impute the missing values

### ST_depression_exercise_to_rest
- Verify the column has no missing values
- Identify the _special value_ in `ST_depression_exercise_to_rest`
- Generate a new binary column indicating the location of the special value
- Replace the special value with the median of the rest of the data

In [None]:
# The figure size is passed to hist() as a keyword. Assigning a tuple to the
# name `figsize` would not resize anything and would shadow the pylab helper
# `figsize(...)` that other cells call.
df.ST_depression_exercise_to_rest.hist(bins=30, figsize=(2, 2))
title('ST_depression_exercise_to_rest')

In [None]:

# Histogram of the column with its mean and median drawn as vertical lines.
st_values = df['ST_depression_exercise_to_rest']
st_mean = st_values.mean()
st_median = np.median(st_values)

hist(st_values, bins=10)
vlines(st_mean, 0, 120, color='C2', lw=2, label='Mean')
vlines(st_median, 0, 120, color='C1', lw=2, label='Median')
legend()
grid()
title('Mean is sensitive to outliers')

In [None]:
# Binary indicator: 1 where the measurement equals the special value 0, else 0.
special_mask = df['ST_depression_exercise_to_rest'].eq(0)
df['ST_is_special'] = special_mask.astype(int)

df[['ST_depression_exercise_to_rest', 'ST_is_special']]


In [None]:
df['ST_is_special'] = df.ST_depression_exercise_to_rest.replace(0, np.NaN)
df[['ST_depression_exercise_to_rest','ST_is_special']]

In [None]:
df['ST_is_special']=df['ST_is_special'].fillna(df.ST_depression_exercise_to_rest.median())
df.ST_is_special.hist()
title('New distribution; replaced special value')

### serum_cholesterol
- impute missing values with the median
- use `skewtest` to check whether `log1p` produces a more normal distribution
- if that is the case replace the old values with the transformed values

In [None]:
# Histogram of log1p-transformed serum cholesterol.
log_cholesterol = df['serum_cholesterol'].apply(log1p)
log_cholesterol.hist()

### Create a function that applies all the changes at once