In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

## Loading data
First we have to load data from the various tab-separated files

In [2]:
# Directory holding the raw tab-separated files. They are read with
# header=None, so the columns get their names in a later cell.
path = 'data/raw/'
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the concatenated frame reproducible across runs and machines.
df_list = [
    pd.read_csv(os.path.join(path, file), sep='\t', header=None)
    for file in sorted(os.listdir(path))
]
# Stack all per-file frames into one raw frame (each file's own row index is kept).
df = pd.concat(df_list)


In [3]:
# The files were read without a header row, so label the four columns here.
df.columns = [
    'date',
    'time',
    'code',
    'value',
]

## Data description
The data set contains a diabetes patient record
Columns:
1. date
2. time
3. code describes (#TODO)
4. value (#TODO)

Codes can be looked up in a second dataframe (df_codes)

In [4]:
# Peek at the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,date,time,code,value
0,05-20-1991,08:00,58.0,101
1,05-20-1991,08:00,33.0,5
2,05-20-1991,08:00,34.0,27
3,05-20-1991,12:00,60.0,89
4,05-20-1991,12:00,33.0,3


In [5]:
# Per column: does it contain at least one missing value?
df.isna().any(axis=0)

date     True
time     True
code     True
value    True
dtype: bool

We have null values in every column

In [6]:
# For each column, keep the rows whose entry in that column is missing.
nan_rows_date = df[df['date'].isna()]
nan_rows_time = df[df['time'].isna()]
nan_rows_code = df[df['code'].isna()]
nan_rows_value = df[df['value'].isna()]
# Only the count for the date column is reported here.
print(f'nulls in date: {len(nan_rows_date)}')

nulls in date: 34


In [7]:
# dropna() returns a new frame, so the raw `df` stays untouched and the two
# lengths can be compared afterwards.
df_clean = df.dropna()
before, after = len(df), len(df_clean)
print(f'removed {before-after} entries, length dataframe without null: {after}')

removed 67 entries, length dataframe without null: 29264


In [8]:
# Sanity check: no column of the cleaned frame should report a missing value.
df_clean.isna().any(axis=0)

date     False
time     False
code     False
value    False
dtype: bool

### descriptive analysis, cleansing and outliers

#### General statistics and correlations

In [None]:
# Row count of the raw frame `df`; rows containing nulls are still included
# here (the null-free frame is `df_clean`).
print(f'Amount data points: {len(df)}')

#### Date statistics

In [None]:
# Distinct values in the date column of the raw frame. NaN is not counted
# (dropna=True is the default, spelled out because `df` still has nulls).
amount_dates = df['date'].nunique(dropna=True)
print(f'Amount days (uncleaned): {amount_dates}')

In [None]:
# Histogram of how many data points were recorded per distinct date in the
# raw frame (rows with a missing date are not part of any group).
ax = df.groupby('date').size().hist(bins=20)
# Label the figure so it can be read on its own; the trailing semicolon
# suppresses the return value's repr in the cell output.
ax.set(
    title='Data points per day (uncleaned)',
    xlabel='Data points per day',
    ylabel='Number of days',
);

In [None]:
# Number of rows for each distinct value of the date column in the raw frame
# (rows with a missing date are not part of any group). Computed once and
# reused instead of repeating the same groupby three times.
points_per_day = df.groupby('date').size()
min_per_day = points_per_day.min()
max_per_day = points_per_day.max()
print(f'Min. data points per day: {min_per_day}')
# Fixed label: this line reports the maximum, not the minimum.
print(f'Max. data points per day: {max_per_day}')
# Extra upper percentiles show how far the tail of the distribution extends.
print(points_per_day.describe(percentiles=[.75,.80,.95,.99]))

### data set description
1. The number of data points per day is right-skewed: 75% of days have fewer than 31 data points, while the remaining days hold between 32 and a maximum of 126 points
2. 

#### Time statistics

In [None]:
# Distinct values in the time column of the raw frame; NaN is not counted
# (dropna=True is the default, spelled out because `df` still has nulls).
amount_times = df['time'].nunique(dropna=True)
print(f'Amount unique points in time (uncleaned): {amount_times}')

#### Code statistics

In [None]:
# Distinct values in the code column of the raw frame; NaN is not counted
# (dropna=True is the default, spelled out because `df` still has nulls).
amount_codes = df['code'].nunique(dropna=True)
print(f'Amount unique points in code (uncleaned): {amount_codes}')

#### Value statistics

In [None]:
# Distinct values in the value column of the raw frame; NaN is not counted
# (dropna=True is the default, spelled out because `df` still has nulls).
amount_values = df['value'].nunique(dropna=True)
print(f'Amount unique points in value (uncleaned): {amount_values}')

## Transformation and feature engineering

When Should I Use Log Transformation?
Many possible transformations exist. However, you should only use a log transformation if:

Your data is highly skewed to the right (i.e. in the positive direction).
(https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/skewed-distribution/)