In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from datetime import datetime

In [2]:
# Notebook-wide settings shared by the cells below.

# Directory prefix for every input/output file; later cells append the file
# name by plain string concatenation, so the trailing slash is required.
data_folder = "../Data/"

# Feature groups handled by the cleaning steps; each name is used as a
# column-name prefix when selecting the columns to fill.
features = [
    'bg',
    'insulin',
    'carbs',
    'hr',
    'steps',
    'cals',
    'activity',
]

In [3]:
# Load the raw training and test sets; these frames are kept untouched so the
# cleaning helpers can compare against them later.

train_df, test_df = (
    pd.read_csv(data_folder + file_name)
    for file_name in ('train.csv', 'test.csv')
)

In [4]:
# Work on independent copies so that the frames read from disk stay available
# as the "before" reference while the copies are cleaned.

res_train_df, res_test_df = train_df.copy(), test_df.copy()

In [5]:
def bf_fill_missing_values(df, feature_prefix, original_df, print_stats=False):
    '''
    Fill missing values in a sample with the next seen value for the given feature.
    If the next seen value is not available, then fill with the last seen value.

    The fill runs along axis=1, i.e. across the selected columns of a single
    row, in the order those columns appear in df. A row in which every
    selected column is missing stays missing.

    df: DataFrame containing the data (the selected columns are overwritten in place)
    feature_prefix: Prefix of the feature columns to fill missing values for
    original_df: Original DataFrame before filling missing values (only read when print_stats is True)
    print_stats: Boolean to print statistics before and after filling missing values

    Returns the same df object that was passed in.
    '''

    # Select relevant columns
    # NOTE(review): this is a pure prefix match, so any column that merely
    # starts with the prefix (for example a target/label column, if the frame
    # has one sharing the prefix) takes part in the row-wise fill and its
    # value can be copied into neighbouring columns -- confirm against the schema.
    columns = [col for col in df.columns if col.startswith(feature_prefix)]

    # Nothing to fill when the prefix matches no column
    if columns:
        # Backfill missing values in a sample
        # DataFrame.bfill/ffill are used instead of fillna(method=...), whose
        # `method` keyword is deprecated since pandas 2.1.
        df[columns] = df[columns].bfill(axis=1)

        # Ffill missing values in a sample
        # Useful for test data without last value in the sample
        df[columns] = df[columns].ffill(axis=1)

    if print_stats:
        print("********************************")
        print("Before filling "+feature_prefix+" empty values with the next seen value")
        print(original_df[columns].shape)
        print(original_df[columns].head())
        print("Missing values before backfilling "+feature_prefix+":", original_df[columns].isnull().sum().sum())
        print("After filling "+feature_prefix+" empty values with the next seen value")
        print(df[columns].shape)
        print(df[columns].head())
        print("Missing values after backfilling "+feature_prefix+":", df[columns].isnull().sum().sum())
        print("********************************")
    return df

def fill_missing_values(df, feature_prefix, value, original_df, print_stats=False):
    '''
    Replace every missing entry of one feature group with a constant.

    df: DataFrame to clean; the matching columns are overwritten in place
    feature_prefix: columns whose name starts with this prefix are filled
    value: constant written into each missing cell
    original_df: untouched DataFrame, read only for the "before" statistics
    print_stats: when True, print shape, head and missing-value counts
                 before and after the fill

    Returns the same df object that was passed in.
    '''

    # Collect the columns that belong to the requested feature group
    columns = []
    for name in df.columns:
        if name.startswith(feature_prefix):
            columns.append(name)

    # Overwrite the selected columns with their filled version
    filled = df[columns].fillna(value)
    df[columns] = filled

    if not print_stats:
        return df

    banner = "********************************"
    before = original_df[columns]
    after = df[columns]

    print(banner)
    print(f"Before filling {feature_prefix} empty values with", value)
    print(before.shape)
    print(before.head())
    print(f"Missing values before filling {feature_prefix}:", before.isnull().sum().sum())
    print(f"After filling {feature_prefix} empty values with", value)
    print(after.shape)
    print(after.head())
    print(f"Missing values after filling {feature_prefix}:", after.isnull().sum().sum())
    print(banner)
    return df
    

In [6]:
# Row-wise gap filling: within each sample, a missing value takes the next
# seen value, falling back to the last seen value when there is no later one.
# Only the feature groups listed in the membership test below are processed.

for feature in features:
    if feature in ['bg']:
        res_train_df = bf_fill_missing_values(res_train_df, feature, train_df, print_stats=False)
        res_test_df = bf_fill_missing_values(res_test_df, feature, test_df, print_stats=False)

In [7]:
# Constant fill: every feature group that has an entry in `default_value`
# gets its missing cells replaced by that entry; groups without an entry
# are skipped.

default_value = {
    'insulin': 0,
    'carbs': 0,
    'hr': -1,
    'steps': 0,
    'cals': 0,
    'activity': 'Not Available',
}

for feature in features:
    if feature in default_value:
        res_train_df = fill_missing_values(res_train_df, feature, default_value[feature], train_df, print_stats=False)
        res_test_df = fill_missing_values(res_test_df, feature, default_value[feature], test_df, print_stats=False)

In [8]:
# Write both cleaned frames next to the inputs; file names are prefixed with
# today's date (YYYYMMDD), so a run on a different day produces new files.

current_date = datetime.now().strftime('%Y%m%d')

res_test_df.to_csv(f"{data_folder}{current_date}_test_cleaned.csv", index=False)
res_train_df.to_csv(f"{data_folder}{current_date}_train_cleaned.csv", index=False)