# Parsing the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib

%matplotlib inline

In [2]:
# Raw input CSV files; both are loaded with pd.read_csv further down.
TEST_DATA_PATH = '../data/test.csv'
TRAIN_DATA_PATH = '../data/train.csv'

# Output CSV files that receive the fixed (NaN-filled) versions of the inputs.
NEW_TEST_DATA_PATH = '../data/test_fixed.csv'
NEW_TRAIN_DATA_PATH = '../data/train_fixed.csv'

In [3]:
# Minimum absolute correlation a column must have with its best-matching
# column before that column's values are used to fill its NaNs
# (read by replace_nan_with_max_correlation).
MIN_CORRELATION = 0.15
# NOTE(review): MAX_CORRELATION is not referenced anywhere in the visible
# code; presumably an upper bound for a correlation filter -- confirm or remove.
MAX_CORRELATION = 0.85

# Auxiliary methods

In [4]:
def create_fixed_dataframe_from_raw_data(df):
    '''
    Creates a new DataFrame in which missing values are filled in.

    Cells equal to -999.0 are treated as missing and converted to NaN.
    Each column's NaNs are then filled from its most correlated column
    (see replace_nan_with_max_correlation); any NaNs that remain
    afterwards are replaced with the column's mean value.

    df must contain an 'Id' column. It is used as the index while the
    data is being fixed (so it takes no part in the correlations) and
    is turned back into a regular column in the returned DataFrame.
    df itself is not modified.
    '''
    data_df = df.replace(to_replace=-999.0, value=float('NaN'))
    data_df = data_df.set_index(['Id'], drop=True)

    # Compute the correlation matrix once; both the signed and the
    # absolute variants are derived from it.
    corr_df = data_df.corr()

    # Remove columns' self correlation, so that a column is never picked
    # as its own best match. The identity matrix is sized from the
    # correlation matrix itself so the two always have the same shape.
    identity_df = pd.DataFrame(
        np.identity(len(corr_df.columns)),
        index=corr_df.columns,
        columns=corr_df.columns,
    )
    raw_corr_df = corr_df - identity_df
    abs_corr_df = corr_df.abs() - identity_df

    # Fill NaN according to correlated columns
    filled_df = data_df.apply(
        lambda column: replace_nan_with_max_correlation(
            abs_corr_df[column.name], raw_corr_df, data_df
        )
    )

    # NOTE: dropping columns with more than 50% NaNs is currently disabled.
    #filled_df = filled_df.loc[:, (filled_df.isnull().sum(axis=0) <= filled_df.shape[0] / 2.0)]

    # Fill remaining NaNs with the column's mean value
    filled_df = filled_df.apply(replace_nan_with_mean)

    return filled_df.reset_index()

In [5]:
def replace_nan_with_max_correlation(abs_corr_column, raw_corr_df, data_df,
                                     min_correlation=None):
    '''
    Creates a new column in which every row's
    NaN values are replaced with its highest correlated
    column's values, provided the correlation is above
    a certain threshold.

    Parameters:
        abs_corr_column: Series of absolute correlations between the
            target column and every other column. Its name identifies
            the target column in data_df.
        raw_corr_df: DataFrame of signed correlations; only the sign of
            the best match is used. When the correlation is negative,
            the donor column's values are negated before filling.
        data_df: DataFrame holding the data columns. It is not modified.
        min_correlation: smallest absolute correlation for which the
            fill is performed. Defaults to the module-level
            MIN_CORRELATION when None.

    Returns a new Series. Rows where the donor column is NaN as well
    stay NaN. The column is returned unfilled (as a copy) when no
    correlation is available or the best one is below the threshold.
    '''
    if min_correlation is None:
        min_correlation = MIN_CORRELATION

    column_name = abs_corr_column.name
    column = data_df[column_name]

    # If every correlation is NaN there is no best match to pick;
    # calling idxmax() on such a column returns NaN or raises,
    # depending on the pandas version.
    candidates = abs_corr_column.dropna()
    if candidates.empty:
        return column.copy()

    best_match = candidates.idxmax()
    # Only replace if there is a high enough correlation between the columns
    if candidates.loc[best_match] < min_correlation:
        return column.copy()

    sign = 1.0 if raw_corr_df[column_name].loc[best_match] >= 0.0 else -1.0
    # Non-inplace fillna: an in-place fill on data_df[column_name] may write
    # through to data_df, making the result depend on the order in which
    # columns are processed.
    return column.fillna(sign * data_df[best_match])

In [6]:
def replace_nan_with_mean(column):
    '''
    Creates a new column in which every NaN value
    is replaced with the column's mean value
    (as computed by column.mean()).
    '''
    column_mean = column.mean()
    return column.fillna(value=column_mean)

In [7]:
# Load the raw CSV files
test_df = pd.read_csv(TEST_DATA_PATH)
train_df = pd.read_csv(TRAIN_DATA_PATH)

# Fixing testing data: 'Prediction' is set aside so it is not touched by
# the NaN filling, then put back as the second column of the result.
test_prediction_column = test_df['Prediction']
test_features_df = test_df.drop('Prediction', axis=1)
fixed_test_df = create_fixed_dataframe_from_raw_data(test_features_df)
fixed_test_df.insert(1, 'Prediction', test_prediction_column)

# Fixing training data: same procedure as for the testing data.
train_prediction_column = train_df['Prediction']
train_features_df = train_df.drop('Prediction', axis=1)
fixed_train_df = create_fixed_dataframe_from_raw_data(train_features_df)
fixed_train_df.insert(1, 'Prediction', train_prediction_column)

In [8]:
# Write each fixed DataFrame to its output CSV file, without the row index
for fixed_df, output_path in (
    (fixed_test_df, NEW_TEST_DATA_PATH),
    (fixed_train_df, NEW_TRAIN_DATA_PATH),
):
    fixed_df.to_csv(output_path, index=False)