# Parsing data using Numpy

In [1]:
from data_handler import DataFrame, Column

In [2]:
# Test split: the CSV that is loaded, and the CSV the processed frame is written to.
IN_TEST_DATA_PATH, OUT_TEST_DATA_PATH = '../data/test.csv', '../data/test_fixed.csv'

# Train split: same pairing of input file and output file.
IN_TRAIN_DATA_PATH, OUT_TRAIN_DATA_PATH = '../data/train.csv', '../data/train_fixed.csv'

In [3]:
# Build one DataFrame per input CSV; the test file is read first, then the train file.
test_df, train_df = [
    DataFrame(csv_path)
    for csv_path in (IN_TEST_DATA_PATH, IN_TRAIN_DATA_PATH)
]

In [4]:
def _split_labels_and_features(frame):
    """Separate the 'Id' and 'Prediction' columns from the remaining columns.

    Returns a tuple ``(id_column, prediction_column, features)``.
    ``features`` is ``frame`` with 'Id' and 'Prediction' dropped, passed
    through ``as_type(float)``, and with every -999.0 replaced by NaN.
    """
    id_column, prediction_column = frame.get_columns(['Id', 'Prediction'])
    features = frame.drop(['Id', 'Prediction'])
    features = features.as_type(float)
    features = features.replace(-999.0, float('NaN'))
    return id_column, prediction_column, features


# Keep 'Id' and 'Prediction' aside; from here on test_df / train_df hold
# only the float columns, with -999.0 turned into NaN.
test_id_column, test_prediction_column, test_df = _split_labels_and_features(test_df)
train_id_column, train_prediction_column, train_df = _split_labels_and_features(train_df)

# Correlation matrices, computed with NaN replaced by 0.0 first
test_corr_df, train_corr_df = [
    features.replace(float('NaN'), 0.0).corr()
    for features in (test_df, train_df)
]

In [5]:
# NaN entries per column label: total length minus the length once NaNs are removed.
# NOTE(review): the counts are taken from test_df only, yet the resulting
# columns are dropped from train_df as well — confirm both frames are meant
# to share the test frame's NaN pattern.
nan_count = {
    column.label: len(column.values) - len(column.nonan().values)
    for column in test_df.get_columns()
}

# Column labels ordered from fewest NaNs to most NaNs
sorted_nan_count = sorted(nan_count, key=lambda label: nan_count[label])

# The final third of that ordering is dropped. The slice is taken regardless
# of the actual counts, so it applies even when those columns have few NaNs.
first_dropped_index = 2 * len(sorted_nan_count) // 3
target_columns = sorted_nan_count[first_dropped_index:]

# Remove the same labels from both frames
test_df, train_df = test_df.drop(target_columns), train_df.drop(target_columns)

In [6]:
# For each frame in turn: normalize, replace the NaNs with 0.0, then round the values.
# NOTE(review): normalize() is called on each frame separately; whether the
# test frame should instead reuse the train frame's statistics depends on
# what normalize() computes — confirm.
test_df, train_df = [
    frame.normalize().replace(float('NaN'), 0.0).round_values()
    for frame in (test_df, train_df)
]

In [7]:
# Convert each frame with as_type(str), then insert its stored 'Id' and
# 'Prediction' columns again, passing [0, 1] as the insert positions.
test_leading_columns = [test_id_column, test_prediction_column]
train_leading_columns = [train_id_column, train_prediction_column]

test_as_str = test_df.as_type(str)
train_as_str = train_df.as_type(str)

test_df = test_as_str.insert(test_leading_columns, [0, 1])
train_df = train_as_str.insert(train_leading_columns, [0, 1])

In [8]:
# Display the test frame now that 'Id' and 'Prediction' have been inserted back
test_df

|            Id |    Prediction |  DER_mass_MMC | DER_mass_t... |  DER_mass_vis |      DER_pt_h | ...
---------------------------------------------------------------------------------------------------
|        350000 |             ? |           0.0 |         0.857 |       -1.4134 |       -0.8655 | ...
|        350001 |             ? |       -0.2725 |        0.5151 |        0.1687 |       -0.1238 | ...
|        350002 |             ? |       -0.0718 |        0.1969 |        0.3764 |       -0.8482 | ...
|        350003 |             ? |        0.2464 |       -0.5271 |        0.3994 |       -0.7697 | ...
|        350004 |             ? |       -0.8402 |        0.9469 |       -0.5532 |        0.5026 | ...
|        350005 |             ? |       -0.4607 |        1.2689 |       -0.3667 |        -0.681 | ...
|        350006 |             ? |       -0.6353 |       -0.0056 |       -0.3704 |       -0.3279 | ...
|        350007 |             ? |        5.4134 |        0.7289 |        3.7923 |   

In [9]:
# Display the train frame now that 'Id' and 'Prediction' have been inserted back
train_df

|            Id |    Prediction |  DER_mass_MMC | DER_mass_t... |  DER_mass_vis |      DER_pt_h | ...
---------------------------------------------------------------------------------------------------
|        100000 |             s |        0.2899 |        0.0683 |        0.4077 |         -0.47 | ...
|        100001 |             b |         0.682 |        0.5525 |        0.5401 |       -0.1532 | ...
|        100002 |             b |           0.0 |        3.1952 |        1.0966 |       -0.3497 | ...
|        100003 |             b |        0.3848 |        0.9104 |       -0.0059 |        -0.903 | ...
|        100004 |             b |        0.9425 |       -0.9146 |        1.3134 |       -0.6518 | ...
|        100005 |             b |       -0.5605 |       -1.0098 |       -0.5396 |        0.9182 | ...
|        100006 |             s |        0.4694 |       -0.5765 |        0.6515 |        0.7577 | ...
|        100007 |             s |        0.5769 |       -1.0984 |        0.3314 |   

In [10]:
# Write each processed frame to its output CSV (test first, then train)
for frame, out_path in (
    (test_df, OUT_TEST_DATA_PATH),
    (train_df, OUT_TRAIN_DATA_PATH),
):
    frame.write_to_csv(out_path)