# Parsing data using NumPy

In [1]:
import numpy as np
import csv
import copy

In [2]:
# Test dataset: CSV that gets loaded, and the path reserved for its output
IN_TEST_DATA_PATH = '../data/test.csv'
OUT_TEST_DATA_PATH = '../data/test_fixed.csv'

# Train dataset: CSV that gets loaded, and the path reserved for its output
IN_TRAIN_DATA_PATH = '../data/train.csv'
OUT_TRAIN_DATA_PATH = '../data/train_fixed.csv'

In [3]:
class DataFrame:
    '''
    This class is used as a data-container,
    representing column-organized information
    read from csv files.

    Attributes:
        column_labels: dict mapping each column label to the
            index of that column's row inside 'data'.
        data: 2D 'ndarray' shaped (n_columns, n_rows), so that
            'data[i, :]' holds every value of column 'i'. Values
            stay as text until 'set_type' converts them.
    '''

    def __init__(self, csv_path):
        '''
        Loads the csv file found at 'csv_path'. Its first row
        is used as the header (column labels), every other row
        is stored in a column-oriented fashion.
        '''
        # State lives on the instance: a class-level dict would be
        # shared (and mutated) by every DataFrame that gets created
        self.column_labels = {}
        self.data = None

        # Read the file once; the csv module expects 'newline=""'
        with open(csv_path, newline='') as csv_file:
            rows = list(csv.reader(csv_file))

        temp_data = []

        if rows:
            header, records = rows[0], rows[1:]

            # Fill in dictionary with (column_name:column_index)
            self.column_labels = {label: idx for idx, label in enumerate(header)}

            # Pre-filled with 0s, so a row that is shorter than the
            # header keeps a placeholder in its missing cells
            temp_data = [[0] * len(records) for _ in header]

            # Fill data in a column-oriented fashion
            for row_idx, row in enumerate(records):
                for column_idx, column_value in enumerate(row):
                    temp_data[column_idx][row_idx] = column_value

        # Store all the data into an 'ndarray'
        self.data = np.array(temp_data)

    # targets have to be labels, not indices
    def get_columns(self, targets):
        '''
        Returns a copy of the desired columns' data as a
        list of Column objects (same order as 'targets').
        An empty list is returned when any of the targets
        is not a string; an unknown label raises KeyError.
        '''
        targets = list(targets)

        if not all(isinstance(label, str) for label in targets):
            return []

        # '.copy()' detaches the values from 'self.data' (plain
        # indexing would only hand out a view on the same buffer)
        return [Column(label, self.data[self.column_labels[label], :].copy()) for label in targets]

    # target_axis=0 is columns, target_axis=1 is rows
    def drop(self, targets, target_axis=0):
        '''
        Returns a new DataFrame without the
        dropped columns/rows.

        'targets' is either a list of indices (columns or
        rows, depending on 'target_axis') or a list of column
        labels (only meaningful with target_axis=0). Any other
        input yields an unchanged copy. An unknown label
        raises KeyError.
        '''
        dataframe_clone = self.__clone()
        targets = list(targets)

        if not targets:
            return dataframe_clone

        # All elements in the list are indexes
        if all(isinstance(index, int) for index in targets):
            indices = targets

        # All elements in the list are labels
        elif all(isinstance(label, str) for label in targets) and target_axis == 0:
            indices = [dataframe_clone.column_labels[label] for label in targets]

        else:
            return dataframe_clone

        n_columns = dataframe_clone.data.shape[0]
        dataframe_clone.data = np.delete(dataframe_clone.data, indices, axis=target_axis)

        # Labels only change when columns are removed: the surviving
        # columns are renumbered 0..n-1 in their original order, which
        # keeps every label pointing at the right row of 'data'
        if target_axis == 0:
            dropped = {index if index >= 0 else index + n_columns for index in indices}
            survivors = sorted(
                (index, label)
                for label, index in dataframe_clone.column_labels.items()
                if index not in dropped
            )
            dataframe_clone.column_labels = {
                label: new_index for new_index, (_, label) in enumerate(survivors)
            }

        return dataframe_clone

    def replace(self, existing_value, new_value):
        '''
        Replaces all occurrences of a given value,
        in the DataFrame, with a new specified value.
        The returned DataFrame is a copy of 'self'.
        '''
        dataframe_clone = self.__clone()
        # Matching relies on '==', so a NaN 'existing_value' never matches
        dataframe_clone.data[dataframe_clone.data == existing_value] = new_value
        return dataframe_clone

    def corr(self):
        '''
        Returns a 2D matrix with the correlation
        coefficients between all of the columns
        (each row of 'data' is handled as one variable).
        '''
        return np.corrcoef(self.data)

    def set_type(self, target_type):
        '''
        Attempts to change the DataFrame's data
        type to a single type (target_type). The
        returned DataFrame is a copy of 'self'.
        '''
        dataframe_clone = self.__clone()
        dataframe_clone.data = dataframe_clone.data.astype(target_type)
        return dataframe_clone

    def __clone(self):
        '''
        Creates and returns a clone of the current
        DataFrame object (creating a deep copy of
        all its components).
        '''
        # Labels and data are instance attributes, so a single
        # deep copy already duplicates both of them
        return copy.deepcopy(self)

    def __repr__(self):
        '''
        Default class' representation.
        '''
        return str(self)

    def __str__(self):
        '''
        Default class' string representation: a preview
        with at most 6 columns and 8 rows, every cell being
        right-aligned and truncated to 13 characters.
        '''
        max_columns = min(6, len(self.column_labels))
        # 'data' is (n_columns, n_rows); an empty container has no rows
        n_rows = self.data.shape[1] if self.data.ndim == 2 else 0
        max_rows = min(8, n_rows)
        max_string_size = 13

        def fit(text):
            # Shorten over-long cells, then right-align them
            if len(text) > max_string_size:
                text = text[:max_string_size - 3] + '...'
            return text.rjust(max_string_size) + ' | '

        # Add the schema to the top
        labels = list(self.column_labels)
        header = '| ' + ''.join(fit(label) for label in labels[:max_columns])
        if len(labels) > max_columns:
            header += '...'

        lines = [header, '-' * (max_string_size * max_columns + (max_columns + 1) * 3)]

        # Add the first rows as preview
        for i in range(max_rows):
            values = self.data[:, i]
            line = '| ' + ''.join(fit(str(value)) for value in values[:max_columns])
            if len(values) > max_columns:
                line += '...'
            lines.append(line)

        return '\n'.join(lines) + '\n(...)\n'
    
class Column:
    '''
    Minimal holder that pairs a column's label
    with the values stored under that label.
    '''
    # Class-level defaults, shadowed on every instance by __init__
    values = None
    label = None

    def __init__(self, label, values):
        '''
        Stores the given label and values as they are.
        '''
        self.label, self.values = label, values

In [4]:
# Load both csv files into DataFrame containers (test first, then train)
test_df = DataFrame(csv_path=IN_TEST_DATA_PATH)
train_df = DataFrame(csv_path=IN_TRAIN_DATA_PATH)

In [5]:
# Store 'Id' and 'Prediction' columns before they get dropped below
test_id_column, test_prediction_column = test_df.get_columns(['Id', 'Prediction'])
train_id_column, train_prediction_column = train_df.get_columns(['Id', 'Prediction'])

# Drop 'Id' and 'Prediction' columns, convert the remaining data to
# float and replace '-999' with 'NaN'
test_df = test_df.drop(['Id', 'Prediction']).set_type(float).replace(-999, float('NaN'))
train_df = train_df.drop(['Id', 'Prediction']).set_type(float).replace(-999, float('NaN'))

# Create correlation matrices
test_corr_df = test_df.corr()
train_corr_df = train_df.corr()

In [7]:
# Show a preview of the test DataFrame (rendered through DataFrame.__repr__)
test_df

|  DER_mass_MMC | DER_mass_t... |  DER_mass_vis |      DER_pt_h | DER_deltae... | DER_mass_j... | ...
---------------------------------------------------------------------------------------------------
|           nan |        79.589 |        23.916 |         3.036 |           nan |           nan | ...
|       106.398 |         67.49 |        87.949 |        49.994 |           nan |           nan | ...
|       117.794 |        56.226 |        96.358 |         4.137 |           nan |           nan | ...
|       135.861 |        30.604 |        97.288 |         9.104 |           nan |           nan | ...
|        74.159 |        82.772 |        58.731 |        89.646 |         1.347 |       536.663 | ...
|        95.709 |        94.168 |         66.28 |        14.719 |           nan |           nan | ...
|        85.798 |        49.059 |        66.131 |        37.074 |           nan |           nan | ...
|       429.273 |        75.057 |        234.61 |        71.019 |          0.59 |   

#### TODO:

    1. Remove 1/3 of the columns (the ones with the most NaNs)
    2. Fill in the data from the most correlated columns (above a certain coefficient)
    3. Normalize the data (subtract the mean and divide by the stddev)
    4. Fill in the rest with 0s (since that is the mean of the normalized data)