# Parsing data using NumPy

In [1]:
import numpy as np
import csv
import copy

In [2]:
# Each dataset has a raw csv input and a destination for its cleaned-up copy
IN_TEST_DATA_PATH, OUT_TEST_DATA_PATH = '../data/test.csv', '../data/test_fixed.csv'
IN_TRAIN_DATA_PATH, OUT_TRAIN_DATA_PATH = '../data/train.csv', '../data/train_fixed.csv'

In [3]:
class DataFrame:
    '''
    This class is used as a data-container,
    representing column-organized information
    read from csv files.

    'column_labels' maps every column label to the
    row of 'data' that holds the column's values, so
    'data' is an 'ndarray' of shape (n_columns, n_rows).
    Every transforming method returns a new DataFrame
    and leaves 'self' untouched.
    '''

    def __init__(self, csv_path=None):
        '''
        Reads the csv file at 'csv_path': the first row
        supplies the labels, every other row one value
        per column. With 'csv_path=None' (or an empty
        file) the DataFrame is left empty.

        Raises IndexError when a row has more values
        than the header has columns.
        '''
        # Per-instance state: a class-level dictionary would be shared
        # (and overwritten) by every DataFrame that gets created
        self.column_labels = {}
        self.data = None

        # Leave the labels and data empty if the path is None
        if csv_path is None:
            return

        # newline='' lets the csv module deal with line endings itself
        with open(csv_path, newline='') as csv_file:
            csv_reader = csv.reader(csv_file)
            header = next(csv_reader, None)

            # An empty file has neither labels nor data
            if header is None:
                return

            # Fill in dictionary with (column_name:column_index)
            self.column_labels = {label: idx for idx, label in enumerate(header)}
            n_columns = len(header)
            temp_data = [[] for _ in range(n_columns)]

            # Fill data in a column-oriented fashion (single pass over the file)
            for row in csv_reader:
                if len(row) > n_columns:
                    raise IndexError('csv row has more values than the header has columns')
                for column_idx, column_values in enumerate(temp_data):
                    # Cells missing from a short row are stored as 0
                    column_values.append(row[column_idx] if column_idx < len(row) else 0)

        # Store all the data into an 'ndarray'
        self.data = np.array(temp_data)

    def _labels_by_index(self):
        '''
        Returns the column labels ordered by the
        index of the column they point to.
        '''
        return sorted(self.column_labels, key=self.column_labels.get)

    # targets have to be labels, not indices
    def get_columns(self, targets=None):
        '''
        Returns a copy of the desired columns' data as a
        list of Column objects.

        targets: a label or an iterable of labels; None
                 selects every column, ordered by index.

        An empty list is returned when 'targets' holds
        anything but strings. Unknown labels raise KeyError.
        '''
        if targets is None:
            labels = self._labels_by_index()
        else:
            # A lone label is treated as a one-element list
            labels = [targets] if isinstance(targets, str) else list(targets)
            if not all(isinstance(label, str) for label in labels):
                return []

        # '.copy()' detaches the values from this DataFrame's storage
        return [Column(label, self.data[self.column_labels[label], :].copy()) for label in labels]

    # 'at' cannot have repeated values
    def insert(self, columns, at):
        '''
        Returns a new DataFrame with newly
        inserted columns, at a designated index.

        columns: Column objects to insert.
        at: for each column, the index it occupies in
            the returned DataFrame.

        Raises ValueError when a label already exists.
        '''
        dataframe_clone = self.__clone()

        # Inserting in ascending index order keeps every requested
        # index valid while the DataFrame grows
        for column, idx in sorted(zip(columns, at), key=lambda pair: pair[1]):
            if column.label in dataframe_clone.column_labels:
                raise ValueError('column label already exists: ' + str(column.label))

            # Insert the column's data
            dataframe_clone.data = np.insert(dataframe_clone.data, idx, column.values, 0)

            # Apply index offset where needed, then register the new label
            dataframe_clone.column_labels = {
                label: (position + 1) if (position >= idx) else position
                for label, position in dataframe_clone.column_labels.items()
            }
            dataframe_clone.column_labels[column.label] = idx

        return dataframe_clone

    # target_axis=0 is columns, target_axis=1 is rows
    def drop(self, targets, target_axis=0):
        '''
        Returns a new DataFrame without the
        dropped columns/rows.

        targets: non-negative indexes (columns or rows) or
                 column labels (columns only); 'targets'
                 itself is never modified.
        target_axis: 0 drops columns, 1 drops rows.

        Any other combination returns an unchanged copy.
        '''
        dataframe_clone = self.__clone()
        targets = [targets] if isinstance(targets, str) else list(targets)

        # All elements in the list are indexes
        if all(isinstance(index, int) for index in targets):
            dropped = sorted(set(targets))

        # All elements in the list are labels
        elif all(isinstance(label, str) for label in targets) and target_axis == 0:
            dropped = sorted({self.column_labels[label] for label in targets})

        # Non-expected parameters, just return the whole DataFrame
        else:
            return dataframe_clone

        if dropped:
            dataframe_clone.data = np.delete(dataframe_clone.data, dropped, axis=target_axis)

        # Rows carry no labels: only a column drop re-indexes them
        if target_axis != 0:
            return dataframe_clone

        # Each surviving column moves down by the number of dropped
        # columns that used to sit before it
        dropped_set = set(dropped)
        dataframe_clone.column_labels = {
            label: idx - int(np.searchsorted(dropped, idx))
            for label, idx in dataframe_clone.column_labels.items()
            if idx not in dropped_set
        }

        return dataframe_clone

    def replace(self, existing_value, new_value):
        '''
        Returns a new DataFrame in which all occurrences
        of a given value are replaced with a new specified
        value. NaN is matched with 'np.isnan', since it
        never compares equal to itself.
        '''
        dataframe_clone = self.__clone()

        try:
            replacing_nan = bool(np.isnan(existing_value))
        except TypeError:
            # Non-numeric values (e.g. strings) can never be NaN
            replacing_nan = False

        if replacing_nan:
            mask = np.isnan(dataframe_clone.data)
        else:
            mask = dataframe_clone.data == existing_value
        dataframe_clone.data[mask] = new_value

        return dataframe_clone

    def _summarize(self, statistic):
        '''
        Returns a new single-row DataFrame holding
        'statistic(column)' for each of the columns.
        '''
        summary_df = DataFrame()
        summary_df.column_labels = copy.deepcopy(self.column_labels)

        temp_data = [[0] for _ in range(len(self.column_labels))]
        for column in self.get_columns():
            temp_data[self.column_labels[column.label]][0] = statistic(column)

        summary_df.data = np.array(temp_data)

        return summary_df

    def mean(self):
        '''
        Returns a new DataFrame with the columns'
        mean values.
        '''
        return self._summarize(lambda column: column.mean())

    def std(self):
        '''
        Returns a new DataFrame with the columns'
        standard deviation values.
        '''
        return self._summarize(lambda column: column.std())

    def normalize(self):
        '''
        Returns a DataFrame with column-based
        normalization.
        '''
        normalized_df = DataFrame()
        normalized_df.column_labels = copy.deepcopy(self.column_labels)

        n_rows = len(self.data[0, :])
        temp_data = [[0] * n_rows for _ in range(len(self.column_labels))]
        for column in self.get_columns():
            temp_data[normalized_df.column_labels[column.label]] = column.normalize().values

        normalized_df.data = np.array(temp_data)

        return normalized_df

    def corr(self):
        '''
        Returns a DataFrame with the correlation
        coefficients between all of the columns.
        '''
        corr_df = DataFrame()
        corr_df.column_labels = copy.deepcopy(self.column_labels)

        # Each row of 'data' is one column of the frame, i.e. one variable
        corr_df.data = np.corrcoef(self.data)

        return corr_df

    def set_type(self, target_type):
        '''
        Attempts to change the DataFrame's data
        type to a single type (target_type). The
        returned DataFrame is a copy of 'self'.
        '''
        dataframe_clone = self.__clone()
        dataframe_clone.data = dataframe_clone.data.astype(target_type)
        return dataframe_clone

    def __clone(self):
        '''
        Creates and returns a clone of the current
        DataFrame object (creating a deep copy of
        all its components).
        '''
        # Each component is copied exactly once
        dataframe_clone = DataFrame()
        dataframe_clone.column_labels = copy.deepcopy(self.column_labels)
        dataframe_clone.data = copy.deepcopy(self.data)
        return dataframe_clone

    def __repr__(self):
        '''
        Default class' representation.
        '''
        return str(self)

    def __str__(self):
        '''
        Default class' string representation: the first
        (up to) 6 columns and 8 rows, one cell per value.
        '''
        # An empty DataFrame has no data to preview
        if self.data is None or self.data.ndim < 2:
            return '(empty DataFrame)\n'

        n_rows = self.data.shape[1]
        max_columns = min(6, len(self.column_labels))
        max_rows = min(8, n_rows)
        max_string_size = 13

        def render(values):
            # Right-aligned, truncated cells; '...' marks the hidden columns
            cells = ''
            for idx, value in enumerate(values):
                if idx < max_columns:
                    text = str(value)
                    if len(text) > max_string_size:
                        text = text[:max_string_size - 3] + '...'
                    cells += text.rjust(max_string_size) + ' | '
                elif idx == max_columns:
                    cells += '...'
                else:
                    break
            return cells

        # Add the schema to the top, in the same (index) order as the values
        final_string = '| ' + render(self._labels_by_index()) + '\n'
        final_string += '-' * (max_string_size * max_columns + (max_columns + 1) * 3)

        # Add the first rows as preview
        for i in range(max_rows):
            final_string += '\n| ' + render(self.data[:, i])

        final_string += '\n(...)\n' if (n_rows > max_rows) else '\n'

        return final_string

    
class Column:
    '''
    Representation of a single column: a label
    together with the values stored under it.
    '''
    label = None
    values = None

    def __init__(self, label, values):
        self.label, self.values = label, values

    def mean(self):
        '''
        Mean of the column's values, NaNs
        being left out of the calculation.
        '''
        return np.nanmean(self.values)

    def std(self):
        '''
        Standard deviation of the column's values,
        NaNs being left out of the calculation.
        '''
        return np.nanstd(self.values)

    def normalize(self):
        '''
        Returns a new column whose values were centered
        on their mean and scaled by their standard
        deviation.
        '''
        result = self.__clone()
        # The deviation is taken from the already-centered values
        centered = result.values - np.nanmean(result.values)
        result.values = centered / np.nanstd(centered)
        return result

    def nonan(self):
        '''
        Returns a new column holding only the
        values that are not NaN.
        '''
        result = self.__clone()
        is_number = np.logical_not(np.isnan(result.values))
        result.values = result.values[is_number]
        return result

    def __clone(self):
        '''
        Returns a deep copy of the column.
        '''
        return copy.deepcopy(self)


In [4]:
# Load both raw csv files (test first, then train)
test_df, train_df = [DataFrame(csv_path) for csv_path in (IN_TEST_DATA_PATH, IN_TRAIN_DATA_PATH)]

In [5]:
# Store 'Id' and 'Prediction' columns
test_id_column, test_prediction_column = test_df.get_columns(['Id', 'Prediction'])
trrain_id_column, train_prediction_column = train_df.get_columns(['Id', 'Prediction'])

# Drop 'Id' and 'Prediction' columns and replace '-999' with 'NaN'
test_df = test_df.drop(['Id', 'Prediction']).set_type(float).replace(-999.0, float('NaN'))
train_df = train_df.drop(['Id', 'Prediction']).set_type(float).replace(-999.0, float('NaN'))

# Create correlation matrices
test_corr_df = test_df.replace(float('NaN'), 0.0).corr()
train_corr_df = train_df.replace(float('NaN'), 0.0).corr()

Dropping indexes: [1, 0]
Adjusting 1...
Result: {'DER_mass_MMC': 1, 'DER_mass_transverse_met_lep': 2, 'DER_mass_vis': 3, 'DER_pt_h': 4, 'DER_deltaeta_jet_jet': 5, 'DER_mass_jet_jet': 6, 'DER_prodeta_jet_jet': 7, 'DER_deltar_tau_lep': 8, 'DER_pt_tot': 9, 'DER_sum_pt': 10, 'DER_pt_ratio_lep_tau': 11, 'DER_met_phi_centrality': 12, 'DER_lep_eta_centrality': 13, 'PRI_tau_pt': 14, 'PRI_tau_eta': 15, 'PRI_tau_phi': 16, 'PRI_lep_pt': 17, 'PRI_lep_eta': 18, 'PRI_lep_phi': 19, 'PRI_met': 20, 'PRI_met_phi': 21, 'PRI_met_sumet': 22, 'PRI_jet_num': 23, 'PRI_jet_leading_pt': 24, 'PRI_jet_leading_eta': 25, 'PRI_jet_leading_phi': 26, 'PRI_jet_subleading_pt': 27, 'PRI_jet_subleading_eta': 28, 'PRI_jet_subleading_phi': 29, 'PRI_jet_all_pt': 30}
Adjusting 0...
Result: {'DER_mass_MMC': 0, 'DER_mass_transverse_met_lep': 1, 'DER_mass_vis': 2, 'DER_pt_h': 3, 'DER_deltaeta_jet_jet': 4, 'DER_mass_jet_jet': 5, 'DER_prodeta_jet_jet': 6, 'DER_deltar_tau_lep': 7, 'DER_pt_tot': 8, 'DER_sum_pt': 9, 'DER_pt_ratio_lep

In [6]:
# Number of columns in the correlation frame of the NaN-zeroed test data
len(test_df.replace(float('NaN'), 0.0).corr().get_columns())

30

In [7]:
# Per-column means of the test data (Column.mean ignores NaNs)
test_df.mean()

|  DER_mass_MMC | DER_mass_t... |  DER_mass_vis |      DER_pt_h | DER_deltae... | DER_mass_j... | ...
---------------------------------------------------------------------------------------------------
| 121.871729343 | 49.2583872444 | 81.1223376772 | 57.8290937019 | 2.40501628365 | 372.355428652 | ...

In [8]:
# Per-column standard deviations of the test data (Column.std ignores NaNs)
test_df.std()

|  DER_mass_MMC | DER_mass_t... |  DER_mass_vis |      DER_pt_h | DER_deltae... | DER_mass_j... | ...
---------------------------------------------------------------------------------------------------
| 56.7853497841 |  35.393433862 | 40.4739995151 | 63.3043943928 |  1.7426863849 | 398.470091756 | ...

In [9]:
# Get NaN count for each column
nan_count = {}
for column in test_df.get_columns():
    nan_count[column.label] = len(column.values) - len(column.nonan().values)
    
# Get columns sorted by NaN count
sorted_nan_count = sorted(nan_count, key=nan_count.get)

# Pick 1/3 worst columns to remove
target_columns = sorted_nan_count[int(2 * len(sorted_nan_count) / 3):]

# Drop columns from DataFrames
test_df = test_df.drop(target_columns)
train_df = train_df.drop(target_columns)

Dropping indexes: [28, 27, 26, 25, 24, 23, 12, 6, 5, 4]
Adjusting 28...
Result: {'DER_mass_MMC': 0, 'DER_mass_transverse_met_lep': 1, 'DER_mass_vis': 2, 'DER_pt_h': 3, 'DER_deltar_tau_lep': 7, 'DER_pt_tot': 8, 'DER_sum_pt': 9, 'DER_pt_ratio_lep_tau': 10, 'DER_met_phi_centrality': 11, 'PRI_tau_pt': 13, 'PRI_tau_eta': 14, 'PRI_tau_phi': 15, 'PRI_lep_pt': 16, 'PRI_lep_eta': 17, 'PRI_lep_phi': 18, 'PRI_met': 19, 'PRI_met_phi': 20, 'PRI_met_sumet': 21, 'PRI_jet_num': 22, 'PRI_jet_all_pt': 28}
Adjusting 27...
Result: {'DER_mass_MMC': 0, 'DER_mass_transverse_met_lep': 1, 'DER_mass_vis': 2, 'DER_pt_h': 3, 'DER_deltar_tau_lep': 7, 'DER_pt_tot': 8, 'DER_sum_pt': 9, 'DER_pt_ratio_lep_tau': 10, 'DER_met_phi_centrality': 11, 'PRI_tau_pt': 13, 'PRI_tau_eta': 14, 'PRI_tau_phi': 15, 'PRI_lep_pt': 16, 'PRI_lep_eta': 17, 'PRI_lep_phi': 18, 'PRI_met': 19, 'PRI_met_phi': 20, 'PRI_met_sumet': 21, 'PRI_jet_num': 22, 'PRI_jet_all_pt': 27}
Adjusting 26...
Result: {'DER_mass_MMC': 0, 'DER_mass_transverse_met_l

In [10]:
# Sanity check: list the column indexes left in the test frame after the drop
sorted(test_df.column_labels.values())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [11]:
# Normalizing dataframes
test_df = test_df.normalize()
train_df = train_df.normalize()

# Replacing NaNs with 0s
test_df = test_df.replace(float('NaN'), 0.0)
train_df = train_df.replace(float('NaN'), 0.0)

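TODO: Fix the re-indexing of column labels after a drop. Use an accumulating (cumulative-offset) array...

For example, dropping indexes [0, 1, 4] means the per-index offsets are [1, 2, 2, 2, 3, ...]; each surviving column's index is reduced by its offset.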

#### TODO:

    1. Remove 1/3 of the columns (the ones with the most NaNs)
    2. Fill in the data from the most correlated columns (above a certain coefficient)
    3. Normalize the data (subtract the mean and divide by the stddev)
    4. Fill in the rest with 0s (since that is the mean of the normalized data)