# Parsing data using NumPy

In [1]:
import numpy as np
import csv
import copy

In [2]:
# Input csv files, given as paths relative to the current working directory.
IN_TEST_DATA_PATH = '../data/test.csv'
IN_TRAIN_DATA_PATH = '../data/train.csv'

# Output csv paths. NOTE(review): presumably where the corrected ("fixed")
# datasets get written -- nothing in the visible code uses them; confirm.
OUT_TEST_DATA_PATH = '../data/test_fixed.csv'
OUT_TRAIN_DATA_PATH = '../data/train_fixed.csv'

In [3]:
# Keep the file handles in their own names so they can be closed explicitly
# (test_csv_file.close() / train_csv_file.close()) once the readers are no
# longer needed. newline='' is what the csv module asks for, so that quoted
# fields containing line breaks are parsed correctly.
test_csv_file = open(IN_TEST_DATA_PATH, newline='')
train_csv_file = open(IN_TRAIN_DATA_PATH, newline='')

test_csv_reader = csv.reader(test_csv_file)
train_csv_reader = csv.reader(train_csv_file)

In [4]:
class DataFrame:
    '''
    This class is used as a data-container,
    representing column-organized information
    read from csv files.

    Attributes:
        column_labels: dict mapping every column label (taken from the
            first csv row) to that column's index along axis 0 of 'data'.
        data: 'ndarray' holding one csv column per row, i.e. axis 0 is
            the csv columns and axis 1 is the csv rows.
    '''

    def __init__(self, csv_path):
        '''
        Reads the csv file found at 'csv_path'. The first row provides
        the column labels, every following row provides data.
        '''
        # newline='' lets the csv module deal with line endings itself
        with open(csv_path, newline='') as csv_file:
            rows = list(csv.reader(csv_file))

        header = rows[0] if rows else []
        body = rows[1:]

        # The mapping is created per instance: a dictionary declared on the
        # class would be shared (and mixed up) by every DataFrame object.
        self.column_labels = {label: idx for idx, label in enumerate(header)}

        # Fill data in a column-oriented fashion. Cells that a short row
        # does not provide keep the placeholder value 0.
        temp_data = [[0] * len(body) for _ in header]
        for row_idx, row in enumerate(body):
            for column_idx, column_value in enumerate(row):
                temp_data[column_idx][row_idx] = column_value

        # Store all the data into an 'ndarray'
        self.data = np.array(temp_data)

    # targets have to be labels, not indices
    def get_columns(self, targets):
        '''
        Returns a copy of the desired columns' data as a
        list of Column objects. An empty list is returned
        when any of the targets is not a label (string).
        Raises KeyError for a label that does not exist.
        '''
        targets = list(targets)

        if not all(isinstance(label, str) for label in targets):
            return []

        # '.copy()' detaches the values from 'self.data'; plain indexing
        # would hand out views that write through to this DataFrame.
        return [
            Column(label, self.data[self.column_labels[label], :].copy())
            for label in targets
        ]

    # target_axis=0 is columns, target_axis=1 is rows
    def drop(self, targets, target_axis=0):
        '''
        Returns a new DataFrame without the
        dropped columns/rows.

        'targets' is either a list of integer indices (columns when
        target_axis=0, rows when target_axis=1) or a list of column
        labels (only valid with target_axis=0). Column indices that
        do not belong to any label are ignored. Any other combination
        returns an unchanged copy. Raises KeyError for a label that
        does not exist.
        '''
        dataframe_clone = self.__clone()
        targets = list(targets)

        # All elements in the list are indexes
        if all(isinstance(index, int) for index in targets):
            if target_axis == 0:
                known_indices = set(dataframe_clone.column_labels.values())
                dataframe_clone.__drop_columns(known_indices.intersection(targets))
            else:
                # Dropping rows leaves the column labels untouched
                dataframe_clone.data = np.delete(dataframe_clone.data, targets, axis=target_axis)

        # All elements in the list are labels
        elif all(isinstance(label, str) for label in targets) and target_axis == 0:
            dataframe_clone.__drop_columns(
                {dataframe_clone.column_labels[label] for label in targets}
            )

        return dataframe_clone

    def corr(self):
        '''
        Returns a 2D matrix with the correlation
        coefficients between all of the columns.
        '''
        # Every row of 'data' is one column of the csv, which is the
        # variables-as-rows layout 'np.corrcoef' works with by default.
        return np.corrcoef(self.data)

    def set_type(self, target_type):
        '''
        Attempts to change the DataFrame's data
        type to a single type (target_type). The
        returned DataFrame is a copy of 'self'.
        '''
        dataframe_clone = self.__clone()
        dataframe_clone.data = dataframe_clone.data.astype(target_type)
        return dataframe_clone

    def __drop_columns(self, dropped_indices):
        '''
        Removes, in place, the columns found at 'dropped_indices' and
        re-numbers the labels of the columns that are left.
        '''
        dropped = sorted(set(dropped_indices))
        if not dropped:
            return

        self.data = np.delete(self.data, dropped, axis=0)

        # A surviving column moves up by the number of dropped columns
        # that were located before it (not by the total number dropped).
        self.column_labels = {
            label: index - sum(1 for gone in dropped if gone < index)
            for label, index in self.column_labels.items()
            if index not in dropped
        }

    def __clone(self):
        '''
        Creates and returns a clone of the current
        DataFrame object (creating a deep copy of
        all its components).
        '''
        # 'column_labels' and 'data' are instance attributes, so a single
        # deep copy already duplicates both of them.
        return copy.deepcopy(self)
    
class Column:
    '''
    Pairs a column label with the values
    stored under that label.
    '''
    # Class-level fallbacks, shadowed per instance by __init__
    label = values = None

    def __init__(self, label, values):
        self.label, self.values = label, values

In [5]:
# Load both csv files into column-oriented DataFrame containers
test_df = DataFrame(csv_path=IN_TEST_DATA_PATH)
train_df = DataFrame(csv_path=IN_TRAIN_DATA_PATH)

In [6]:
# Correlation coefficients between the test-set columns: 'Id' and
# 'Prediction' are left out and the remaining values are cast to float first.
corr_df = (
    test_df
    .drop(['Id', 'Prediction'])
    .set_type(float)
    .corr()
)