In [145]:
import numpy as np
import pandas as pd

class DataAnalysis(object):
    def __init__(self, path):
        data_raw = pd.read_csv(path).drop("Index", axis = 1)
        self.data = data_raw.copy()
        self.data = self.data.loc[:, self.data.dtypes == "float64"]
        self.header = self.data.columns
        self.data = np.array(self.data.T)

    def compute_for_columns(self, function):
        row = []
        for col in self.data:
            row.append(function(col))
        return(row)

    def col_count(self, column):
        count = 0
        for i in column:
            if not np.isnan(i):
                count += 1
        return(count)

    def col_mean(self, column):
        col_sum = 0
        for i in column:
            if not np.isnan(i):
                col_sum += i
        return(col_sum/self.col_count(column))

    def col_std(self, column):
        mean = self.col_mean(column)
        return(np.sqrt(self.col_mean(np.square(column-mean))))

    def col_filter_nan(self, column):
        new_col = []
        for i in column:
            if not np.isnan(i):
                new_col.append(i)
        return(new_col)

    def col_sort(self, column):
        sorted_col = self.col_filter_nan(column)
        for i in range(1, len(sorted_col)):
            j = i-1
            nxt_element = sorted_col[i]
    # Compare the current element with next one
            while (sorted_col[j] > nxt_element) and (j >= 0):
                sorted_col[j+1] = sorted_col[j]
                j=j-1
            sorted_col[j+1] = nxt_element
        return(sorted_col)

    def col_min(self, column):
        return(self.col_sort(column)[0])

    def col_max(self, column):
        return(self.col_sort(column)[-1])

    def col_quantile(self, column, fraction):
        n = self.col_count(column)
        m = fraction*n
        sorted_col = self.col_sort(column)
        if np.floor(m) == m:
            return(sorted_col[int(m)])
        else:
            return((sorted_col[int(np.floor(m))] + sorted_col[int(np.ceil(m))])/2)

    def col_quantile_25(self, column):
        return(self.col_quantile(column,0.25))

    def col_median(self, column):
        return(self.col_quantile(column,0.5))

    def col_quantile_75(self, column):
        return(self.col_quantile(column,0.75))

    def describe_42(self):
        data_description = [self.compute_for_columns(self.col_count),
                            self.compute_for_columns(self.col_mean),
                           self.compute_for_columns(self.col_std),
                           self.compute_for_columns(self.col_min),
                           self.compute_for_columns(self.col_quantile_25),
                           self.compute_for_columns(self.col_median),
                           self.compute_for_columns(self.col_quantile_75),
                           self.compute_for_columns(self.col_max)]

        return pd.DataFrame(data_description,
                            index = ["count","mean","std","min","25","median","75", "max"], 
                            columns = self.header)


In [146]:
DataAnalysis('../data/dataset_train.csv').describe_42()

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
count,1566.0,1568.0,1567.0,1569.0,1561.0,1565.0,1565.0,1557.0,1566.0,1570.0,1560.0,1600.0,1600.0
mean,49634.570243,39.797131,1.14102,-0.387863,3.15391,-224.589915,495.74797,2.963095,1030.096946,5.950373,-0.053427,-243.374409,21.958012
std,16674.479577,520.13233,5.218016,5.211132,4.15397,486.189433,106.251202,4.424353,44.111025,3.146852,0.971146,8.780895,97.601087
min,-24370.0,-966.740546,-10.295663,-10.162119,-8.727,-1086.496835,283.869609,-8.858993,906.62732,-4.697484,-3.313676,-261.04892,-181.47
25,38513.0,-489.493777,-4.308182,-5.251511,3.1,-577.577682,397.526978,2.221199,1026.248273,3.648671,-0.670996,-250.64727,-41.84
median,49018.0,261.644731,3.47766,-2.58113,4.6245,-418.912644,464.122952,4.378897,1045.533335,5.877582,-0.043296,-244.86751,-2.51
75,60873.0,525.90954,5.422106,4.90575,5.6675,256.886191,597.598097,5.830729,1058.465401,8.25155,0.594446,-232.53675,50.89
max,104956.0,1016.21194,11.612895,9.667405,10.032,1092.388611,745.39622,11.889713,1098.958201,13.536762,3.056546,-225.42814,279.07
