In [1]:
import csv
from enum import Enum
import numpy as np

In [2]:
class FeatureType(Enum):
    """Kind of a data column; get_data uses it to choose the encoding."""
    # Numerical values are converted with float() in get_data.
    Numerical = 1
    # Category values are expanded into one-hot indicator values in get_data.
    Category = 2

In [3]:
# Location of the CSV file that is passed to get_data further down. The path
# is relative, so it is resolved against the current working directory.
file_path = "data/heart.csv"

## Basic data manipulation

In [4]:
def get_default_descriptor():
    """Return the default list of 14 feature types, one per column position.

    Every position is ``FeatureType.Numerical`` except the positions listed
    in ``categorical_columns``, which are ``FeatureType.Category``.
    """
    categorical_columns = (
        1,  # Sex
        2,  # Chest pain type
        5,  # fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
        8,  # exercise induced angina (1 = yes; 0 = no)
    )
    return [
        FeatureType.Category if column in categorical_columns else FeatureType.Numerical
        for column in range(14)
    ]

def read_raw_data(file_path):
    """Read a comma-separated file and split it into header and body.

    Args:
        file_path: path of the CSV file to open.

    Returns:
        A ``(raw_data, names)`` tuple: ``names`` is the first row of the
        file and ``raw_data`` is the list of all remaining rows. Every row
        is a list of strings as produced by ``csv.reader``.
    """
    # newline='' lets the csv module do its own line-ending handling.
    with open(file_path, newline='') as csv_file:
        rows = csv.reader(csv_file, delimiter=',')
        header = next(rows)
        body = [record for record in rows]
    return body, header

def get_possible_feature_values(data, index):
    """Return the distinct values found in column ``index`` of ``data``.

    The result defines the order of the one-hot slots built from it, so it
    must be reproducible. Converting a ``set`` of strings straight to a list
    is not: string hashing is randomized per interpreter process, so the
    order could differ between kernel restarts.

    Args:
        data: iterable of rows, each an indexable sequence of hashable values.
        index: zero-based column position to inspect.

    Returns:
        A list of the distinct values in that column. The list is sorted
        when the values can be compared with each other; otherwise it keeps
        the order of first appearance. Rows that are too short to contain
        the column are ignored.
    """
    # dict.fromkeys removes duplicates while remembering first-appearance
    # order, which is the deterministic fallback used below.
    distinct = list(dict.fromkeys(
        row[index] for row in data if 0 <= index < len(row)
    ))
    try:
        return sorted(distinct)
    except TypeError:
        # Mixed, mutually unorderable values: keep first-appearance order.
        return distinct

def find_among_possible(possible_values, value):
    """Return the position of ``value`` inside ``possible_values``.

    The lookup is delegated to ``possible_values.index``, so the first
    matching position is returned and the ``ValueError`` it raises for a
    missing value propagates to the caller.
    """
    position = possible_values.index(value)
    return position

def find_possible_values_for_all(data, descriptor):
    """Collect the distinct values of every categorical column.

    Args:
        data: the rows to scan.
        descriptor: sequence of ``FeatureType`` entries, one per column.

    Returns:
        A list with one entry per descriptor position: the result of
        ``get_possible_feature_values`` for ``FeatureType.Category``
        positions and ``None`` for all others.
    """
    return [
        get_possible_feature_values(data, column)
        if kind == FeatureType.Category
        else None
        for column, kind in enumerate(descriptor)
    ]

def add_category_to_data_point(data_point, feature, possible_values):
    """Return ``data_point`` followed by the one-hot encoding of ``feature``.

    Args:
        data_point: list of values encoded so far.
        feature: the categorical value to encode.
        possible_values: the values the category can take; the slot at the
            position of ``feature`` in this sequence is set to 1.

    Returns:
        The concatenation ``data_point + one_hot``, where ``one_hot`` has one
        slot per possible value. The concatenation is built with ``+``
        rather than by extending ``data_point`` in place.
    """
    hot_position = find_among_possible(possible_values, feature)
    one_hot = [
        1 if slot == hot_position else 0
        for slot in range(len(possible_values))
    ]
    return data_point + one_hot

def get_data(file_path, descriptor):
    """Load the CSV at ``file_path`` and encode it as NumPy arrays.

    Each value is encoded according to the descriptor entry at the same
    column position: ``FeatureType.Numerical`` values are converted with
    ``float`` and ``FeatureType.Category`` values are replaced by a one-hot
    group. The last encoded value of every row is then split off as that
    row's output.

    Args:
        file_path: path of the CSV file; its first row is treated as a
            header and discarded.
        descriptor: sequence of ``FeatureType`` entries, one per column.
            Columns beyond the end of the descriptor are ignored because the
            row and the descriptor are zipped together.

    Returns:
        A ``(features, outputs)`` tuple of ``np.ndarray``: ``features`` is
        built from the per-row feature tuples and ``outputs`` from the value
        popped off the end of each encoded row.
    """
    raw_data, _ = read_raw_data(file_path)
    possible_values = find_possible_values_for_all(raw_data, descriptor)
    result = []
    output = []
    for row in raw_data:
        if not row:
            # csv.reader yields an empty list for a blank line. Such a row
            # has nothing to encode, and the pop() below would raise
            # IndexError on it, so it is skipped.
            continue
        data_point = []
        for i, (feature, feature_type) in enumerate(zip(row, descriptor)):
            if feature_type == FeatureType.Numerical:
                data_point.append(float(feature))
            elif feature_type == FeatureType.Category:
                data_point = add_category_to_data_point(data_point, feature, possible_values[i])
        # The output is taken from the END of the encoded row, so this only
        # yields the raw target value when the last described column is
        # Numerical; for a Category column it would be a one-hot slot.
        out = data_point.pop()
        output.append(out)
        result.append(tuple(data_point))
    return np.array(result), np.array(output)

In [5]:
# Encode the data set with the default descriptor and print the array shapes
# as a quick sanity check.
data, out = get_data(file_path, get_default_descriptor())
print(data.shape)
print(out.shape)
# NOTE(review): this prints every encoded row; consider a slice such as
# data[:5] to keep the cell output short.
for row in data:
    print(row)


(303, 19)
(303,)
[ 63.    0.    1.    0.    0.    0.    1.  145.  233.    0.    1.    0.
 150.    1.    0.    2.3   0.    0.    1. ]
[ 37.    0.    1.    0.    0.    1.    0.  130.  250.    1.    0.    1.
 187.    1.    0.    3.5   0.    0.    2. ]
[ 41.    1.    0.    0.    1.    0.    0.  130.  204.    1.    0.    0.
 172.    1.    0.    1.4   2.    0.    2. ]
[ 56.    0.    1.    0.    1.    0.    0.  120.  236.    1.    0.    1.
 178.    1.    0.    0.8   2.    0.    2. ]
[ 57.    1.    0.    1.    0.    0.    0.  120.  354.    1.    0.    1.
 163.    0.    1.    0.6   2.    0.    2. ]
[ 57.    0.    1.    1.    0.    0.    0.  140.  192.    1.    0.    1.
 148.    1.    0.    0.4   1.    0.    1. ]
[ 56.    1.    0.    0.    1.    0.    0.  140.  294.    1.    0.    0.
 153.    1.    0.    1.3   1.    0.    2. ]
[ 44.   0.   1.   0.   1.   0.   0. 120. 263.   1.   0.   1. 173.   1.
   0.   0.   2.   0.   3.]
[ 52.    0.    1.    0.    0.    1.    0.  172.  199.    0.    1.    1.
 