In [None]:
"""
Course: dpat4
Lab: 04
Part: 2
"""

import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from contextlib import contextmanager
from scipy.stats import pearsonr, spearmanr

TIMER: dict = {}


def record(lib: str, task: str, time_used: float) -> None:
    """
    Store the elapsed time of a task in the module-level ``TIMER`` registry.

    Timings are grouped per library, i.e. ``TIMER[lib][task] = time_used``.
    Recording the same (lib, task) pair again overwrites the earlier value.

    :param lib: the name of the lib used (numpy, pandas)
    :param task: task label (task1)
    :param time_used: time used to complete the task
    :return: None
    """
    # TIMER is only mutated, never rebound, so no ``global`` statement is needed.
    TIMER.setdefault(lib, {})[task] = time_used


@contextmanager
def timer(lib: str, task: str):
    """
    Context manager that times the enclosed block and records the result.

    The elapsed time (rounded to 6 decimal places) is passed to ``record``
    and printed, even when the enclosed block raises.

    :param lib: the name of the lib used (numpy, pandas)
    :param task: task label (task1)
    :return: nothing is yielded to the ``with`` block
    """
    # perf_counter is monotonic and high-resolution, unlike the wall clock
    # returned by time.time(), so it is the right tool for measuring durations.
    start = time.perf_counter()
    try:
        yield
    finally:
        time_used = round(time.perf_counter() - start, 6)
        record(lib, task, time_used)

        print(f'task: {task}\nlib: {lib}\ntime used: {time_used}')


# Location of the processed Cleveland heart-disease data file.
# The host must be "archive.ics.uci.edu"; without the dot before "edu"
# the name does not resolve and every download below fails.
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

print('Setup complete...')

## Task No. 0
Desc: Імпортування датасету

In [None]:
# task: 0
# lib: pandas

# Column names applied to the file through ``names=``.
headers = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num",
]

with timer('pandas', 'task0'):
    # '?' is read as a missing value; rows with any missing value are dropped.
    pd_data = pd.read_csv(URL, sep=',', na_values='?', names=headers).dropna()

pd_data.head()

In [None]:
# task: 0
# lib: numpy

with timer('numpy', 'task0'):

    # Load the file as a structured array with one named field per header;
    # '?' entries are treated as missing and filled with NaN.
    np_data = np.genfromtxt(
        URL,
        names=headers,
        delimiter=',',
        dtype=None,
        encoding=None,
        missing_values='?',
        filling_values=np.nan,
    )

    # Keep a record only if none of its fields is NaN.
    mask = ~np.any([np.isnan(np_data[col]) for col in headers], axis=0)

    np_data = np_data[mask]

np_data

In [None]:
# Sanity check: select every record that still has a NaN in any field.
# The selection should be empty if the NaN filtering at load time worked.

nan_mask = np.logical_or.reduce([np.isnan(np_data[col]) for col in headers])
nan = np_data[nan_mask]

nan

## Task No. 1
Нормалізувати датасет

In [None]:
def normalise(data):
    """
    Min-max scale ``data`` along axis 0.

    Each value becomes ``(x - min) / (max - min)``, where min and max are
    taken along axis 0 (per column for 2-D input).

    A constant column has ``max == min``; its range is replaced by 1 so the
    column maps to 0 instead of producing NaN from a 0/0 division.

    :param data: array-like that supports ``np.min`` / ``np.max`` along axis 0
        and element-wise arithmetic (NumPy array, pandas Series or DataFrame)
    :return: scaled data of the same shape as ``data``
    """
    lower = np.min(data, axis=0)  # computed once and reused below
    span = np.max(data, axis=0) - lower
    # Guard against division by zero for constant columns.
    safe_span = np.where(span == 0, 1, span)
    return (data - lower) / safe_span

# task: 1
# lib: pandas

with timer('pandas', 'task1'):

    # Columns that get rescaled with ``normalise``; the rest are left as-is.
    normalise_cols = ['trestbps', 'chol', 'thalach', 'oldpeak']

    pd_data = pd_data.astype(float)

    # Work on a copy so ``pd_data`` keeps the un-normalised values.
    pd_normalised = pd_data.copy()
    pd_normalised[normalise_cols] = normalise(pd_data[normalise_cols])

pd_normalised

In [None]:
# task: 1
# lib: numpy

with timer('numpy', 'task1'):
    
    # Work on a copy so ``np_data`` keeps the un-normalised values.
    np_normalised = np_data.copy()
    
    # Rescale each selected field independently with ``normalise``.
    for col in normalise_cols:
        np_normalised[col] = normalise(np_data[col])
    
np_normalised

## Task No. 2
Побудувати гістограму за одним з атрибутів, яка показуватиме кількість елементів у кожному з 10 діапазонів, які ви задасте.

In [None]:
# task: 2
# lib: pandas

with timer('pandas', 'task2'):
    # Histogram of the 'age' column split into 10 bins.
    pd_data['age'].plot(kind='hist', bins=10, edgecolor='black')
    
    # The pyplot calls below act on the axes created by the plot call above.
    plt.title('Histogram of Age')
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# task: 2
# lib: numpy

with timer('numpy', 'task2'):
    # Assumes 'sex' is coded 0/1, as the tick labels below imply.
    # Explicit bin edges centre each bar on its code; with ``bins=2`` the bars
    # would span [0, 0.5] and [0.5, 1], leaving the ticks at 0 and 1 on the
    # outer edges of the bars instead of under them.
    plt.hist(np_normalised['sex'], bins=[-0.5, 0.5, 1.5], edgecolor='black')

    plt.xticks([0, 1], ['Female', 'Male'])
    plt.title('Histogram of Sex')
    plt.xlabel('Sex')
    plt.ylabel('Frequency')
    plt.show()

## Task No. 3
Збудувати графік залежності одного integer/real атрибута від іншого.

In [None]:
# task: 3
# lib: pandas

with timer('pandas', 'task3'):
    # Scatter plot of 'chol' against 'age'; alpha makes overlapping points visible.
    pd_data.plot(kind='scatter', x='age', y='chol', alpha=0.5)
    
    # The pyplot calls below act on the axes created by the plot call above.
    plt.title('Age vs Cholesterol')
    plt.xlabel('Age')
    plt.ylabel('Cholesterol')
    plt.show()

In [None]:
# task: 3
# lib: numpy

with timer('numpy', 'task3'):
    # With ``data=np_data`` the strings 'age' and 'chol' are looked up as
    # field names in ``np_data``.
    plt.scatter(data=np_data, x='age', y='chol', alpha=0.5)
    
    plt.title('Age vs Cholesterol')
    plt.xlabel('Age')
    plt.ylabel('Cholesterol')
    plt.show()

## Task No. 4
Підрахувати коефіцієнт Пірсона та Спірмена для двох integer/real атрибутів.

In [None]:
# task: 4
# lib: pandas

def get_correlation(dataframe, cols: tuple) -> pd.DataFrame:
    """
    Compute the Pearson and Spearman correlation coefficients of two columns.

    :param dataframe: container indexable by column name
        (``dataframe[name]`` must return that column's values)
    :param cols: pair ``(first, second)`` of column names to correlate
    :return: one-column DataFrame labelled ``'<first> / <second>'`` with the
        rows ``pearson`` and ``spearman``; the p-values are discarded
    """
    first, second = cols
    x, y = dataframe[first], dataframe[second]

    # Both scipy functions return (statistic, p-value); keep the statistic only.
    coefficients = {
        'pearson': pearsonr(x, y)[0],
        'spearman': spearmanr(x, y)[0],
    }

    return pd.DataFrame({f'{first} / {second}': coefficients})
    
    
with timer('pandas', 'task4'):
    
    # Pearson and Spearman coefficients between the 'age' and 'chol' columns.
    pd_correlation = get_correlation(pd_data, ('age', 'chol'))

pd_correlation

In [None]:
# task: 4
# lib: numpy

with timer('numpy', 'task4'):
    
    # ``get_correlation`` only indexes its input by name, so the structured
    # array can be passed in directly.
    # NOTE(review): this correlates 'fbs' / 'restecg', whereas the pandas cell
    # uses 'age' / 'chol' - confirm the different column pair is intended.
    np_correlation = get_correlation(np_data, ('fbs', 'restecg'))

np_correlation

## Task No. 5
Провести One Hot Encoding категоріального string атрибуту.

In [None]:
# task: 5
# lib: pandas

with timer('pandas', 'task5'):
    
    # One-hot encode the 'sex' column; output column names start with 'sex_'.
    pd_oc = pd.get_dummies(pd_data['sex'], prefix='sex')

pd_oc

In [None]:
# task: 5
# lib: numpy

with timer('numpy', 'task5'):

    # NOTE(review): despite its name, ``cp`` holds the integer codes of the
    # 'sex' field, not of the 'cp' field.
    cp = np_data['sex'].astype(int)
    # Row i of the identity matrix is the one-hot vector for code i, so
    # indexing the matrix with the codes yields one encoded row per record.
    np_oc = np.eye(cp.max() + 1)[cp]

np_oc

## Task No. 6
Провести візуалізацію багатовимірних даних

In [None]:
# Pairwise plots for a subset of columns, coloured by 'age'.
# NOTE(review): 'age' is both in ``cols`` and used as ``hue``; seaborn may
# leave the hue variable out of the plotted grid, and a many-valued numeric
# hue gives a long legend - confirm this is the intended colouring.
cols = ['age', 'thalach', 'ca', 'num']
pp = sns.pairplot(pd_data[cols], hue='age', height=1.8, aspect=1.8,
              plot_kws=dict(edgecolor="black", linewidth=0.5))

# Make room at the top of the figure for the overall title.
fig = pp.fig 
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Age, Thalach, Ca, and Num Pairwise Plots', fontsize=14)


In [None]:
# Compare the recorded timings: one line per library, tasks along the x axis.
plt.figure(figsize=(8, 6))
for lib_name, tasks in TIMER.items():
    # Materialise the dict views so plain lists are handed to matplotlib.
    plt.plot(list(tasks), list(tasks.values()), marker='x', label=lib_name)

plt.title("Elapsed Time per Task")
plt.xlabel("Task")
plt.ylabel('Time, s')
plt.legend()
plt.grid(True)
plt.tight_layout()