## Import needed libraries

In [308]:
import pandas as pd
from pandas import DataFrame, Series
import os
from typing import Any

## Defining constants

In [309]:
# File name of the CSV dataset that the notebook analyses
DATASET = "heart.csv"
TARGET_COLUMN = "target" # the homework uses the "target" column, but this can be changed to another column name

## Open dataset

In [310]:
# Load <current working directory>/dataset/<DATASET> into a dataframe
df = pd.read_csv(os.path.join(os.getcwd(), "dataset", DATASET))

## Creating mean calculation function

In [311]:
def calculate_mean(d: DataFrame, target: str) -> float:
    """
    Calculate the arithmetic mean of the target column
    :param d: pandas dataframe
    :param target: name of the column to average
    :return: average value of the column (a float, even for integer columns)
    :raises Exception: if target is not a column of d
    """

    # Guard clause: fail with a readable message instead of a raw pandas KeyError
    if target not in d.columns:
        raise Exception(f"{target} is not present in dataframe")

    return d[target].mean()

## Creating median calculation function

In [312]:
def calculate_median(d: DataFrame, target: str) -> float:
    """
    Return the median (middle value) of a dataframe column
    :param d: pandas dataframe
    :param target: name of the column to summarise
    :return: median of the column
    :raises Exception: if target is not a column of d
    """

    # Reject unknown columns up front so the happy path stays unindented
    if target not in d.columns:
        raise Exception(f"{target} is not present in dataframe")

    column = d[target]
    return column.median()

## Calculate the most common value (mode)

In [313]:
def calculate_mode(d: DataFrame, target: str) -> Series:
    """
    Return the most frequent value(s) of a dataframe column
    :param d: pandas dataframe
    :param target: name of the column to summarise
    :return: series holding the mode(s); it has more than one entry when several values tie
    :raises Exception: if target is not a column of d
    """

    # Reject unknown columns up front so the happy path stays unindented
    if target not in d.columns:
        raise Exception(f"{target} is not present in dataframe")

    column = d[target]
    return column.mode()

## Calculate the standard deviation

In [314]:
def calculate_std(d: DataFrame, target: str) -> float:
    """
    Return the standard deviation of a dataframe column
    :param d: pandas dataframe
    :param target: name of the column to summarise
    :return: standard deviation as computed by pandas ``Series.std``
    :raises Exception: if target is not a column of d
    """

    # Reject unknown columns up front so the happy path stays unindented
    if target not in d.columns:
        raise Exception(f"{target} is not present in dataframe")

    column = d[target]
    return column.std()

## Calculate column variance

In [315]:
def calculate_var(d: DataFrame, target: str) -> float:
    """
    Return the variance of a dataframe column
    :param d: pandas dataframe
    :param target: name of the column to summarise
    :return: variance as computed by pandas ``Series.var``
    :raises Exception: if target is not a column of d
    """

    # Reject unknown columns up front so the happy path stays unindented
    if target not in d.columns:
        raise Exception(f"{target} is not present in dataframe")

    column = d[target]
    return column.var()

## Get a range of values

In [316]:
def get_values_range(d: DataFrame, target: str, r: tuple[float, float], inclusive: str = "both") -> list[Any]:
    """
    Get all values of a column that lie inside the provided range
    :param d: pandas dataframe
    :param target: name of the column to filter
    :param r: (min, max) bounds used to select values
    :param inclusive: which bounds are included ("both", "neither", "left", "right")
    :return: a new list with the column values between min and max, in row order
    :raises Exception: if target is not a column of d
    :raises ValueError: if min is greater than max
    """

    if target not in d.columns:
        raise Exception(f"{target} is not present in dataframe")

    lower, upper = r[0], r[1]
    if lower > upper:
        raise ValueError("invalid range values")

    column = d[target]
    in_range = column.between(lower, upper, inclusive=inclusive)

    # Select with the boolean mask instead of looping over range(len(...)):
    # positional integers are not valid labels when the index is not 0..n-1
    # (for example after filtering rows or with a custom index).
    return column[in_range].tolist()

## Get the highest correlation between the target column and any other column of df

In [317]:
def get_highest_corr(d: DataFrame, target: str) -> float:
    """
    Get the highest correlation between the target column and any other column of df
    :param d: pandas dataframe
    :param target: name of the column to correlate against the others
    :return: largest correlation coefficient with another column (the maximum signed
             value, so strong negative correlations are not selected)
    :raises Exception: if target is not a column of d, or is not numeric
    """

    if target not in d.columns:
        raise Exception(f"{target} is not present in dataframe")

    # Only numeric/bool columns can be correlated. Without this filter, recent
    # pandas versions raise as soon as the frame contains e.g. a string column.
    numeric = d.select_dtypes(include=["number", "bool"])
    if target not in numeric.columns:
        raise Exception(f"{target} is not a numeric column")

    # Drop the row with the target name, because its self-correlation is always 1
    correlations = numeric.corr()[target].drop(target, axis=0)
    return correlations.max()

## Putting all together

In [318]:
# Print every statistic for TARGET_COLUMN between two separator lines.
# Any exception raised along the way is reported as a message instead of a traceback.
try:
    print("=========================")
    print(f"Target: {TARGET_COLUMN}")
    print(f"Mean: {calculate_mean(df, TARGET_COLUMN)}")
    print(f"Median: {calculate_median(df, TARGET_COLUMN)}")
    # calculate_mode returns a Series, so it is converted to a list for printing
    print(f"Mode: {list(calculate_mode(df, TARGET_COLUMN))}")
    print(f"Variance: {calculate_var(df, TARGET_COLUMN)}")
    print(f"Standard deviation: {calculate_std(df, TARGET_COLUMN)}")

    # The output is a list of zeros: all values in the current target column are 1 or 0,
    # and 0 is the only one of them that lies between -0.5 and 0.5
    print(f"Get range between -0.5 and 0.5: {list(get_values_range(df, TARGET_COLUMN, (-0.5, 0.5), 'both'))}")

    print(f"Get highest correlation: {get_highest_corr(df, TARGET_COLUMN)}")
    print("=========================")

except Exception as e:
    # Top level of the script: show the error text rather than re-raising.
    # Lines printed before the failure stay in the output.
    print("something went wrong: ", e)

Target: target
Mean: 0.5445544554455446
Median: 1.0
Mode: [1]
Variance: 0.24883614189233488
Standard deviation: 0.4988347841643913
Get range between -0.5 and 0.5: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Get highest correlation: 0.43379826150689443
