# Solutions

## Cleaning the data

In [115]:
from typing import Optional, Union

import pandas as pd
import numpy as np

In [2]:
def remove_columns(df: pd.DataFrame):
    """Return a new frame without the columns that contain missing values.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A frame holding only the columns of ``df`` that have no NaN/None.
    """
    without_incomplete_columns = df.dropna(axis="columns")
    return without_incomplete_columns

def remove_rows(df: pd.DataFrame):
    """Return a new frame without the rows that contain missing values.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A frame holding only the rows of ``df`` that have no NaN/None.
    """
    without_incomplete_rows = df.dropna(axis="index")
    return without_incomplete_rows

def use_custom_value(df: pd.DataFrame, fill_value: Union[int, str]):
    """Return a new frame with every missing value replaced by ``fill_value``.

    Args:
        df: Input frame; it is not modified.
        fill_value: Replacement written into each missing cell.

    Returns:
        The filled frame.
    """
    filled = df.fillna(fill_value)
    return filled

def use_next_value(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with missing values filled from the next valid row.

    Uses ``DataFrame.bfill`` rather than ``fillna(method="bfill")``: the
    ``method`` argument of ``fillna`` is deprecated in pandas.

    Args:
        df: Input frame; it is not modified.

    Returns:
        The back-filled frame. Missing values after the last valid entry of
        a column have nothing to copy from and stay missing.
    """
    return df.bfill()

def use_prev_value(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with missing values filled from the previous valid row.

    Uses ``DataFrame.ffill`` rather than ``fillna(method="ffill")``: the
    ``method`` argument of ``fillna`` is deprecated in pandas.

    Args:
        df: Input frame; it is not modified.

    Returns:
        The forward-filled frame. Missing values before the first valid entry
        of a column have nothing to copy from and stay missing.
    """
    return df.ffill()
def clean_data(
    df: pd.DataFrame,
    method: str,
    fill_value: Optional[Union[str, int, float]] = None,
) -> pd.DataFrame:
    """Handle missing values in ``df`` using the chosen strategy.

    Args:
        df: Input frame; it is not modified.
        method: One of
            ``"remove_columns"`` - drop columns containing missing values,
            ``"remove_rows"`` - drop rows containing missing values,
            ``"user_value"`` - replace missing values with ``fill_value``,
            ``"next"`` - fill from the next valid row (back-fill),
            ``"previous"`` - fill from the previous valid row (forward-fill).
        fill_value: Replacement value; required when ``method`` is
            ``"user_value"`` and ignored otherwise.

    Returns:
        The cleaned frame.

    Raises:
        AssertionError: If ``method`` is ``"user_value"`` and no
            ``fill_value`` was given.
        NotImplementedError: If ``method`` is not one of the names above.
    """
    if method == "remove_columns":
        return df.dropna(axis=1)
    if method == "remove_rows":
        return df.dropna(axis=0)
    if method == "user_value":
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O`` while callers still see the same exception type.
        if fill_value is None:
            raise AssertionError("Provide value for replacement!")
        return df.fillna(value=fill_value)
    if method == "next":
        # ``fillna(method=...)`` is deprecated; ``bfill`` is the replacement.
        return df.bfill()
    if method == "previous":
        return df.ffill()
    raise NotImplementedError(f"Unknown cleaning method: {method!r}")

## Splitting the data

In [18]:
def split_data(
    df: pd.DataFrame,
    test_split: float = 0.1,
    target_variable: Optional[Union[int, str]] = None,
):
    """Shuffle ``df`` and split it into train and test partitions.

    The shuffle uses a fixed seed (``random_state=42``), so repeated calls on
    the same frame give the same split. When ``target_variable`` is given, the
    shuffled rows are grouped by that column and each group is split with the
    same ratio, so every class appears in both partitions in roughly the
    original proportion.

    Args:
        df: Input frame; it is not modified.
        test_split: Fraction of rows assigned to the test partition; must be
            below 1.
        target_variable: Optional column to stratify on.

    Returns:
        A ``(df_train, df_test)`` tuple of frames.

    Raises:
        AssertionError: If ``test_split`` is not below 1.
    """
    assert test_split < 1
    train_split = 1 - test_split
    df_sample = df.sample(frac=1, random_state=42)

    # Positional slicing replaces ``np.split``: splitting a DataFrame through
    # NumPy goes via ``DataFrame.swapaxes``, which pandas has deprecated.
    if target_variable is None:
        cut = int(train_split * len(df_sample))
        return df_sample.iloc[:cut], df_sample.iloc[cut:]

    train_parts = []
    test_parts = []
    for _, group in df_sample.groupby(target_variable):
        cut = int(train_split * len(group))
        train_parts.append(group.iloc[:cut])
        test_parts.append(group.iloc[cut:])

    # ``pd.concat`` rejects an empty list, so handle a frame with no groups.
    if not train_parts:
        return df_sample.iloc[:0], df_sample.iloc[:0]

    return pd.concat(train_parts), pd.concat(test_parts)

In [106]:
def split_time_series(
    df: pd.DataFrame,
    target_variable: str,
    test_split: float = 0.1,
):
    """Split ``df`` into train and test partitions without shuffling.

    Rows are sorted by ``target_variable``; the leading rows form the training
    partition and the remaining rows form the test partition.

    Args:
        df: Input frame; it is not modified.
        target_variable: Column the rows are sorted by before splitting.
        test_split: Fraction of rows for the test partition; must be below 1.

    Returns:
        A ``(df_train, df_test)`` tuple of frames.
    """
    assert test_split < 1
    # The training size is rounded up, so any fractional row goes to train.
    cutoff = int(np.ceil((1 - test_split) * len(df)))
    ordered = df.sort_values(by=target_variable)
    head = ordered.iloc[:cutoff, :]
    tail = ordered.iloc[cutoff:, :]
    return head, tail

## Scaling the data

In [73]:
# Small constant added to the standard deviation in standard_scaling so that
# zero-variance inputs do not cause a division by zero.
eps = 1e-8


def min_max_scaling(x: np.ndarray, axis: Optional[int] = None):
    """Rescale ``x`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        x: Values to scale.
        axis: Axis along which the minimum and range are computed. The
            default ``None`` uses the global minimum and range over all
            elements; pass ``axis=0`` to scale each column independently.

    Returns:
        An array with the same shape as ``x``. Where the range is zero
        (constant input) the result is 0 instead of NaN.
    """
    # ``keepdims`` keeps the reduced axis so the statistics broadcast against x.
    lowest = np.min(x, axis=axis, keepdims=True)
    value_range = np.max(x, axis=axis, keepdims=True) - lowest
    # A zero range would give 0/0; dividing by 1 maps constant input to 0.
    safe_range = np.where(value_range == 0, 1, value_range)
    return (x - lowest) / safe_range


def max_abs_scaling(x: np.ndarray):
    """Scale ``x`` by the largest absolute value along axis 0.

    After scaling, every value lies in ``[-1, 1]``. The absolute value is
    taken before the maximum so that inputs whose largest magnitude is
    negative are scaled correctly.

    Args:
        x: Values to scale; for 2-D input each column is scaled on its own.

    Returns:
        An array with the same shape as ``x``. Columns that are entirely
        zero are returned unchanged instead of producing NaN.
    """
    largest_magnitude = np.max(np.abs(x), axis=0)
    # Avoid 0/0 for all-zero columns: dividing by 1 leaves them as zeros.
    safe_scale = np.where(largest_magnitude == 0, 1, largest_magnitude)
    return x / safe_scale


def standard_scaling(x: np.ndarray):
    """Standardize ``x`` along axis 0: subtract the mean, divide by the std.

    Args:
        x: Values to scale.

    Returns:
        The centred values divided by ``std + eps``; the module-level ``eps``
        keeps the denominator non-zero.
    """
    centred = x - x.mean(axis=0)
    spread = x.std(axis=0) + eps
    return centred / spread


def unit_vector_scaling(x: np.ndarray):
    """Scale each row of a 2-D array to unit Euclidean (L2) length.

    Args:
        x: 2-D array whose rows are the vectors to normalize.

    Returns:
        An array with the same shape as ``x`` in which every non-zero row has
        L2 norm 1. All-zero rows are returned unchanged instead of as NaN.
    """
    # ``keepdims`` gives an (n_rows, 1) column that broadcasts across each row.
    row_norms = np.linalg.norm(x, ord=2, axis=1, keepdims=True)
    # A zero vector has no direction; divide by 1 so it stays all zeros.
    safe_norms = np.where(row_norms == 0, 1, row_norms)
    return x / safe_norms

## Calculating metrics

In [27]:
def generate_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray):
    """Build a confusion matrix with predicted classes on the rows.

    Entry ``[i, j]`` counts the samples predicted as class ``i`` whose true
    class is ``j``. The classes are the sorted distinct labels found in
    ``y_true`` and ``y_pred`` together, so a class that occurs only in the
    predictions still gets its own row and column. For integer labels
    ``0..k-1`` the row/column index equals the label.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        A float array of shape ``(num_classes, num_classes)``.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(y_pred) == len(y_true)
    true_labels = np.asarray(y_true).ravel()
    pred_labels = np.asarray(y_pred).ravel()

    # Encode both label sets against one shared, sorted vocabulary so the
    # matrix is large enough for every label that appears in either array.
    labels, codes = np.unique(
        np.concatenate([true_labels, pred_labels]), return_inverse=True
    )
    codes = np.ravel(codes)
    true_idx = codes[: true_labels.size]
    pred_idx = codes[true_labels.size:]

    confusion_matrix = np.zeros((labels.size, labels.size))
    # ``np.add.at`` accumulates repeated (row, col) pairs; plain fancy-index
    # ``+=`` would count each distinct pair only once.
    np.add.at(confusion_matrix, (pred_idx, true_idx), 1)
    return confusion_matrix

In [148]:
def precision(y_true: np.ndarray, y_pred: np.ndarray, average: str = "micro"):
    """Compute multi-class precision from the confusion matrix.

    The confusion matrix has predicted classes on its rows, so a row sum is
    the number of samples predicted as that class.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.
        average: ``"macro"`` averages the per-class precision with equal
            weight per class; ``"micro"`` divides the total number of correct
            predictions by the total number of samples.

    Returns:
        The precision as a float. A class that is never predicted contributes
        0 to the macro average instead of NaN; empty input yields 0.0.

    Raises:
        NotImplementedError: If ``average`` is neither ``"macro"`` nor
            ``"micro"`` (previously this silently returned 0).
    """
    confusion_matrix = generate_confusion_matrix(y_true, y_pred)
    true_positives = np.diag(confusion_matrix)

    if average == "macro":
        predicted_totals = confusion_matrix.sum(axis=1)
        # Divide only where the class was predicted at least once; the
        # pre-filled zeros stay in place for the remaining classes.
        per_class = np.divide(
            true_positives,
            predicted_totals,
            out=np.zeros_like(true_positives, dtype=float),
            where=predicted_totals != 0,
        )
        return float(per_class.mean()) if per_class.size else 0.0

    if average == "micro":
        total = confusion_matrix.sum()
        return float(true_positives.sum() / total) if total else 0.0

    raise NotImplementedError(f"Unsupported average: {average!r}")


def recall(y_true: np.ndarray, y_pred: np.ndarray, average: str = "micro"):
    """Compute multi-class recall from the confusion matrix.

    The confusion matrix has true classes on its columns, so a column sum is
    the number of samples that really belong to that class.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.
        average: ``"macro"`` averages the per-class recall with equal weight
            per class; ``"micro"`` divides the total number of correct
            predictions by the total number of samples.

    Returns:
        The recall as a float. A class with no true samples contributes 0 to
        the macro average instead of NaN; empty input yields 0.0.

    Raises:
        NotImplementedError: If ``average`` is neither ``"macro"`` nor
            ``"micro"`` (previously this silently returned 0).
    """
    confusion_matrix = generate_confusion_matrix(y_true, y_pred)
    true_positives = np.diag(confusion_matrix)

    if average == "macro":
        actual_totals = confusion_matrix.sum(axis=0)
        # Divide only where the class has at least one true sample; the
        # pre-filled zeros stay in place for the remaining classes.
        per_class = np.divide(
            true_positives,
            actual_totals,
            out=np.zeros_like(true_positives, dtype=float),
            where=actual_totals != 0,
        )
        return float(per_class.mean()) if per_class.size else 0.0

    if average == "micro":
        total = confusion_matrix.sum()
        return float(true_positives.sum() / total) if total else 0.0

    raise NotImplementedError(f"Unsupported average: {average!r}")

In [112]:
def MSE(y_true: np.ndarray, y_pred: np.ndarray):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        The mean of the squared element-wise differences.
    """
    residuals = np.subtract(y_true, y_pred)
    squared_residuals = np.square(residuals)
    return squared_residuals.mean()


def RMSE(y_true: np.ndarray, y_pred: np.ndarray):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        The square root of ``MSE(y_true, y_pred)``.
    """
    mean_squared_error = MSE(y_true, y_pred)
    return np.sqrt(mean_squared_error)


def MAE(y_true: np.ndarray, y_pred: np.ndarray):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        The mean of the absolute element-wise differences.
    """
    residuals = np.subtract(y_true, y_pred)
    absolute_residuals = np.abs(residuals)
    return absolute_residuals.mean()