In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Load the dotenv extension and run its %dotenv magic to read variables from a
# .env file into the process environment.
# NOTE(review): presumably this is where INPUT_DATA_DIR / OUTPUT_DATA_DIR are
# defined — confirm against the project's .env file.
%load_ext dotenv
%dotenv

In [None]:
import os
import math
from typing import List, Tuple

import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns
import pygwalker as pyg

## constants and environment variables

In [None]:
# File name that load_data() reads when the caller does not pass one.
DEFAULT_INPUT_FILE = 'example_data.csv'
# Fraction of the row count used to derive the decision tree's
# min_samples_leaf (min_samples = ceil(len(df) * MIN_COMP_RATIO)).
MIN_COMP_RATIO = 0.05

In [None]:
# Data directories are taken from the process environment. Fail fast, with a
# message naming the missing variable, instead of surfacing later as a
# confusing path error inside os.path.join().
INPUT_DATA_DIR = os.environ.get('INPUT_DATA_DIR')
OUTPUT_DATA_DIR = os.environ.get('OUTPUT_DATA_DIR')
assert INPUT_DATA_DIR is not None, (
    'INPUT_DATA_DIR is not set; define it in the environment or in the .env file'
)
assert OUTPUT_DATA_DIR is not None, (
    'OUTPUT_DATA_DIR is not set; define it in the environment or in the .env file'
)

## functions

In [None]:
def load_data(file_name: str=DEFAULT_INPUT_FILE):
    """Read a CSV file from the input data directory.

    Args:
        file_name: Name of the file inside ``INPUT_DATA_DIR``. Defaults to
            ``DEFAULT_INPUT_FILE``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(os.path.join(INPUT_DATA_DIR, file_name))

In [None]:
def round_nice(number: float, n_digits=1) -> float:
    """Round ``number`` to the nearest "nice" value of the form {1, 2, 5} x 10^k.

    The mantissa of ``number`` (its value scaled into [1, 10)) is snapped to
    whichever of 1, 2, 5 or 10 is closest, using the midpoints 1.5, 3.5 and
    7.5 as boundaries. Snapping to 10 yields 1 x 10^(k+1), so e.g. 9 becomes
    10 rather than 5.

    Args:
        number: Value to round. Zero is returned as 0.0; negative values are
            rounded by magnitude and keep their sign.
        n_digits: Currently unused; kept so existing callers keep working.

    Returns:
        The rounded value as a float.
    """
    # log10 is undefined for 0 and negative numbers, so handle them up front.
    if number == 0:
        return 0.0
    if number < 0:
        return -round_nice(-number, n_digits)

    exponent = math.floor(math.log10(number))
    base = 10.0 ** exponent
    factor = number / base  # mantissa, normally in [1, 10)

    # Decide which of 1, 2, 5 (or the next decade's 1) the mantissa is closest to.
    if factor < 1.5:
        multiplier = 1
    elif factor < 3.5:
        multiplier = 2
    elif factor < 7.5:
        multiplier = 5
    else:
        multiplier = 10
    return float(multiplier * base)

In [None]:
def calculate_thresholds(df: pd.DataFrame, col: str, nice_round: bool) -> List[float]:
    """Compute bin thresholds for ``df[col]``, expressed in the column's own units.

    The number of bins ``k`` follows Sturges' formula on the count of non-null
    values. Missing values are ignored throughout.

    Args:
        df: Frame containing the column.
        col: Name of the column to derive thresholds for.
        nice_round: If True, snap every interior cut point to a 1/2/5 x 10^k
            value via ``round_nice`` and drop duplicates created by rounding.

    Returns:
        - ``[]`` when the column has at most one distinct non-null value
          (including an empty or all-NaN column);
        - the sorted distinct values when there are no more than ``k`` of them;
        - otherwise the sorted interior quantile edges (the minimum and
          maximum edges are excluded), optionally nice-rounded.
    """
    values = df[col].dropna()
    unique_values = values.unique().tolist()
    # Checked before Sturges' formula so that an empty column never reaches log2(0).
    if len(unique_values) <= 1:
        return []

    n = len(values)
    k = int(1 + math.log2(n))  # Sturges' formula
    if len(unique_values) <= k:
        return sorted(unique_values)

    # Bin the values themselves (not their ranks) so the edges come out in the
    # column's units. duplicates='drop' collapses repeated quantile edges, and
    # retbins=True returns the exact edges instead of the rounded interval labels.
    _, edges = pd.qcut(values, k, retbins=True, duplicates='drop')
    cut_points = [float(edge) for edge in edges[1:-1]]  # drop the min and max edges

    if nice_round:
        # Round to a 1, 2 or 5 multiple with one significant digit. Only the
        # magnitude is passed to round_nice and the sign is restored afterwards,
        # so zero and negative cut points are handled here.
        rounded = set()
        for point in cut_points:
            if point == 0:
                rounded.add(0.0)
            else:
                rounded.add(math.copysign(round_nice(abs(point)), point))
        cut_points = sorted(rounded)

    return cut_points

In [None]:
def merge_records(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Placeholder for merging adjacent records that have adjacent values in
    ``col`` and the same predicted result.

    At present only the nice-rounded thresholds for ``col`` are computed; they
    are not used yet and ``df`` is returned unchanged.
    """
    # TODO: use the thresholds to actually merge records; for now they are discarded.
    calculate_thresholds(df, col, nice_round=True)
    return df

In [None]:
# for development
# Scratch cell: runs merge_records() on the last column of the default input file.
df = load_data()
col = df.columns[-1]  # last column of the loaded frame
print(col)
df[col].hist(bins=20)  # quick look at the column's distribution

# NOTE: this rebinds the notebook-level `df`; the main section reloads it with load_data().
df = merge_records(df, col)

## main

In [None]:
# main
# Load the default input file afresh; this rebinds `df`, replacing whatever the
# development cell left in it.
df = load_data()

In [None]:
# sns.pairplot(df)

In [None]:
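# Interactive exploration with pygwalker (disabled). reset_index() turns the row index into a column so it can be placed in the Details field.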
# pyg.walk(df.reset_index())

In [None]:
# The first column is the regression target; every other column is a feature.
target_col = df.columns[0]
feature_cols = df.drop(columns=target_col).columns.tolist()

# Minimum leaf size: MIN_COMP_RATIO of the row count, rounded up.
min_samples = math.ceil(MIN_COMP_RATIO * len(df))

# Build the list of every unordered pair of features, in column order.
feature_col_pairs = [
    [first, second]
    for pos, first in enumerate(feature_cols)
    for second in feature_cols[pos + 1:]
]

In [None]:
# Fit one decision tree per feature pair, report its fit quality and plot the
# predictions over the two features.
# NOTE(review): only the first two pairs are processed ([:2]) — this looks like
# a development shortcut; confirm before relying on the results.
y = df[target_col]  # the target is the same for every pair, so select it once

for feature_col_pair in feature_col_pairs[:2]:
    X = df[feature_col_pair]

    # random_state pins the tie-breaking between equally good splits so that a
    # Restart & Run All reproduces the same tree.
    model = DecisionTreeRegressor(
        min_samples_leaf=min_samples,
        min_impurity_decrease=0,
        random_state=0,
    )
    model.fit(X, y)
    y_pred = model.predict(X)

    df_pred = X.copy()
    df_pred[target_col] = y
    df_pred[f'{target_col}_pred'] = y_pred

    # assessment: for a regressor, score() is the coefficient of determination
    # (R^2), here measured on the training data itself. The variable keeps its
    # original name in case other cells read it.
    accuracy = model.score(X, y)
    print(f'{feature_col_pair}: R^2 = {accuracy:.4f}')

    ax = df_pred.plot.scatter(
        x=feature_col_pair[0],
        y=feature_col_pair[1],
        c=f'{target_col}_pred',
        colormap='viridis',
    )
    ax.set_title(f'{target_col}_pred by {feature_col_pair[0]} / {feature_col_pair[1]} (R^2 = {accuracy:.3f})')