In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [12]:
def read_analyzed_edge(full_path):
    temp_df = pd.read_csv(full_path)
    temp_df['period_score'] = (1 < temp_df['period']) | (temp_df['period'] < 4)
    temp_df['maxcov_score'] = temp_df['maxcov'] < .2
    temp_df['amp_score'] = (4 < temp_df['amp']) |(temp_df['amp'] < 20)
    temp_df['sum_score'] = temp_df.loc[:,['period_score','maxcov_score','amp_score']].astype(int).sum(axis=1)
    temp_df['total_score'] = temp_df['sum_score'].sum()
    return temp_df

In [13]:
DATA_DIR = 'analyzed_edges/'
files = [file for file in os.listdir(DATA_DIR) if file.startswith('edge_analysis') and file.endswith('.csv')]
df_dict = dict()
for file in files[:10]:
    full_path = os.path.join(DATA_DIR, file)
    date = file.replace('.csv','')
    temp_df = read_analyzed_edge(full_path)
    display(temp_df.head())
    df_dict[date] = temp_df

In [14]:
def read_tf_csv(csv_path='tflabels.csv'):
    df = pd.read_csv(csv_path)
    CROP_DIFS = (125, 844, 153, 454)
    def get_date(x):
        split_x = x.split('_')
        return split_x[1] if split_x[0] == 'spots' else split_x[0]
    
    df['filename'] = df['filename'].apply(get_date)
    df = df.rename(columns={'filename':'date'})
    wide_indices = df['width'] == 1000
    df.loc[wide_indices,['xmin','xmax']] -= CROP_DIFS[0]
#     df.loc[wide_indices,['ymin','ymax']] -= CROP_DIFS[2]
    df = df.drop(columns=['width','height','class','ymin','ymax']).set_index('date')
    df.index = pd.to_datetime(df.index)
    return df

tf_df = read_tf_csv()
display(tf_df)

Unnamed: 0_level_0,xmin,xmax
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-12-15,190,489
2021-07-16,287,629
2021-12-05,272,531
2019-12-23,194,502
2017-01-05,428,711
...,...,...
2017-01-07,195,590
2021-01-16,247,590
2017-11-21,342,647
2017-01-30,136,574


In [15]:
def preprocess_yolo_dir(parent_dir, img_width=719, csv_path='date_labels.csv'):
    full_csv_path = os.path.join(parent_dir, csv_path)
    if os.path.exists(full_csv_path):
        print(f'File {full_csv_path} already found, returning')
        truth_df = pd.read_csv(full_csv_path)
        return truth_df
    
    truth_df_cols = list()
    for file in os.listdir(parent_dir):
        date = file.split('_', 1)[0].replace('.txt','')
        new_name = os.path.join(parent_dir, date + '.txt')
        print(new_name)
        if not os.path.exists(new_name): os.rename(os.path.join(parent_dir, file), new_name)
        with open(new_name, 'r') as f:
            truth_arr = np.zeros(img_width, dtype=np.bool_)
            for line in f.readlines():
                if not line: continue
                print(line)
                arr = np.array([float(x) for x in line.split(' ')])
                assert arr[0] == 0
                arr = arr[::2] # grab only the x values
                xmin = int(np.amin(arr) * img_width)
                xmax = int(np.amax(arr) * img_width)
                truth_arr[xmin:xmax] = True
            truth_df_cols.append(pd.DataFrame(truth_arr, columns=date))
    truth_df = pd.concatenate(truth_df_cols, axis=1)
    truth_df.to_csv(full_csv_path)
    print(f'File {full_csv_path} created, returning')
    return truth_df

def preprocess_tf_csv(src_path='tflabels.csv', img_width=719, dest_path='date_labels.csv'):
    if os.path.exists(dest_path):
        print(f'File {dest_path} already found, returning')
        truth_df = pd.read_csv(dest_path)
        return truth_df
    tf_df = read_tf_csv(csv_path=src_path)
    truth_df_cols = list()
    for i, tup in tf_df.itertuples():
        date = tup.date
        truth_arr = np.zeros(img_width, dtype=np.bool_)
        xmin = tup.xmin
        xmax = tup.xmax
        truth_arr[xmin:xmax] = True
        truth_df_cols.append(pd.Series(truth_arr, index=date))
    truth_df = pd.concatenate(truth_df_cols, axis=1)
    truth_df.to_csv(full_csv_path)
    print(f'File {dest_path} created, returning')
    return truth_df

In [16]:
def window_label(arr, window_size=240, offset=10):
    labels = list()
    for i in range(0, len(arr) - window_size, offset):
        if np.count_nonzero(arr[i,i+window_size]) <= 0:
            labels.append(True)
        else:
            labels.append(False)
    return np.array(labels)

def concat_yolo_truth_values(truth_dir='sample_yolo_labels/', data_dir='analyzed_edges/', target_col='lstid_truth'):
    truth_df = preprocess_yolo_dir(truth_dir)
    files = [file for file in os.listdir(data_dir) if file.startswith('edge_analysis') and file.endswith('.csv')]
    df_list = list()
    for file in files[:10]:
        full_path = os.path.join(data_dir, file)
        date = file.replace('.csv','')
        temp_df = read_analyzed_edge(full_path)
        temp_df['date'] = date
        temp_df[target_col] = window_label(truth_df[date])
        temp_df.set_index('date', inplace=True)
        temp_df.index = pd.to_datetime(temp_df.index)
        df_list.append(temp_df)
    Xy = pd.concat(df_list, axis=0)
    return Xy.loc[:,Xy.columns != target_col], Xy.loc[:,Xy.columns == target_col]

def reduce_input_dim(Xy, target_col='lstid_truth'):
    row = dict()
    row['maxcov_per'] = np.count_nonzero(Xy['maxcov'] < .2)
    row['maxcov_avg'] = Xy['maxcov'].loc[Xy['maxcov'] < 1].mean()
    
    row['amp_per'] = np.count_nonzero(X['amp'] < 20)
    row['amp_avg'] = Xy['amp'].loc[Xy['amp'] < 100].mean()
    
    row['period_per'] = np.count_nonzero(Xy['period'] > 3)
    row['period_avg'] = Xy['period'].loc[Xy['period'] < 20].mean()
    
    row[target_col] = Xy[target_col].sum()
    Xy = pd.DataFrame(row)
    return Xy

def concat_tf_truth_values(csv_path='', data_dir='analyzed_edges/', target_col='lstid_truth', dim=2):
    truth_df = preprocess_tf_csv(src_path=csv_path)
    files = [file for file in os.listdir(data_dir) if file.startswith('edge_analysis') and file.endswith('.csv')]
    df_list = list()
    for file in files[:10]:
        full_path = os.path.join(data_dir, file)
        date = file.replace('.csv','')
        temp_df = read_analyzed_edge(full_path)
        temp_df['date'] = date
        temp_df[target_col] = window_label(truth_df[date])
        temp_df.set_index('date', inplace=True)
        temp_df.index = pd.to_datetime(temp_df.index)
        if dim == 1:
            df_list.append(reduce_input_dim(temp_df, target_col=target_col))
        elif dim==2:
            df_list.append(temp_df)
    Xy = pd.concat(df_list, axis=0)
    return Xy.loc[:,Xy.columns != target_col], Xy.loc[:,Xy.columns == target_col]

X, y = concat_tf_truth_values()
print(X.shape)
print(y.shape)

ValueError: No objects to concatenate

In [None]:
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_tree, plot_tree
from sklearn.ensemble import VotingClassifier, RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, average_precision, roc_auc
from sklearn.model_selection import train_test_split
from statsmodels.nonparametric.smoothers_lowess import lowess

In [None]:
def split_min_cols(X):
    X = X.loc[:,['period','maxcov','amp']]
    return X
    
def split_core_cols(X):
    X = X.drop(columns=['period_score','maxcov_score','amp_score','sum_score','total_score'])
    return X

def split_all_cols(X):
    X = X.drop(columns=['sum_score','total_score'])
    return X


    
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.2)
# model = LGBMClassifier()
scores = list()
for max_depth in range(1, 15):
    model = DecisionTreeClassifier(max_depth=max_depth, max_features=X_train.shape[1])
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    print('ROC', roc_auc(y_pred, y_test))
    plt.title(max_depth)
    plt.show()
    scores.append((max_depth, score))

x, y = zip(*scores)
sns.lineplot(x=x, y=y, palette='viridis')
plt.xlabel('Max Tree Depth')
plt.ylabel('Accuracy Score')
plt.show()

In [None]:
def print_decision_tree(model, save_path=''):
    text_repr = tree.export_text(model)
    print(text_repr)
    if save_path:
        with open(save_path, 'w') as f:
            f.write(text_repr)
    return text_repr

def plot_decision_tree(model, save_path='', dpi=300):
    plt.figure(figsize=(16,16), dpi=dpi)
    plot_tree(
        model, 
        feature_names=model.feature_names_in_, 
        class_names=['No LSTID','LSTID'], 
        filled=True
    )
    if save_path:
        plt.savefig(save_path, dpi=dpi)
    plt.show()
    return

def plot_importance(model, save_path='', dpi=dpi):
    plt.figure(figsize=(16,16), dpi=dpi)
    data = pd.DataFrame({
        'features' : model.feature_names_in_
        'importance' : model.feature_importances_,
    }).set_index('features')
    sns.barplot(data=data, y='importance', hue='importance', palette='flare')
    if save_path:
        plt.savefig(save_path, dpi=dpi)
    plt.show()
    return

print_decision_tree(model, save_path='dt_text.txt')
plot_decision_tree(model, save_path='dt_info.png')
plot_importance(model, save_path='dt_fimp.png')

# Decision Tree Information
[Decision Tree Function](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)  
[Decision Tree Documentation](https://scikit-learn.org/stable/modules/tree.html#minimal-cost-complexity-pruning)  
[Decision Tree Visualization Walkthrough](https://mljar.com/blog/visualize-decision-tree/)  
[TF Decision Forests](https://github.com/tensorflow/decision-forests)  
[Light Gradient Boosting Machine](https://lightgbm.readthedocs.io/en/v3.3.2/index.html)  
[Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)  
[Detailed Tree Visualization](https://github.com/parrt/dtreeviz)  

Corrections:
- DTs can handle multi-output, RF can with some libraries, gradient boosters cannot
- Confidence can easily be shown
    
Feature Sets:


Optimization Problem:
- Window-by-window:
    - Predict a single value for every window
    - Requires label for every window
    - More data available
    - More outliers by default
- Sequence-by-sequence:
    - Predict a single value for an entire set 
    - Requires feature summarization
    - Less data available
    - Better feature control
- Regression:
    - Determine value between 0 and 1
- Classification:
    - Determine class either 0 or 1