In [1]:
import sys
sys.path.insert(0, "../")

In [2]:
from data_loading import ChatHighlightData
import numpy as np
from datetime import datetime
import json

from utils import moving_avg

from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from baselines.ScipyPeaks import ScipyPeaks
from baselines.RealTimePeakPredictor import RealTimePeakPredictor

## Parameters for Preprocessing

### Chat measures
```['message_density', 'average_message_lengths_chars', 'message_diversity', 'emote_density', 'copypasta_density']```
### Window size
Number of frames to consider around the current frame
### Scale / Step ```k```
Which frames to consider: each ```k```th frame

This one has an influence for both, x and y

In [3]:
def load_data(mode, ch_dir, hl_dir, em_dir, param_grid):
    if mode == "small":
        file_identifier = "nalcs_w1*_g1"
    if mode == "train":
        file_identifier = "nalcs_w[134579]*_g[13]"
    if mode == "test":
        file_identifier = "nalcs_w[268]*_g[13]"

    chd = ChatHighlightData(chat_dir=ch_dir, highlight_dir=hl_dir, emote_dir=em_dir, frame_rate=30)
    chd.load_data(file_identifier=file_identifier)
    chd.load_emotes()

    pg = list(ParameterGrid(param_grid))

    for i, params in enumerate(pg):
        chd.set_window_step(window=params["window"], step=params["step"])

        x_data = chd.get_chat_measure(params["measure"])
        y_data = chd.get_highlight_data()

        x = np.empty(0)
        y = np.empty(0)
        for m in x_data.keys():
            x = np.concatenate([x, x_data[m]])
            y = np.concatenate([y, y_data[m]])
        yield i, x, y, params

In [4]:
def param_search(prep_param_grid, eval_params, model, data_loader):
    best_scores_params = list()

    print(f"{datetime.now().strftime('%Y%m%d_%H_%M_%S')}")

    for i,x,y,p in data_loader:
        pl = Pipeline([("avg", FunctionTransformer(moving_avg)),
                       ("scaler", MinMaxScaler()),
                       ("clf", model())
                     ])
        
        gs = GridSearchCV(pl, eval_params, cv=5, n_jobs=4, scoring=["f1"], refit="f1", verbose=1)
        gs.fit(x.reshape((-1,1)),y)

        best_scores_params.append({
            "best_params": gs.best_params_,
            "best_score": gs.best_score_,
            "prep_params": p
        })
        if i%30 == 0:
            print(f"{datetime.now().strftime('%Y%m%d_%H_%M_%S')}: evaluated {i} configurations")
            with open(f"../data/analysis/baselines/grid_search/GridSearchCV_{type(model()).__name__}_{datetime.now().strftime('%Y%m%d_%H_%M_%S')}_PART_{i}.json", "w") as out_file:
                json.dump(best_scores_params, out_file)

    print(f"{datetime.now().strftime('%Y%m%d_%H_%M_%S')}")
    with open(f"../data/analysis/baselines/grid_search/GridSearchCV_{type(model()).__name__}_{datetime.now().strftime('%Y%m%d_%H_%M_%S')}.json", "w") as out_file:
        json.dump(best_scores_params, out_file)

In [24]:
prep_param_grid = {
        "measure": ["emote_density", "copypasta_density"],
        "window": list(range(50,201,25)),
        "step": list(range(20,101,20))
    }

In [25]:
eval_params_SPP = {
        "avg__kw_args": [
            {"N": 5},
            {"N": 50},
            {"N": 500},
        ],
        "clf__prominence": [0.5, 0.55, 0.6, 0.65, 0.7],
        "clf__width": [[5,2000]],
        "clf__rel_height": [0.4, 0.5, 0.6],
        "clf__shift": [0.25, 0.3, 0.35]
      }

In [26]:
eval_params_RTPP = {
    "avg__kw_args": [
        {"N": 5},
        {"N": 50},
        {"N": 500},
    ],
    "clf__lag": [25, 30, 35],
    "clf__threshold": [1,2],
    "clf__influence": [0.7]
  }

In [27]:
model_list = [ScipyPeaks, RealTimePeakPredictor]

In [30]:
model = ScipyPeaks
mode = "train"
dat = iter(load_data(mode, ch_dir="../data/final_data/", hl_dir="../data/gt/", em_dir="../data/emotes/", param_grid=prep_param_grid))

In [31]:
for i,x,y,p in dat:
    if x.dtype != np.float64:
        print(f"wrong datatype {x.dtype} at {i} with parameters\n{p}")
    if np.any(np.isinf(x)) or np.any(np.isnan(x)):
        print(f"problem with x at {i} with parameters\n{p}")
        print(x)
    if np.any(np.isinf(y)) or np.any(np.isnan(y)):
        print(f"problem with y at {i} with parameters\n{p}")
        print(y)     
              
    for N in [5, 50, 500]:
        x_avg = moving_avg(x, N=5)
        if x_avg.dtype != np.float64:
            print(f"wrong datatype {x_avg.dtype} at {i} with parameters\n{p} and moving average of {N}")
        if np.any(np.isinf(x_avg)) or np.any(np.isnan(x_avg)):
            print(f"problem with x_avg at {i} with parameters\n{p} and moving average of {N}")
            print(x_avg)
        x_avg_scld = MinMaxScaler().fit_transform(x_avg.reshape((-1,1)))
        if x_avg_scld.dtype != np.float64:
            print(f"wrong datatype {x_avg_scld.dtype} at {i} with parameters\n{p} and moving average of {N}")
        if np.any(np.isinf(x_avg)) or np.any(np.isnan(x_avg)):
            print(f"problem with x_avg_scld at {i} with parameters\n{p} and moving average of {N} and MinMaxScaling")
            print(x_avg_scld)

In [13]:
# param_search(prep_param_grid, eval_params_SPP, model=model, data_loader=dat)
# print(f"{datetime.now().strftime('%Y%m%d_%H_%M_%S')}")

20220330_12_34_21
Fitting 5 folds for each of 135 candidates, totalling 675 fits
20220330_12_34_40: evaluated 0 configurations
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 

In [None]:
best_scores_params

In [6]:
i,x,y,p = next(iter(load_data("train")))

In [7]:
moving_avg(x.reshape((-1,1))).shape

(361111, 1)

In [11]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [38]:
file_identifier = "nalcs_*"
chd = ChatHighlightData(chat_dir="../data/final_data/", highlight_dir="../data/gt/", emote_dir="../data/emotes/", frame_rate=30)
chd.load_data(file_identifier=file_identifier)
chd.load_emotes()

In [43]:
chd.set_window_step(step=100, window=200)
em_dens = chd.get_chat_measure("emote_density")

In [44]:
denses = np.concatenate([v for v in em_dens.values()])

In [45]:
np.any(np.isinf(denses))

False

In [46]:
np.any(np.isnan(denses))

False

## Best results for message density