In [33]:
!pip install lightgbm
!pip install scikit-optimize

[0mDefaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable


In [60]:
from typing import List, Union, Tuple
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from skopt import BayesSearchCV
from skopt.space.space import Integer
import numpy as np
import pickle

In [44]:

class AlignedSentence:
    """A sentence together with the bounds of the interval it is aligned to.

    The constructor stores its three arguments unchanged as the attributes
    ``sentence``, ``start_time`` and ``end_time``. Either bound may be
    ``None`` instead of a float.
    """

    def __init__(self, sentence: str, start_time: Union[float, None], end_time: Union[float, None]):
        # Plain value holder: no validation or conversion happens here.
        self.sentence, self.start_time, self.end_time = sentence, start_time, end_time

def read_sentence_alignment(path_to_alignment) -> List[AlignedSentence]:
    """Read a tab-separated sentence alignment file.

    Every non-blank line must look like ``start_time<TAB>end_time<TAB>sentence``.
    A negative start or end time is stored as ``None``. Blank lines are
    skipped, and a line whose sentence field is empty yields an entry with an
    empty sentence.

    A warning is printed when an interval with both bounds present starts
    before the end time of the entry read immediately before it; if that
    previous entry had no end time, no comparison is made.

    Args:
        path_to_alignment: Path of the UTF-8 encoded alignment file.

    Returns:
        One ``AlignedSentence`` per non-blank line, in file order.

    Raises:
        ValueError: If a line has fewer than two tab-separated fields or a
            time field cannot be parsed as a float.
        AssertionError: If both times are present and the start time is
            greater than the end time.
    """
    alignment = []
    last_end_time = None
    with open(path_to_alignment, encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. extra newlines at the end of the file) hold no entry.
                continue
            fields = stripped.split('\t', maxsplit=2)
            if len(fields) == 2:
                # strip() removed the trailing tab that preceded an empty sentence.
                fields.append('')
            start_time, end_time, sentence = fields
            start_time = float(start_time)
            end_time = float(end_time)
            # Negative values are the file's marker for a missing bound.
            if start_time < 0:
                start_time = None
            if end_time < 0:
                end_time = None
            if start_time is not None and end_time is not None:
                assert start_time <= end_time, f'start_time={start_time}, end_time={end_time}'
                if last_end_time is not None and start_time < last_end_time:
                    print(f'WARNING: overlapping intervals, start_time={start_time} < last_end_time={last_end_time}')
            # Updated on every entry, so a missing end time resets the overlap check.
            last_end_time = end_time

            alignment.append(AlignedSentence(sentence, start_time, end_time))

    return alignment

In [45]:
# Load the two sentence alignments that are compared below; both files are
# parsed with read_sentence_alignment.
true_alignments = read_sentence_alignment("../data/true_timestamps_0224_without_pauses.txt")
estimated_alignments = read_sentence_alignment("../data/estimated_timestamps_0224.txt")

In [46]:
def calculate_ious(sentence_alignment_1: List[AlignedSentence], sentence_alignment_2: List[AlignedSentence]):
    """Compute the intersection-over-union of positionally paired intervals.

    The two alignments are paired with ``zip``, so the result has as many
    entries as the shorter input.

    Args:
        sentence_alignment_1: First alignment; each item exposes
            ``start_time`` and ``end_time``.
        sentence_alignment_2: Second alignment, paired by position with the
            first.

    Returns:
        A list with one entry per pair: the IoU of the two intervals, or
        ``None`` when any of the four interval bounds is ``None``.

    Raises:
        AssertionError: If the span covered by a pair of intervals is not
            strictly positive.
    """
    ious = []
    for aligned_sentence_1, aligned_sentence_2 in zip(sentence_alignment_1, sentence_alignment_2):
        start_time_1 = aligned_sentence_1.start_time
        start_time_2 = aligned_sentence_2.start_time
        end_time_1 = aligned_sentence_1.end_time
        end_time_2 = aligned_sentence_2.end_time
        bounds = (start_time_1, start_time_2, end_time_1, end_time_2)
        if any(bound is None for bound in bounds):
            # A start or end time can be missing independently of the other;
            # without all four bounds the IoU is undefined.
            ious.append(None)
            continue

        intersection = min(end_time_1, end_time_2) - max(start_time_1, start_time_2)
        # Disjoint intervals give a negative overlap; clamp it to zero.
        intersection = 0.0 if intersection < 0.0 else intersection
        union = max(end_time_1, end_time_2) - min(start_time_1, start_time_2)
        assert union > 0.0
        ious.append(intersection / union)

    return ious

In [47]:
# Spot-check: start time of the third entry of the estimated alignment.
print(estimated_alignments[2].start_time)

18.21


In [48]:
# IoU between each true interval and the estimated interval at the same position.
ious = calculate_ious(true_alignments, estimated_alignments)

In [49]:
# Dump every IoU; None marks pairs for which no IoU was computed.
print(ious)

[0.06, 0.9123997532387415, 0.14234234234234217, 0.693099273607748, 0.3573487031700285, 0.5153284671532847, 0.7027363184079601, 0.7299630086313196, 0.15451388888888923, 0.7369439071566738, 0.6522043386983913, 0.3137614678899104, 0.8975054229934931, 0.7185104052573938, 0.5304990757855834, 0.8235604860010564, 0.7103873239436623, 0.6295793758480341, 0.8456642800318219, 0.6328571428571438, 0.34482758620689585, 0.705679862306367, 0.6889848812095034, 0.668218859138534, 0.790432801822324, 0.3860759493670905, 0.7788235294117636, 0.5697211155378445, 0.9124087591240846, 0.7962577962577956, 0.7069645203679351, 0.5092807424593949, 0.0, 0.5751004016064283, 0.6132075471698115, 0.3607305936073027, 0.9037499999999987, 0.8403361344537751, 0.690624999999996, 0.5389221556886302, 0.7316666666666739, 0.6024096385542181, 0.43981481481480444, 0.8929159802306414, 0.7940379403794061, 0.8619631901840522, 0.6247755834829484, 0.37931034482758985, 0.593085106382985, 0.06504065040650657, 0.3179634966378477, 0.179276

In [50]:
# Feature table used as model input below (paired with the IoU targets).
X_features = pd.read_csv("../data/X_features.csv")

In [51]:
# Inspect the loaded feature table (column names, value ranges, row count).
print(X_features)

     confidence     score  length_ratio  chars_per_second
0      0.843409 -0.455796      1.562500         11.961722
1      0.943141 -0.389824      1.106977         14.875000
2      0.834532 -0.773286      0.517241          3.456221
3      0.851718 -0.648332      0.566667          3.932151
4      0.873994 -0.469443      1.073171          8.118081
..          ...       ...           ...               ...
283    0.966948 -0.160701      0.930556         13.188976
284    0.946148 -0.373308      1.129870         18.510638
285    0.954532 -0.431939      1.168142         15.612064
286    0.861879 -0.380734      1.340426         20.160000
287    0.864784 -0.791969      0.666667          0.339367

[288 rows x 4 columns]


In [52]:
# The regression targets are the IoUs. The model needs exactly one numeric
# target per feature row, so fail early with a clear message if the two are
# misaligned or if an IoU could not be computed (None).
assert len(ious) == len(X_features), (
    f'{len(ious)} IoU targets but {len(X_features)} feature rows'
)
assert all(iou is not None for iou in ious), 'IoU targets contain None entries'
y_target = np.asarray(ious, dtype=float)

In [53]:
def scorer(estimator, X, y):
    """Score ``estimator`` on ``(X, y)`` as the negated mean absolute error.

    The estimator's predictions are squeezed and clipped to ``[0.0, 1.0]``
    before being compared with ``y``. The error is negated so that a larger
    score corresponds to a smaller error.
    """
    raw_predictions = estimator.predict(X)
    clipped_predictions = np.clip(np.squeeze(raw_predictions), 0.0, 1.0)
    error = mean_absolute_error(y, clipped_predictions)
    return -error

# Bayesian hyper-parameter search for an LGBMRegressor. Each of the three
# searched parameters is an integer range with a base-2 log-uniform prior.
opt = BayesSearchCV(
    LGBMRegressor(),
    {
        'num_leaves': Integer(2, 128, prior='log-uniform', base=2),
        'min_child_samples': Integer(2, 512, prior='log-uniform', base=2),
        'max_bin': Integer(2, 8192, prior='log-uniform', base=2),
    },
    n_iter=60,
    optimizer_kwargs={'n_initial_points': 20, 'base_estimator': 'GP'},
    scoring=scorer,  # negated MAE on clipped predictions
    cv=3,
    refit=False,  # only the best parameters are read from the search afterwards
    random_state=42,
    return_train_score=True,
)

In [54]:
# Run the hyper-parameter search (60 iterations, 3-fold cross-validation).
opt.fit(X_features, y_target)

In [55]:
# Report the parameter combination selected by the search.
print(f'Found hyperparams {opt.best_params_}')

Found hyperparams OrderedDict([('max_bin', 98), ('min_child_samples', 3), ('num_leaves', 5)])


In [56]:
# Build a fresh regressor from the selected parameters (the search was created with refit=False).
estimator = LGBMRegressor(**opt.best_params_)

In [57]:
# Train the final model on the complete feature table and all targets.
estimator.fit(X_features, y_target)

In [58]:
# In-sample predictions: these are made on the same rows the model was fitted on.
preds = estimator.predict(X_features)

In [59]:
# Sanity check: median of the in-sample predictions.
print(np.median(preds))

0.6964208788151437


In [61]:
# Persist the fitted estimator. The context manager closes the file handle
# deterministically so the pickle is fully flushed to disk; a bare open()
# call would leave closing to the garbage collector.
with open("iou_estimator.pkl", "wb") as model_file:
    pickle.dump(estimator, model_file)