# Finding Kermit
Training: Episode 02-01-01  
Test: Episode 02-04-04  
Validation: Episode 03-04-03  

## Video

In [34]:
# GET PREDICTIONS

## Audio

In [164]:
# DATA
train_data = pd.read_csv('../data/ep1_flat_mfcc.csv', sep=',', dtype=np.float64)
train_target = pd.read_csv('../data/gt/gt_02_01_01.csv', sep=',').head(len(train_data)).kermit_audio
test_data = pd.read_csv('../data/ep2_flat_mfcc.csv', sep=',', dtype=np.float64)
test_target = pd.read_csv('../data/gt/gt_02_04_04.csv.csv', na_values=[None, ' ', '']).fillna(0).head(len(test_data)).kermit_audio

In [41]:
# MODEL
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# Pipeline as exported from TPOT
pipeline = make_pipeline(
    StackingEstimator(estimator=BernoulliNB(alpha=0.001, fit_prior=False)),
    RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.2, min_samples_leaf=8, min_samples_split=4, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(pipeline.steps, 'random_state', 42)

pipeline.fit(train_data, train_target)

Pipeline(steps=[('stackingestimator',
                 StackingEstimator(estimator=BernoulliNB(alpha=0.001,
                                                         fit_prior=False))),
                ('randomforestclassifier',
                 RandomForestClassifier(max_features=0.2, min_samples_leaf=8,
                                        min_samples_split=4,
                                        random_state=42))])

In [168]:
# EVALUATE AUDIO PREDICTIONS ALONE
from sklearn.metrics import precision_score

# predict training data
print(precision_score(train_target, pipeline.predict(train_data)))

# predict test data
print(precision_score(test_target, pipeline.predict(test_data)))

1.0
0.8235294117647058


# Combined

In [166]:
# Load truth for episode 3
val_target = pd.read_csv('../data/gt/gt_03_04_03.csv', sep=',')

# Video predictions
video_predictions = np.round(np.random.random((len(train_target),)))

# Audio predictions
val_audio = pd.read_csv('../data/ep3_flat_mfcc.csv', sep=',', dtype=np.float64)
audio_predictions = pipeline.predict(val_audio)

In [173]:
# adjust lengths
min_len = min([len(val_target), len(video_predictions), len(audio_predictions)])
val_target = val_target.head(min_len)
video_predictions = video_predictions[:min_len]
audio_predictions = audio_predictions[:min_len]

In [178]:
val_target

Unnamed: 0,Min,Sec,kermit_video,kermit_audio,ws_video,ws_audio
0,0,0,0,0,0,0
1,0,1,0,0,0,0
2,0,2,0,0,0,0
3,0,3,0,0,0,0
4,0,4,0,0,0,0
...,...,...,...,...,...,...
1533,25,33,0,0,0,0
1534,25,34,0,0,0,0
1535,25,35,0,0,0,0
1536,25,36,0,0,0,0


In [204]:
# combine truths
or_truth = [val_target.kermit_video[i] or val_target.kermit_audio[i] for i in range(min_len)]
and_truth = [val_target.kermit_video[i] and val_target.kermit_audio[i] for i in range(min_len)]

# combine predictions
or_predictions = [video_predictions[i] or audio_predictions[i] for i in range(min_len)]
and_predictions = [video_predictions[i] and audio_predictions[i] for i in range(min_len)]

In [212]:
# OR truth
precision_score(or_truth, or_predictions)

0.4553686934023286

In [211]:
# AND truth
precision_score(and_truth, and_predictions)

1.0