In [3]:
import sys
from pathlib import Path

import pandas as pd

sys.path.append('../')
from mongo_class import Mongo
from format_diarization import format_diarization

In [5]:
# Safety switch for the cell that regenerates the base train/test pickles:
# it only runs when this is True, so leave it False to keep the files on disk untouched.
overwrite: bool = False

## Load the DataFrames from MongoDB

In [6]:
# Load every collection this notebook needs into its own DataFrame.
with Mongo(db_name='annotations', collection_name='self-annotated-2') as annotations:
    annotations_pd = pd.DataFrame(list(annotations.collection.find()))
with Mongo(db_name='videos', collection_name='all') as video_data:
    data_pd = pd.DataFrame(list(video_data.collection.find()))
with Mongo(db_name='videos', collection_name='diarization') as diarizations:
    diarizations_pd = pd.DataFrame(list(diarizations.collection.find()))
with Mongo(db_name='annotations', collection_name='mturk-annotations') as mturk_annotations:
    mturk_pd = pd.DataFrame(list(mturk_annotations.collection.find()))

# Inner join: only videos that have a row in the self-annotated collection survive.
# Overlapping column names coming from the annotation side get a '_y' suffix.
fields_to_keep = ["Input_video_id", "Answer_is-a-call_most", "Answer_is-a-call_some", "Answer_is-a-call_none"]
data_pd = data_pd.merge(
    annotations_pd[fields_to_keep],
    how='inner',
    left_on='video_id',
    right_on='Input_video_id',
    suffixes=('', '_y'),
)


def _first_diarization_by_video(frame):
    """Map each video_id in ``frame`` to the first diarization stored for it."""
    lookup = {}
    for video_id, diarization in zip(frame['video_id'], frame['diarization']):
        # setdefault keeps the first occurrence, matching the old `.values[0]` pick.
        lookup.setdefault(video_id, diarization)
    return lookup


# Build the lookup once. The previous per-row check used `str.contains`, which is a
# regex *substring* test: an id that merely appears inside another id passed the check
# and then crashed on `.values[0]` because no exact row existed.
diarization_by_video = _first_diarization_by_video(diarizations_pd)


def _transcription_for(video_id):
    """Return the formatted diarization for ``video_id``, or ``pd.NA`` when none is stored."""
    if video_id in diarization_by_video:
        return format_diarization(diarization_by_video[video_id])
    return pd.NA


def _join_segment_text(transcription):
    """Concatenate the 'text' field of every transcription entry, passing ``pd.NA`` through."""
    if transcription is pd.NA:
        return pd.NA
    return ' '.join([segment['text'] for segment in transcription])


data_pd['transcription'] = data_pd['video_id'].apply(_transcription_for)
data_pd['transcription_block'] = data_pd['transcription'].apply(_join_segment_text)

# Snapshot of the fully prepared frame, independent of later changes to data_pd.
data_pd_orig = data_pd.copy()

## Write the train/test DataFrames to pickle files

In [7]:
if overwrite:
    # Regenerate the base 80/20 train/test split (fixed seed, so it is repeatable).
    # The files go into 'train-test-splits/' because that is where the next cell
    # reads train.pkl / test.pkl from; writing to the working directory meant a
    # freshly generated split was never picked up.
    split_dir = Path('train-test-splits')
    split_dir.mkdir(parents=True, exist_ok=True)

    train = data_pd.sample(frac=0.8, random_state=42)
    # Everything not sampled into train becomes the test set.
    test = data_pd.drop(train.index)

    train.to_pickle(split_dir / 'train.pkl')
    test.to_pickle(split_dir / 'test.pkl')

## Create additional rotations of the train/test split

In [10]:
# Reload the persisted base split from disk.
# NOTE(review): unpickling can execute arbitrary code - only load files you produced yourself.
train: pd.DataFrame = pd.read_pickle('train-test-splits/train.pkl')
test: pd.DataFrame = pd.read_pickle('train-test-splits/test.pkl')

# Full dataset = train rows followed by test rows.
# NOTE(review): `all` shadows the Python builtin; the name is kept because the next cell reads it.
all = pd.concat((train, test), axis=0)

In [11]:
# Create further train/test rotations. Each pass draws a new test fold from the rows
# that have not served as test data yet, uses every other row as the matching train
# set, and writes both to 'train-test-splits/{train,test}_<k>.pkl'.
FOLD_SIZE = 200          # rows drawn for each new test fold
HELD_OUT_TARGET = 1000   # stop once this many rows have been used as test data

test_split_so_far = test.copy(deep=True)
# The base split counts as the first fold(s); new files continue from the next index.
# This counter yields the same numbers as the former int(len(test_split_so_far) / 200).
fold_number = len(test_split_so_far) // FOLD_SIZE
while len(test_split_so_far) < HELD_OUT_TARGET:
    unused = all[~all['video_id'].isin(test_split_so_far['video_id'])]
    if unused.empty:
        # Nothing left to rotate through; previously this ended in a ValueError from sample().
        break
    # Cap the draw at what is left so a short final fold is written instead of crashing.
    new_test = unused.sample(n=min(FOLD_SIZE, len(unused)), random_state=42)
    test_split_so_far = pd.concat([test_split_so_far, new_test])
    new_train = all[~all['video_id'].isin(new_test['video_id'])]
    fold_number += 1
    new_train.to_pickle('train-test-splits/train_{}.pkl'.format(fold_number))
    new_test.to_pickle('train-test-splits/test_{}.pkl'.format(fold_number))
