# GoEmotions Instance Count Analysis
Count the average number of instances per class in GoEmotions, in order to build a comparably sized sample of the Vent dataset.

In [3]:
import pandas as pd

# Read the two preprocessed corpora from parquet: Vent first, then GoEmotions.
vent, goemotions = map(pd.read_parquet, (
    '../preprocessed/vent-robust.parquet',
    '../preprocessed/GoEmotions.parquet',
))

In [25]:
# Vent: mean number of rows per emotion label, and the number of distinct labels.
# nunique() skips missing labels, which matches groupby() (it drops NaN keys by
# default); len(unique()) would count NaN as one extra emotion and make the
# class count disagree with the per-class mean.
vents_per_emotion = vent.groupby('emotions_label').size().mean()
vent_emotions = vent.emotions_label.nunique()

# GoEmotions: explode `emotions` (presumably a list of labels per comment —
# confirm against the preprocessing step) so that each (comment, emotion) pair
# becomes its own row. explode() turns an empty list into NaN, so the same
# nunique()/groupby() pairing keeps unlabeled comments out of both figures.
exploded = goemotions.emotions.explode().to_frame()
comments_per_emotion = exploded.groupby('emotions').size().mean()
goem_emotions = exploded.emotions.nunique()

In [31]:
# Number of vents needed so that Vent has, on average, as many instances per
# emotion as GoEmotions does, and the share of the full Vent corpus that is.
equivalent_vents = vent_emotions * comments_per_emotion
vent_sample_size = equivalent_vents / len(vent)

# Report both figures, one sentence per line.
print(
    f'There are {goem_emotions} emotions in GoEmotions with {comments_per_emotion} comments on average.',
    f'The equivalent dataset needs {equivalent_vents} vents, or {100 * vent_sample_size:.2f}% of Vent.',
    sep='\n',
)

There are 28 emotions in GoEmotions with 2279.0 comments on average.
The equivalent dataset needs 200552.0 vents, or 2.06% of Vent.


In [40]:
import sys
sys.path.append('../src')

from utils.split import sorted_splits

# Fixed seed so that Restart & Run All draws the same Vent subsample (and hence
# the same split boundaries) every time; without random_state,
# DataFrame.sample() picks different rows on each run.
SAMPLE_SEED = 42

# Sample the fraction of Vent computed earlier, then split it 80/10/10 on
# `created_at`. NOTE(review): sorted_splits lives in utils.split (under ../src);
# presumably it orders rows by that column before cutting — confirm there.
vent_sample = vent.sample(frac=vent_sample_size, random_state=SAMPLE_SEED)
splits = sorted_splits(vent_sample, 'created_at', [0.8, 0.1, 0.1])

# Tag every partition with the name of the split it belongs to.
for df, split_name in zip(splits, ['train', 'valid', 'test']):
    df['split'] = split_name

In [55]:
# Show the `created_at` range covered by each split, one split per line.
# One print() call per split: a '\n' passed as its own print() argument gets
# the default sep=' ' on both sides, which leaves a trailing blank before the
# line break and a leading blank after it.
for df in splits:
    print(df.created_at.min(), '-', df.created_at.max())

2016-07-01 00:02:48.706000 - 2018-03-16 03:11:24.346000 
 2018-03-16 03:36:46.503000 - 2018-07-20 21:38:48.822000 
 2018-07-20 21:40:20.554000 - 2018-12-14 05:47:01.042000
