# GoEmotions Instance Count Analysis
Count the average number of instances per class in GoEmotions in order to build a comparable dataset from Vent.

In [1]:
import pandas as pd

# Read both preprocessed corpora in one pass; the tuple order matches the
# unpacking order (Vent first, GoEmotions second).
vent, goemotions = map(
    pd.read_parquet,
    (
        '../preprocessed/vent-robust.parquet',
        '../preprocessed/GoEmotions.parquet',
    ),
)

In [2]:
# Vent: number of rows per emotion label, averaged over labels, plus the
# number of distinct labels (missing values counted as their own value,
# exactly as len(unique()) would).
vent_label_sizes = vent.groupby('emotions_label').size()
vents_per_emotion = vent_label_sizes.mean()
vent_emotions = vent['emotions_label'].nunique(dropna=False)

# GoEmotions: explode `emotions` so every element of a list-like entry gets
# its own row (presumably one row per comment/emotion pair -- confirm against
# the preprocessing step), then compute the same two statistics.
exploded = goemotions['emotions'].explode().to_frame()
goem_label_sizes = exploded.groupby('emotions').size()
comments_per_emotion = goem_label_sizes.mean()
goem_emotions = exploded['emotions'].nunique(dropna=False)

In [3]:
# Number of vents needed so that Vent has, on average, as many instances per
# emotion as GoEmotions has comments per emotion.
equivalent_vents = vent_emotions * comments_per_emotion
# The same quantity expressed as a fraction of all rows in `vent`.
vent_sample_size = equivalent_vents / vent.shape[0]

print(
    f'There are {goem_emotions} emotions in GoEmotions with {comments_per_emotion} comments on average.',
    f'The equivalent dataset needs {equivalent_vents} vents, or {100 * vent_sample_size:.2f}% of Vent.',
    sep='\n',
)

There are 28 emotions in GoEmotions with 2279.0 comments on average.
The equivalent dataset needs 200552.0 vents, or 2.06% of Vent.


In [4]:
import sys
sys.path.append('../src')

from utils.split import sorted_splits

# Fixed seed: without it `DataFrame.sample` draws a different subset on every
# run, so the splits (and any file written from them) would not be
# reproducible under Restart & Run All.
EQUIVALENT_SAMPLE_SEED = 42

# Down-sample Vent to the fraction computed from the GoEmotions statistics.
vent_equivalent_sample = vent.sample(
    frac=vent_sample_size,
    random_state=EQUIVALENT_SAMPLE_SEED,
)

# Split the sample on `created_at`, passing 0.8 / 0.1 / 0.1 as the split
# fractions (see utils.split.sorted_splits for the exact semantics), then tag
# every returned frame with the name of its split.
splits = sorted_splits(vent_equivalent_sample, 'created_at', [0.8, 0.1, 0.1])
for df, split_name in zip(splits, ['train', 'valid', 'test']):
    df['split'] = split_name

In [5]:
# Show the earliest and latest `created_at` of the first three splits
# (labelled train / valid / test when they were created). The fields are
# collected into one argument list so the printed text is a single print call
# with space-separated fields and '\n' between the splits.
range_fields = []
for position in range(3):
    current = splits[position]
    if range_fields:
        range_fields.append('\n')
    range_fields.extend([current.created_at.min(), '-', current.created_at.max()])
print(*range_fields)

2016-07-01 00:01:40.760000 - 2018-03-16 04:52:35.350000 
 2018-03-16 05:08:45.481000 - 2018-07-22 17:23:24.146000 
 2018-07-22 17:23:29.857000 - 2018-12-14 05:24:08.854000


In [6]:
# Stack the tagged splits into one frame with a fresh 0..n-1 index and
# persist it as the GoEmotions-equivalent Vent sample.
full_robust_sample = pd.concat(objs=splits, ignore_index=True)
full_robust_sample.to_parquet(
    '../preprocessed/vent-robust-equivalent-sample.parquet'
)

In [9]:
# Fixed seed: `sample(frac=1.0)` shuffles every row of `vent`; seeding it makes
# the row order handed to `sorted_splits` (and therefore the written file)
# identical on every run.
FULL_SHUFFLE_SEED = 42

vent_shuffled = vent.sample(frac=1.0, random_state=FULL_SHUFFLE_SEED)

# Same procedure as for the down-sampled data, applied to all of Vent: split
# on `created_at` with 0.8 / 0.1 / 0.1 as the split fractions and tag each
# returned frame with its split name.
full_splits = sorted_splits(vent_shuffled, 'created_at', [0.8, 0.1, 0.1])
for df, split_name in zip(full_splits, ['train', 'valid', 'test']):
    df['split'] = split_name

# NOTE: this rebinds `full_robust_sample`; from here on the name refers to the
# concatenated splits of the complete dataset, not to a down-sampled subset.
full_robust_sample = pd.concat(full_splits, ignore_index=True)
full_robust_sample.to_parquet('../preprocessed/vent-robust-splits.parquet')