In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Vent data set (the frame carries a `split` column that the cells
# below rely on).
vent_path = '../preprocessed/vent-robust-splits.parquet'
vent = pd.read_parquet(vent_path)

# Scaling Analysis

We study how the amount of data affects performance in Emotion Detection tasks. We take the Vent data set and train classifiers with the same number of hyper-parameters on samples of the data of different sizes.

In [3]:
# Seed for determinism, even if one-off
SEED = 7
np.random.seed(SEED)

# One uniform draw per row, shared by every subset size below, so each smaller
# subset is contained in the larger ones. The draws are kept in a standalone
# array instead of a `vent` column: `vent` is reused by later cells, and a
# helper column added here would leak into every data set derived from it.
random_probability = np.random.random(len(vent))
is_test_row = vent['split'] == 'test'

for subset_size in [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]:
    dataset_name = f'{round(subset_size * 100)}-pct'
    # Test rows are always kept in full; every other row is kept only when its
    # draw falls at or below the requested fraction.
    vent_subset = vent[is_test_row | (random_probability <= subset_size)]
    vent_subset.to_parquet(f'../preprocessed/vent-robust-splits-{dataset_name}.parquet')

# Backwards Robust Vent

We want to test whether communities converge over time. That is, we train with temporally-ordered data and evaluate if we capture patterns that repeat in the future as new community members adapt to community trends. We test the hypothesis by training 'backwards in time': the test and validation splits are taken from the beginning of the data set, while the training data set comprises its tail. 

In [4]:
import sys
sys.path.append('../src')  # make the project's `utils` package importable

from utils.split import sorted_splits

# Split `vent` on `created_at` into 10% / 10% / 80% parts and label them, in
# that order, as test, valid and train, so evaluation data precedes training
# data in time.
split_fractions = [0.1, 0.1, 0.8]
split_names = ['test', 'valid', 'train']

splits = sorted_splits(vent, 'created_at', split_fractions)
for split_name, split_df in zip(split_names, splits):
    # Overwrites the `split` label on each part in place.
    split_df['split'] = split_name

In [5]:
# Show the `created_at` range covered by each of the three splits
# (test, valid, train) as a sanity check on their chronological order.
span_lines = []
for index in range(3):
    created = splits[index].created_at
    span_lines.append(f'{created.min()!s} - {created.max()!s}')
print(' \n '.join(span_lines))

2016-07-01 00:00:05.685000 - 2016-08-16 15:08:37.866000 
 2016-08-16 15:08:38.151000 - 2016-10-08 12:16:25.149000 
 2016-10-08 12:16:34.336000 - 2018-12-14 08:12:36.867000


In [6]:
# Stack the labelled splits back into a single frame with a fresh 0..n-1
# index and persist it next to the other preprocessed data sets.
backwards_path = '../preprocessed/vent-robust-splits-backwards.parquet'
backwards_robust = pd.concat(objs=splits, ignore_index=True)
backwards_robust.to_parquet(backwards_path)