In [6]:
import os
import re
import random
import numpy as np
import pandas as pd

In [7]:
# Directory holding amazon_{train,valid,test}.csv.
# The default is the original author's local Google Drive mount; set the
# AMAZON_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
PATH = os.environ.get("AMAZON_DATA_DIR", r"G:\My Drive\Colab Notebooks\data\amazon")

In [50]:
### load data
# Read the train/valid/test splits and keep only the two columns used below.
# `usecols` avoids parsing unused columns; the trailing selection fixes the
# column order regardless of the order in the CSV files.
columns = ['text', 'label_text']
df_col = [
    pd.read_csv(os.path.join(PATH, f'amazon_{post_fix}.csv'), usecols=columns)[columns]
    for post_fix in ['train', 'valid', 'test']
]
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every split contributes its own 0..m-1 labels and the index contains duplicates.
df_data = pd.concat(df_col, ignore_index=True)
# remove the underscore _ in labels (maybe no need because bert tokenizer is smart)
df_data['label_text'] = df_data['label_text'].str.replace('_', ' ', regex=False)
# add label_id column: factorize codes start at 0, so + 1 makes the ids 1-based
df_data['label_id'] = pd.factorize(df_data['label_text'])[0] + 1
print(f'There are {len(df_data)} samples in total, and {len(df_data.label_id.unique())} unique labels.')
# house-keeping [drop the per-split frames, they are no longer needed]
del df_col

There are 16521 samples in total, and 60 unique labels.


In [38]:
### some statistics
# One row per label_id with its number of samples in the `counts` column.
samples_per_label = df_data[['label_id']].value_counts()
df_stats = samples_per_label.reset_index(name='counts')
mean_count = df_stats['counts'].mean()
print(f'Number of sample per class: {mean_count}.')
# Last expression of the cell: summary table of label_id and counts.
df_stats.describe()

Number of sample per class: 275.35.


Unnamed: 0,label_id,counts
count,60.0,60.0
mean,30.5,275.35
std,17.464249,253.817352
min,1.0,6.0
25%,15.75,114.75
50%,30.5,183.0
75%,45.25,315.5
max,60.0,1150.0


In [51]:
### remove classes that have fewer than `threshold` samples
threshold = 100
# Count samples per class from the *current* frame rather than from a table
# built in another cell: label_id is re-factorized below, so ids computed
# earlier go stale and re-running this cell would drop the wrong classes.
class_counts = df_data['label_id'].value_counts()
label_ids = set(class_counts[class_counts < threshold].index)
# notice, we are overriding the original data on purpose:
# keeping it under a second name would take extra memory.
# .copy() makes the filtered frame independent, so the column assignment
# below does not raise SettingWithCopyWarning.
df_data = df_data[~df_data['label_id'].isin(label_ids)].copy()
# re-number the surviving classes so that ids are contiguous again (1..num_labels)
df_data['label_id'] = pd.factorize(df_data['label_text'])[0] + 1
num_labels = len(df_data['label_id'].unique())
print(f'There are {len(df_data)} samples in total, and {num_labels} unique labels.')

There are 13039 samples in total, and 49 unique labels.


In [77]:
### select 30 pre-defined labels
random.seed(42)  # fixed seed so the same labels are drawn on every run
k = 30
# label_id values run from 1 to num_labels, hence the shifted range.
sampled_labels = random.sample(range(1, num_labels + 1), k)
# Vectorised membership test instead of a per-row lambda; .assign returns a
# new frame, so this never writes into a slice of another DataFrame.
df_data = df_data.assign(pre_defined=df_data['label_id'].isin(sampled_labels))

In [81]:
### export
# to_csv does not create missing parent directories, so make sure the
# relative output folder exists before writing.
os.makedirs('data', exist_ok=True)
df_data.to_csv('data/sample_data.csv', index=False)