In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Load the cleaned, combined dataset from CSV. Later cells rely on it having
# 'category', 'text', 'label', 'stem_text' and 'lemm_text' columns.
df = pd.read_csv('../datasets/cleaned/combined_stemmed_text_with_category_label.csv')

In [3]:
# Product category names, listed alphabetically (21 entries).
# NOTE(review): no later cell shown here references this list; presumably entry i
# names integer label i (labels run 0-20) -- confirm against the labelling step.
category_list = [
    'appliances',
    'arts_crafts_and_sewing',
    'automotive',
    'baby',
    'beauty',
    'cell_phones_and_accessories',
    'clothing_shoes_and_jewelry',
    'electronics',
    'grocery_and_gourmet_food',
    'health_and_personal_care',
    'home_and_kitchen',
    'industrial_and_scientific',
    'musical_instruments',
    'office_products',
    'patio_lawn_and_garden',
    'pet_supplies',
    'software',
    'sports_and_outdoors',
    'tools_and_home_improvement',
    'toys_and_games',
    'video_games',
]

In [4]:
# Drop the raw 'category' and 'text' columns (the label and the pre-processed
# text columns remain), then discard rows with any missing value.
# Rebinding `df` instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run: a second run no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['category', 'text'], errors='ignore').dropna()

In [38]:
# Raw row count per label; sort=False means the counts are not ordered by frequency.
df['label'].value_counts(sort=False, normalize=False)

0       8868
1      20750
2      87773
3      25504
4      36198
5      78430
6       8409
7     297727
8      18650
9      73203
10    170924
11     11577
12     22932
13     42467
14     58411
15     33596
16      8067
17    109055
18     98713
19     50573
20      9131
Name: label, dtype: int64

In [32]:
def balanced_sample(data, n, random_state=42):
    """Cap every label group of ``data`` at ``n`` rows.

    Groups with more than ``n`` rows are randomly downsampled (without
    replacement) to exactly ``n`` rows; groups with ``n`` rows or fewer are
    kept whole. Groups are processed in ascending label order and a progress
    line is printed for each one, followed by the total size.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'label'`` column to group by.
    n : int
        Maximum number of rows to keep per label.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for every group, so repeated
        calls return the same rows.

    Returns
    -------
    pandas.DataFrame
        The selected rows, with their original index values and column
        dtypes. An empty ``data`` yields an empty frame with the same columns.

    Raises
    ------
    KeyError
        If ``data`` has no ``'label'`` column.
    """
    pieces = []
    # Group once and iterate the groups directly: this works for any label
    # values (not only 0..k-1) and always reads from `data`, never from a
    # notebook-level global.
    for label, group in data.groupby('label', sort=True):
        if len(group) > n:
            sample = group.sample(
                n=n,
                random_state=random_state,
                replace=False,
            )
            print(f"sampled {sample.shape[0]} items from group {label}")
        else:
            # Group is already at or below the cap: keep every row.
            sample = group
            print(f"add all {sample.shape[0]} items from group {label}")
        pieces.append(sample)

    # A single concat replaces the removed DataFrame.append-in-a-loop pattern;
    # pd.concat rejects an empty list, so handle empty input explicitly.
    samples = pd.concat(pieces) if pieces else data.iloc[0:0].copy()
    print(f"total sample size: {samples.shape[0]}")
    return samples

In [37]:
# Row count of the rarest label; every other label is cut down to this size.
smallest_size = df['label'].value_counts().min()
downsampled_df = balanced_sample(data=df, n=smallest_size)

sampled 8067 items from group 0
sampled 8067 items from group 1
sampled 8067 items from group 2
sampled 8067 items from group 3
sampled 8067 items from group 4
sampled 8067 items from group 5
sampled 8067 items from group 6
sampled 8067 items from group 7
sampled 8067 items from group 8
sampled 8067 items from group 9
sampled 8067 items from group 10
sampled 8067 items from group 11
sampled 8067 items from group 12
sampled 8067 items from group 13
sampled 8067 items from group 14
sampled 8067 items from group 15
add all 8067 items from group 16
sampled 8067 items from group 17
sampled 8067 items from group 18
sampled 8067 items from group 19
sampled 8067 items from group 20
total sample size: 169407


In [39]:
# Display the class-balanced frame.
downsampled_df

Unnamed: 0,label,stem_text,lemm_text
2319,0,come strainer washer strainer rubber washer in...,come strainer washer strainer rubber washer in...
6486,0,part work frigidair washer model fccw fs hello...,part work frigidaire washer model fccw f hello...
1937,0,fit kenmor pro seri refriger question ask manu...,fit kenmore pro series refrigerator question a...
4896,0,actual genuin whirlpool part know mani thing l...,actually genuine whirlpool part know many thin...
222,0,pack filter total filter total filter two filt...,pack filter total filter total filter two filt...
...,...,...,...
1265068,20,charger support v,charger support v
1266444,20,includ usb cabl must state box mine,include usb cable must state box mine
1266680,20,come digit code make game work origin instruct...,come digit code make game work original instru...
1270704,20,im gonna get friend work xbox there like usb s...,im gonna get friend work xbox there like usb s...


In [40]:
# Cast the label column with astype(int), then list the column dtypes to confirm
# (the recorded output shows the label column as int32).
downsampled_df['label'] = downsampled_df['label'].astype(int)
downsampled_df.dtypes

label         int32
stem_text    object
lemm_text    object
dtype: object

In [44]:
# Draw a much smaller balanced subset: at most 250 rows per label, taken from
# the already-balanced frame with the function's default random_state.
small_sample_df = balanced_sample(downsampled_df, 250)

sampled 250 items from group 0
sampled 250 items from group 1
sampled 250 items from group 2
sampled 250 items from group 3
sampled 250 items from group 4
sampled 250 items from group 5
sampled 250 items from group 6
sampled 250 items from group 7
sampled 250 items from group 8
sampled 250 items from group 9
sampled 250 items from group 10
sampled 250 items from group 11
sampled 250 items from group 12
sampled 250 items from group 13
sampled 250 items from group 14
sampled 250 items from group 15
sampled 250 items from group 16
sampled 250 items from group 17
sampled 250 items from group 18
sampled 250 items from group 19
sampled 250 items from group 20
total sample size: 5250


In [45]:
# Display the small balanced subset.
small_sample_df

Unnamed: 0,label,stem_text,lemm_text
5883,0,work ceil come stainless sleev top adjust may ...,work ceiling come stainless sleeve top adjust ...
188,0,much clearenc need leav stovetop mine like sto...,much clearence need leave stovetop mine like s...
1454,0,would yoy chang filter look like get product f...,would yoy change filter look like get product ...
6929,0,come differ color facepl ye sold separ,come different color faceplate yes sold separa...
5193,0,work ghw pl work ghw lw abl cross refer take b...,work ghw pl worked ghw lw able cross reference...
...,...,...,...
1263293,20,risk charg batteri shorten lifespan charger st...,risk charging battery shortening lifespan char...
1263043,20,control need play game control come game conso...,controller need play game control come game co...
1268373,20,dream world cheesey got moment fun,dream world cheesey got moment fun
1268352,20,mani player play time one system four peopl pl...,many player play time one system four people p...


In [46]:
# List the column dtypes of the small subset.
small_sample_df.dtypes

label         int32
stem_text    object
lemm_text    object
dtype: object

In [47]:
# Share of rows per label in the full downsample; a balanced frame gives the
# same fraction for every label.
downsampled_df.label.value_counts(normalize=True)

20    0.047619
9     0.047619
1     0.047619
2     0.047619
3     0.047619
4     0.047619
5     0.047619
6     0.047619
7     0.047619
8     0.047619
10    0.047619
19    0.047619
11    0.047619
12    0.047619
13    0.047619
14    0.047619
15    0.047619
16    0.047619
17    0.047619
18    0.047619
0     0.047619
Name: label, dtype: float64

In [48]:
# Share of rows per label in the small subset; a balanced frame gives the
# same fraction for every label.
small_sample_df.label.value_counts(normalize=True)

19    0.047619
13    0.047619
4     0.047619
8     0.047619
12    0.047619
16    0.047619
20    0.047619
1     0.047619
5     0.047619
9     0.047619
17    0.047619
15    0.047619
2     0.047619
6     0.047619
10    0.047619
14    0.047619
18    0.047619
3     0.047619
7     0.047619
11    0.047619
0     0.047619
Name: label, dtype: float64

In [51]:
# Write the full balanced sample to CSV; index=False leaves the row index out of the file.
downsampled_df.to_csv('../datasets/sampled/downsample_full.csv', index=False)

In [52]:
# Write the small balanced subset to CSV; index=False leaves the row index out of the file.
small_sample_df.to_csv('../datasets/sampled/downsample_small.csv', index=False)