# sampling_data.ipynb
---
현재 학습데이터가 너무 많기에 학습데이터를 sampling하는 전처리 추가  

In [1]:
import os
import sys
import json
import pandas as pd

from tqdm import tqdm
from glob import glob
from collections import defaultdict

## 현재 학습데이터 로드

In [2]:
train_df = pd.read_csv('../data/train.csv')

In [3]:
train_df

Unnamed: 0,image_path,bbox,labels
0,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,401441431435,화남/불쾌
1,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,358447472443,화남/불쾌
2,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,358445472445,화남/불쾌
3,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,428183704749,화남/불쾌
4,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,51222621880,화남/불쾌
...,...,...,...
2127957,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,4124310961011,행복/즐거움
2127958,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,5241021094949,행복/즐거움
2127959,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,541410974550,행복/즐거움
2127960,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,517342976532,행복/즐거움


총 데이터 212만장

In [4]:
label_map = train_df['labels'].unique()
label_map = {idx:label for idx, label in enumerate(label_map)}
label_map

{0: '화남/불쾌', 1: '공포', 2: '행복/즐거움', 3: '편안/안정', 4: '공격성', 5: '불안/슬픔'}

In [5]:
inverted_label_map = train_df['labels'].unique()
inverted_label_map = {label:idx for idx, label in enumerate(inverted_label_map)}
inverted_label_map

{'화남/불쾌': 0, '공포': 1, '행복/즐거움': 2, '편안/안정': 3, '공격성': 4, '불안/슬픔': 5}

In [6]:
data_dict = defaultdict(list)
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    label = row['labels']
    int_label = inverted_label_map[label]
    data_dict[int_label].append(dict(row))

100%|██████████| 2127962/2127962 [01:05<00:00, 32408.47it/s]


In [7]:
data_dict.keys()

dict_keys([0, 1, 2, 3, 4, 5])

In [8]:
for label, data_list in data_dict.items():
    print(f"{label} : {len(data_list)}")

0 : 74578
1 : 4795
2 : 515248
3 : 1364794
4 : 115619
5 : 52928


데이터의 imbalance가 심하다.  
class_weight가 있어서 활용 가능할 듯  
[링크](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data?hl=ko)

랜덤하게 샘플링하기위해서 사이킷런 활용

In [9]:
train_df['labels'] = train_df['labels'].apply(lambda x: inverted_label_map[x])

In [10]:
train_df.head()

Unnamed: 0,image_path,bbox,labels
0,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,401441431435,0
1,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,358447472443,0
2,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,358445472445,0
3,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,428183704749,0
4,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,51222621880,0


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
x_train_1, x_train_2, y_train_1, y_train_2 = train_test_split(train_df[train_df.columns.difference(['labels'])], train_df['labels'], test_size=0.5, shuffle=True, stratify=train_df['labels'], random_state=94)

In [13]:
y_train_1

901100     2
51620      2
223245     2
559228     3
50098      2
          ..
928039     2
352530     3
1365016    2
2012718    2
579908     2
Name: labels, Length: 1063981, dtype: int64

In [14]:
x_train_1['labels'] = y_train_1
x_train_1

Unnamed: 0,bbox,image_path,labels
901100,193419001023,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,2
51620,4723141445702,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,2
223245,7393411801009,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,2
559228,26070794475,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
50098,683511749502,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,2
...,...,...,...
928039,23741289752,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,2
352530,308201848403,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
1365016,29897572731,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,2
2012718,284176338297,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,2


In [15]:
x_train_1.to_csv('../data/sampled_train_1.csv', index=False)

In [16]:
x_train_2['labels'] = y_train_2
x_train_2

Unnamed: 0,bbox,image_path,labels
1937317,3566210651328,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,3
2125714,42634112364,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,3
720986,17627811451117,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
1036588,364160481459,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
980859,408164378498,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,0
...,...,...,...
1329378,16242765508,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,2
1106158,373100416549,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
1075416,627162515671,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
844498,122189655851,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3


In [17]:
x_train_2.to_csv('../data/sampled_train_2.csv', index=False)

In [26]:
import numpy as np

In [27]:
from utils import generate_class_weights

ModuleNotFoundError: No module named 'utils'

In [None]:
y_train_1.values

In [None]:
generate_class_weights(y_train_1.values, multi_class=True, one_hot_encoded=False)