# sampling_data.ipynb
---
현재 학습데이터가 너무 많기에 학습데이터를 sampling하는 전처리 추가  

In [1]:
import os
import sys
import json
import pandas as pd

from tqdm import tqdm
from glob import glob
from collections import defaultdict

## 현재 학습데이터 로드

In [15]:
train_df = pd.read_csv('../data/train.csv')

In [16]:
train_df

Unnamed: 0,image_path,bbox,labels
0,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,401441431435,화남/불쾌
1,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,358447472443,화남/불쾌
2,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,358445472445,화남/불쾌
3,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,428183704749,화남/불쾌
4,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,51222621880,화남/불쾌
...,...,...,...
2128046,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,4124310961011,행복/즐거움
2128047,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,5241021094949,행복/즐거움
2128048,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,541410974550,행복/즐거움
2128049,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,517342976532,행복/즐거움


총 데이터 212만장

In [5]:
label_map = train_df['labels'].unique()
label_map = {idx:label for idx, label in enumerate(label_map)}
label_map

{0: '화남/불쾌', 1: '공포', 2: '행복/즐거움', 3: '편안/안정', 4: '공격성', 5: '불안/슬픔'}

In [6]:
inverted_label_map = train_df['labels'].unique()
inverted_label_map = {label:idx for idx, label in enumerate(inverted_label_map)}
inverted_label_map

{'화남/불쾌': 0, '공포': 1, '행복/즐거움': 2, '편안/안정': 3, '공격성': 4, '불안/슬픔': 5}

In [7]:
data_dict = defaultdict(list)
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    label = row['labels']
    int_label = inverted_label_map[label]
    data_dict[int_label].append(dict(row))

100%|██████████| 2128051/2128051 [01:22<00:00, 25697.39it/s]


In [8]:
data_dict.keys()

dict_keys([0, 1, 2, 3, 4, 5])

In [9]:
for label, data_list in data_dict.items():
    print(f"{label} : {len(data_list)}")

0 : 74579
1 : 4795
2 : 515278
3 : 1364848
4 : 115619
5 : 52932


데이터의 imbalance가 심하다.  
class_weight가 있어서 활용 가능할 듯  
[링크](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data?hl=ko)

랜덤하게 샘플링하기위해서 사이킷런 활용

In [17]:
train_df['labels'] = train_df['labels'].apply(lambda x: inverted_label_map[x])

In [18]:
train_df.head()

Unnamed: 0,image_path,bbox,labels
0,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,401441431435,0
1,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,358447472443,0
2,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,358445472445,0
3,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,428183704749,0
4,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,51222621880,0


In [19]:
from sklearn.model_selection import train_test_split

In [20]:
x_train_1, x_train_2, y_train_1, y_train_2 = train_test_split(train_df[train_df.columns.difference(['labels'])], train_df['labels'], test_size=0.5, shuffle=True, stratify=train_df['labels'], random_state=94)

In [21]:
y_train_1

834165     3
438897     3
1965141    2
1449433    3
357986     4
          ..
2096884    3
843757     3
2066519    3
561354     2
1507047    3
Name: labels, Length: 1064025, dtype: int64

In [22]:
x_train_1['labels'] = y_train_1
x_train_1

Unnamed: 0,bbox,image_path,labels
834165,343338711015,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
438897,570168198402,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
1965141,292484740998,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,2
1449433,730430653496,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
357986,343103470311,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,4
...,...,...,...
2096884,11240184237,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,3
843757,2801131392769,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
2066519,341144408513,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,3
561354,563571192999,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,2


In [23]:
x_train_1.to_csv('../data/sampled_train_1.csv', index=False)

In [24]:
x_train_2['labels'] = y_train_2
x_train_2

Unnamed: 0,bbox,image_path,labels
1253152,298646701352,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,0
871986,85255821073,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
783716,76586221047,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
1536826,541137334286,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,2
1767514,2783291171551,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,3
...,...,...,...
447688,326196862620,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
2019609,32191153855,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,2
496507,411791019974,/media/jaeho/SSD/datasets/pet_data/train/cat/i...,3
1873272,4958772602,/media/jaeho/SSD/datasets/pet_data/train/dog/i...,0


In [25]:
x_train_2.to_csv('../data/sampled_train_2.csv', index=False)

In [26]:
import numpy as np

In [27]:
from utils import generate_class_weights

ModuleNotFoundError: No module named 'utils'

In [None]:
y_train_1.values

In [None]:
generate_class_weights(y_train_1.values, multi_class=True, one_hot_encoded=False)