# Bootstrapping

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Exclude the arbitrarily written elderly-group survey responses (per the
# original author's note) by keeping only the leading rows of the file.
# NOTE(review): this slice keeps up to 401 rows (positions 0-400) — confirm
# that the cutoff matches the intended number of genuine responses.

data = pd.read_csv("data/final_data.csv").iloc[:401]
data

Unnamed: 0,시간대,성별,연령대,지역,기분,1순위,2순위,3순위,기온,강수량,...,초밥,우동,메밀소바,돈가스,볶음밥,짜장면,짬뽕,날짜,시간,요일
0,2021-05-16 14:00:00,남자,청년,수도권,행복,한식,일식,중식,17.5,1.6,...,2,0,0,2,0,0,1,05-16,14,일
1,2021-05-16 15:00:00,여자,중장년,서울,분노,한식,중식,일식,18.5,1.4,...,0,0,1,0,2,0,0,05-16,15,일
2,2021-05-16 15:00:00,남자,청년,서울,중립,일식,양식,한식,18.5,1.4,...,0,0,3,3,0,0,0,05-16,15,일
3,2021-05-16 15:00:00,여자,청년,서울,행복,일식,한식,분식,18.5,1.4,...,3,0,0,0,0,0,0,05-16,15,일
4,2021-05-16 16:00:00,남자,청년,경상도,행복,양식,일식,분식,20.6,2.6,...,2,0,0,2,0,0,0,05-16,16,일
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,2021-05-24 16:00:00,여자,노년,서울,분노,한식,일식,양식,24.1,0.0,...,2,0,0,0,0,0,0,05-24,16,월
397,2021-05-24 16:00:00,여자,노년,서울,행복,한식,일식,중식,24.1,0.0,...,2,0,0,0,1,0,0,05-24,16,월
398,2021-05-24 16:00:00,남자,노년,서울,우울,일식,한식,분식,24.1,0.0,...,0,0,0,3,0,0,0,05-24,16,월
399,2021-05-24 16:00:00,여자,노년,서울,행복,한식,일식,양식,24.1,0.0,...,0,2,0,0,0,0,0,05-24,16,월


# 함수
### - Bootstrap Sampling 함수 정의

In [9]:
def bootstrapping(data, size=None, seed=123):
    """Resample rows of ``data`` with replacement (bootstrap sampling).

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows to draw from.
    size : int, optional
        Number of rows to draw. When ``None`` the sample has as many rows as
        ``data`` itself.
    seed : int, optional
        Seed for the private random generator. The default (123) yields the
        same row picks as seeding the global NumPy generator with 123.

    Returns
    -------
    pandas.DataFrame
        The resampled rows, with the same column names and dtypes as
        ``data`` and a fresh ``0..size-1`` index.
    """
    n_rows = len(data)

    # A scalar draw (size=None) would select a single row rather than a
    # table, so fall back to a sample as large as the input.
    if size is None:
        size = n_rows

    # Use a private generator so the caller's global NumPy random state is
    # left untouched while the draws stay reproducible.
    rng = np.random.RandomState(seed)
    picked = rng.choice(n_rows, size=size)

    # Positional indexing on the DataFrame keeps each column's dtype and
    # name; going through ``.values`` would turn every column into object.
    return data.iloc[picked].reset_index(drop=True)

In [10]:
# Draw 30,000 bootstrap rows from the survey data and preview the first five.
bs = bootstrapping(data=data, size=30_000)
bs.head(n=5)

Unnamed: 0,시간대,성별,연령대,지역,기분,1순위,2순위,3순위,기온,강수량,...,초밥,우동,메밀소바,돈가스,볶음밥,짜장면,짬뽕,날짜,시간,요일
0,2021-05-20 00:00:00,여자,청년,서울,행복,양식,한식,중식,18.7,0.0,...,0,0,0,0,1,0,0,05-20,0,목
1,2021-05-24 15:00:00,여자,노년,전라도,우울,분식,한식,일식,22.6,0.0,...,0,1,0,0,0,0,0,05-24,15,월
2,2021-05-18 23:00:00,여자,청소년,수도권,행복,한식,일식,분식,14.0,0.0,...,2,2,2,0,0,0,0,05-18,23,화
3,2021-05-16 18:00:00,여자,청년,서울,중립,양식,일식,분식,17.0,3.5,...,2,0,0,2,0,0,0,05-16,18,일
4,2021-05-18 13:00:00,남자,중장년,서울,행복,양식,중식,한식,22.2,0.0,...,0,0,0,0,2,0,0,05-18,13,화


### 기존 설문 데이터와 증식 데이터를 Concat

In [11]:
# Stack the original survey rows above the bootstrap rows and renumber the
# combined index from zero.
final_data = pd.concat(objs=[data, bs], axis=0, ignore_index=True)
final_data

Unnamed: 0,시간대,성별,연령대,지역,기분,1순위,2순위,3순위,기온,강수량,...,초밥,우동,메밀소바,돈가스,볶음밥,짜장면,짬뽕,날짜,시간,요일
0,2021-05-16 14:00:00,남자,청년,수도권,행복,한식,일식,중식,17.5,1.6,...,2,0,0,2,0,0,1,05-16,14,일
1,2021-05-16 15:00:00,여자,중장년,서울,분노,한식,중식,일식,18.5,1.4,...,0,0,1,0,2,0,0,05-16,15,일
2,2021-05-16 15:00:00,남자,청년,서울,중립,일식,양식,한식,18.5,1.4,...,0,0,3,3,0,0,0,05-16,15,일
3,2021-05-16 15:00:00,여자,청년,서울,행복,일식,한식,분식,18.5,1.4,...,3,0,0,0,0,0,0,05-16,15,일
4,2021-05-16 16:00:00,남자,청년,경상도,행복,양식,일식,분식,20.6,2.6,...,2,0,0,2,0,0,0,05-16,16,일
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30396,2021-05-18 17:00:00,남자,청년,수도권,우울,일식,양식,중식,21.9,0,...,3,0,0,0,0,1,0,05-18,17,화
30397,2021-05-16 17:00:00,여자,중장년,경상도,중립,중식,한식,분식,20.1,2.2,...,0,0,0,0,0,0,3,05-16,17,일
30398,2021-05-16 18:00:00,남자,청년,경상도,중립,일식,한식,분식,19.7,2.7,...,3,0,3,0,0,0,0,05-16,18,일
30399,2021-05-18 20:00:00,여자,중장년,경상도,우울,한식,분식,일식,17.4,0,...,0,1,0,0,0,0,0,05-18,20,화


### 증식한 데이터를 저장

In [12]:
# Persist the combined (original + bootstrap) table without the row index.
final_data.to_csv(path_or_buf="data/final_concat_data.csv", index=False)