# Bootstrapping

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Exclude the arbitrarily written elderly-respondent surveys by keeping
# only the first 401 rows of the file.
survey_all = pd.read_csv("data/final_data.csv", index_col=0)
data = survey_all.iloc[:401]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 401 entries, 0 to 400
Data columns (total 34 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   시간대     401 non-null    object 
 1   성별      401 non-null    object 
 2   연령대     401 non-null    object 
 3   지역      401 non-null    object 
 4   기분      401 non-null    object 
 5   1순위     401 non-null    object 
 6   2순위     401 non-null    object 
 7   3순위     401 non-null    object 
 8   기온      401 non-null    float64
 9   강수량     401 non-null    float64
 10  습도      401 non-null    float64
 11  기압      401 non-null    float64
 12  전운량     401 non-null    int64  
 13  불고기     341 non-null    float64
 14  냉면      341 non-null    float64
 15  칼국수     341 non-null    float64
 16  된장찌개    341 non-null    float64
 17  비빔밥     341 non-null    float64
 18  김치찌개    341 non-null    float64
 19  떡볶이     138 non-null    float64
 20  김밥      138 non-null    float64
 21  파스타     241 non-null    float64
 22  스테

# 함수
### - Bootstrap Sampling 함수 정의

In [3]:
def bootstrapping(data, size=None, seed=123):
    '''
    Augment a DataFrame by bootstrap sampling (drawing rows with replacement).

    * Parameter
    data : pandas.DataFrame whose rows are resampled
    size : number of rows to draw; None (default) draws len(data) rows
    seed : seed of the private random generator; the default 123 reproduces
           the draw this function has always produced

    * Output
    return : DataFrame holding the sampled rows, with the same columns and
             dtypes as data and a fresh 0..size-1 index

    * Raises
    ValueError : if data has no rows while size is greater than 0
    '''

    n_rows = len(data)

    # A classic bootstrap resample has as many rows as the input.
    if size is None:
        size = n_rows

    if n_rows == 0:
        if size > 0:
            raise ValueError("cannot draw bootstrap samples from an empty DataFrame")
        # Nothing to sample: hand back an empty frame with the same columns.
        return data.iloc[:0].reset_index(drop=True)

    # Random Sampling
    # A private RandomState yields the same sequence as np.random.seed(seed)
    # followed by np.random.choice, but leaves the global RNG state untouched.
    rng = np.random.RandomState(seed)
    row_positions = rng.choice(n_rows, size=size)

    # Positional row selection keeps the column names and per-column dtypes,
    # so no hard-coded column list or array round trip is needed.
    result = data.iloc[row_positions].reset_index(drop=True)
    return result

In [4]:
# Draw 20000 bootstrap rows from the survey data and preview the first five.
bs = bootstrapping(data=data, size=20000)
bs.head(5)

Unnamed: 0,시간대,성별,연령대,지역,기분,1순위,2순위,3순위,기온,강수량,...,초밥,우동,메밀소바,돈가스,볶음밥,짜장면,짬뽕,날짜,시간,요일
0,2021-05-19 14:00:00,남자,청년,서울,분노,중식,한식,분식,25.7,0.0,...,,,,,0.0,0.0,1.0,05-19,14,수
1,2021-05-19 20:00:00,여자,청년,서울,행복,일식,분식,한식,22.1,0.0,...,1.0,0.0,0.0,0.0,,,,05-19,20,수
2,2021-05-18 18:00:00,여자,청소년,수도권,우울,일식,한식,일식,21.0,0.0,...,1.0,0.0,1.0,0.0,,,,05-18,18,화
3,2021-05-16 18:00:00,남자,청소년,경상도,중립,분식,일식,한식,19.7,2.7,...,0.0,0.0,0.0,1.0,,,,05-16,18,일
4,2021-05-18 12:00:00,남자,중장년,수도권,행복,한식,일식,양식,19.4,0.0,...,1.0,0.0,0.0,0.0,,,,05-18,12,화


### 기존 설문 데이터와 증식 데이터를 Concat

In [5]:
# Stack the original survey rows on top of the bootstrap rows and renumber
# the index from 0 so the combined frame has one continuous index.
final_data = pd.concat(objs=[data, bs], axis=0, ignore_index=True)
final_data

Unnamed: 0,시간대,성별,연령대,지역,기분,1순위,2순위,3순위,기온,강수량,...,초밥,우동,메밀소바,돈가스,볶음밥,짜장면,짬뽕,날짜,시간,요일
0,2021-05-16 14:00:00,남자,청년,수도권,행복,한식,일식,중식,17.5,1.6,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,05-16,14,일
1,2021-05-16 15:00:00,여자,중장년,서울,분노,한식,중식,일식,18.5,1.4,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,05-16,15,일
2,2021-05-16 15:00:00,남자,청년,서울,중립,일식,양식,한식,18.5,1.4,...,0.0,0.0,1.0,1.0,,,,05-16,15,일
3,2021-05-16 15:00:00,여자,청년,서울,행복,일식,한식,분식,18.5,1.4,...,1.0,0.0,0.0,0.0,,,,05-16,15,일
4,2021-05-16 16:00:00,남자,청년,경상도,행복,양식,일식,분식,20.6,2.6,...,1.0,0.0,0.0,1.0,,,,05-16,16,일
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20396,2021-05-19 10:00:00,남자,중장년,수도권,우울,한식,일식,양식,19.1,0.0,...,1.0,0.0,1.0,0.0,,,,05-19,10,수
20397,2021-05-18 12:00:00,여자,청년,서울,행복,한식,중식,분식,20.5,0.0,...,,,,,0.0,0.0,1.0,05-18,12,화
20398,2021-05-16 16:00:00,남자,중장년,서울,행복,한식,분식,양식,17.7,4.8,...,,,,,,,,05-16,16,일
20399,2021-05-16 21:00:00,남자,청년,수도권,중립,한식,양식,일식,14.3,1.3,...,0.0,1.0,1.0,0.0,,,,05-16,21,일


### 증식한 데이터를 저장

In [6]:
# Write the combined frame to disk; the row index is written as the first column.
final_data.to_csv(path_or_buf="data/final_concat_data.csv", index=True)