In [1]:
import pandas as pd
import numpy as np
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
import platform

# Select a font family able to render Korean (Hangul) labels in matplotlib,
# depending on the operating system the notebook runs on.
system_name = platform.system()
if system_name == 'Windows':
    # Windows: resolve the family name from the Malgun Gothic font file.
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
elif system_name == 'Darwin':
    # macOS: AppleGothic ships with the system.
    rc('font', family='AppleGothic')
else:
    # Any other platform (e.g. Linux): AppleGothic is a macOS font, so fall
    # back to NanumGothic. NOTE(review): assumes NanumGothic is installed on
    # the machine; adjust the family name if a different Hangul font is used.
    rc('font', family='NanumGothic')

# Hangul fonts frequently lack the Unicode minus glyph (U+2212); render
# negative tick labels with an ASCII hyphen so they do not show as boxes.
rc('axes', unicode_minus=False)

In [2]:
def load_dataset(sales_path="sales.xlsx", view_rate_path="view_rate.xlsx"):
    """Read the sales and view-rate workbooks into DataFrames.

    Both workbooks are read with ``header=1``, i.e. the second spreadsheet
    row is used as the column header. The view-rate sheet additionally uses
    its first column as the index.

    Parameters
    ----------
    sales_path : str or path-like, default "sales.xlsx"
        Location of the sales workbook.
    view_rate_path : str or path-like, default "view_rate.xlsx"
        Location of the view-rate workbook.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(sales, view_rate)`` in that order.
    """
    sales = pd.read_excel(sales_path, header=1)
    view_rate = pd.read_excel(view_rate_path, header=1, index_col=0)
    return sales, view_rate

# Load both workbooks with a single call and unpack the pair; calling
# load_dataset() once per element would parse each Excel file twice.
sales, view_rate = load_dataset()

In [3]:
# Preview the first rows of the raw sales table as loaded from the workbook.
sales.head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0
1,2019-01-01 06:00:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0
3,2019-01-01 06:20:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0


In [4]:
# Preview the first rows of the view-rate table (indexed by its first column).
view_rate.head()

Unnamed: 0_level_0,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,2019-01-06,2019-01-07,2019-01-08,2019-01-09,2019-01-10,...,2019-12-23,2019-12-24,2019-12-25,2019-12-26,2019-12-27,2019-12-28,2019-12-29,2019-12-30,2019-12-31,2019-01-01 to 2019-12-31
시간대,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
02:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003
02:01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012,0.003
02:02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004
02:03,0.0,0.0,0.014,0.0,0.0,0.0,0.0,0.0,0.0,0.005,...,0.0,0.0,0.017,0.0,0.0,0.0,0.0,0.0,0.0,0.004
02:04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005,...,0.0,0.0,0.017,0.0,0.0,0.0,0.0,0.0,0.0,0.004


# 데이터 전처리

In [5]:
## TODO
## 1. Exclude intangible ("무형") products from the estimate
## 2. Fill missing exposure minutes (carry over the preceding value)
## 3. If unit price > sales amount, set the sales amount to 0
##

def custom(price, total):
    """Return 0 when the unit price exceeds the sales amount, otherwise the amount.

    Implements preprocessing rule 3: a sales amount smaller than the price of
    a single unit is replaced by 0; any other amount is passed through as is.
    """
    return 0 if price > total else total

def pre_processing(df):
    """Clean the sales table and add calendar features.

    Steps, in order:
      1. Drop rows whose "상품군" (product group) is "무형" (intangible).
      2. Fill a missing "노출(분)" (exposure minutes) from the row directly
         above it when both rows share the same "방송일시" (broadcast time).
      3. Set "취급액" (sales amount) to 0 where "판매단가" (unit price)
         exceeds it, and print how many non-zero amounts were affected.
      4. Convert "방송일시" to datetime and add month, day, hour, minute and
         weekday columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "상품군", "노출(분)", "방송일시", "판매단가"
        and "취급액". The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and the added columns.
    """
    # 1. Exclude intangible products. The index is reset right away so the
    #    following steps work on contiguous rows; looking up label ``i - 1``
    #    on the gapped index would raise KeyError when that row was dropped.
    df = df[df["상품군"] != "무형"].reset_index(drop=True)

    # 2. Fill missing exposure minutes. Consecutive rows with an identical
    #    broadcast time form one run; forward-fill inside each run only, so a
    #    value is never carried over from a different broadcast slot.
    slot = df["방송일시"]
    run_id = slot.ne(slot.shift()).cumsum().to_numpy()
    df["노출(분)"] = df.groupby(run_id)["노출(분)"].ffill()

    # 3. Unit price > sales amount -> sales amount becomes 0.
    #    Comparisons against NaN are False, so missing amounts stay missing.
    below_unit_price = df["판매단가"] > df["취급액"]
    # Report the number of non-zero amounts about to be zeroed, measured on
    # the frame being processed rather than on a global variable.
    print(int((below_unit_price & (df["취급액"] != 0)).sum()))
    df.loc[below_unit_price, "취급액"] = 0

    # 4. Add month, day, hour, minute and weekday columns.
    df["방송일시"] = pd.to_datetime(df["방송일시"])
    stamp = df["방송일시"].dt
    df["month"] = stamp.month
    df["day"] = stamp.day
    df["hour"] = stamp.hour
    df["minute"] = stamp.minute
    df["weekday"] = stamp.weekday

    return df

# Replace the raw sales frame with its preprocessed version.
# Note: this rebinds `sales`, so re-running the cell feeds the already
# preprocessed frame back into pre_processing.
sales = pre_processing(sales)
# Preview the first rows of the preprocessed table.
sales.head()

1990


Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,month,day,hour,minute,weekday
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,1,1,6,0,1
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,1,1,6,0,1
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0,1,1,6,20,1
3,2019-01-01 06:20:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0,1,1,6,20,1
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0,1,1,6,40,1


In [7]:
# Build a cumulative broadcast-exposure feature ("방송노출시간") per product name.
# Every row starts with its own exposure minutes.
sales["방송노출시간"] = sales["노출(분)"].values

# Product names ordered by frequency. Not needed by the loop below; kept
# defined in case later cells reference it.
name_list = sales["상품명"].value_counts().index

# Row labels of each product, collected in a single pass up front instead of
# re-filtering the whole frame once per product name.
labels_by_product = sales.groupby("상품명", sort=False).groups

for labels in labels_by_product.values():
    # Walk over consecutive rows of the same product.
    for index, next_ in zip(labels[:-1], labels[1:]):
        try:
            same_hour = (
                sales.loc[index, "month"] == sales.loc[next_, "month"]
                and sales.loc[index, "day"] == sales.loc[next_, "day"]
                and sales.loc[index, "hour"] == sales.loc[next_, "hour"]
            )
            # Same month/day/hour but a different minute: the later row
            # continues the earlier one, so carry the running total forward.
            if same_hour and sales.loc[index, "minute"] != sales.loc[next_, "minute"]:
                sales.loc[next_, "방송노출시간"] = (
                    sales.loc[index, "방송노출시간"] + sales.loc[next_, "노출(분)"]
                )
        except (KeyError, TypeError, ValueError):
            # Best effort: skip a pair that cannot be compared or summed,
            # without swallowing KeyboardInterrupt/SystemExit as a bare
            # `except:` would.
            continue

sales.head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,month,day,hour,minute,weekday,방송노출시간
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,1,1,6,0,1,20.0
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,1,1,6,0,1,20.0
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0,1,1,6,20,1,40.0
3,2019-01-01 06:20:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0,1,1,6,20,1,40.0
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0,1,1,6,40,1,60.0


## 공휴일 feature 추가 


In [8]:
from datetime import date  # no longer used in this cell; kept in case later cells rely on it
# Public holidays ("red days") as ISO date strings.
hol = ['2019-01-01'
,'2019-02-04'
,'2019-02-05'
,'2019-02-06'
,'2019-03-01'
,'2019-05-05'
,'2019-05-06'
,'2019-05-12'
,'2019-06-06'
,'2019-08-15'
,'2019-09-12'
,'2019-09-13'
,'2019-09-14'
,'2019-10-03'
,'2019-10-09'
,'2019-12-25',
'2020-01-01']
sales["방송일시"] = pd.to_datetime(sales["방송일시"])
# Calendar date of each broadcast as a "YYYY-MM-DD" string. The vectorised
# formatter replaces a row-wise apply and leaves missing timestamps as NaN
# instead of raising.
sales["방송날짜"] = sales["방송일시"].dt.strftime("%Y-%m-%d")
# 1 if the broadcast date is a public holiday, otherwise 0.
sales['holiday'] = sales['방송날짜'].isin(hol).astype(int)
sales.head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,month,day,hour,minute,weekday,방송노출시간,방송날짜,holiday
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,1,1,6,0,1,20.0,2019-01-01,1
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,1,1,6,0,1,20.0,2019-01-01,1
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0,1,1,6,20,1,40.0,2019-01-01,1
3,2019-01-01 06:20:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0,1,1,6,20,1,40.0,2019-01-01,1
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0,1,1,6,40,1,60.0,2019-01-01,1


In [9]:
# Units-sold feature ("판매량"): sales amount divided by unit price.
sales["판매량"] = sales["취급액"].div(sales["판매단가"])

In [12]:
# Preview the first rows of the sales table with the engineered feature columns.
sales.head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,month,day,hour,minute,weekday,방송노출시간,방송날짜,holiday,판매량
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,1,1,6,0,1,20.0,2019-01-01,1,52.606516
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,1,1,6,0,1,20.0,2019-01-01,1,109.548872
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0,1,1,6,20,1,40.0,2019-01-01,1,81.754386
3,2019-01-01 06:20:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0,1,1,6,20,1,40.0,2019-01-01,1,174.310777
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0,1,1,6,40,1,60.0,2019-01-01,1,167.218045
