# 1. 데이터 로드

In [12]:
import pandas as pd
pd.set_option('display.float_format', '{:.2f}'.format) # 항상 float 형식으로
pd.set_option("display.max_columns",100)

import numpy as np 

# df plot
import cufflinks as cf 
import plotly.plotly as py 
import matplotlib.pyplot as plt 
import seaborn as sns 

# warnings 
import warnings ; warnings.filterwarnings("ignore")

# style
plt.style.use("ggplot")
cf.go_offline()

In [2]:
# 한글 폰트 출력 
import matplotlib
from matplotlib import font_manager, rc 
import platform 

# Resolve the family name of the font file at the Windows path below and make it
# matplotlib's default font so Korean labels can be rendered.
# NOTE(review): the path is Windows-specific; `platform` is imported above but is
# not consulted here, so this cell fails on other operating systems.
font_name=font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc("font",family=font_name)
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus sign
# (presumably because the Korean font lacks that glyph — confirm).
matplotlib.rcParams['axes.unicode_minus']=False

In [3]:
# 데이터 로드 
path = "C:/Users/sk_jo/Desktop/Project/titanic/"

def load_dataset():
    """Read the sales and view-rate workbooks located under the global ``path``.

    Both workbooks are read with the second spreadsheet row as the header
    (``header=1``); the view-rate sheet also uses its first column as index.

    Returns:
        tuple: ``(sales, view_rate)`` DataFrames, in that order.
    """
    sales_file = path + "sales.xlsx"
    view_rate_file = path + "view_rate.xlsx"

    sales_frame = pd.read_excel(sales_file, header=1)
    view_rate_frame = pd.read_excel(view_rate_file, header=1, index_col=0)
    return sales_frame, view_rate_frame

# Unpack a single call: indexing two separate load_dataset() calls parsed both
# Excel workbooks twice.
sales, view_rate = load_dataset()

# 2. 데이터 전처리 

In [4]:
import datetime 
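## TODO  (step numbers match the comments inside pre_processing below)
## 1. Drop intangible ("무형") products from the estimate
## 2. Fill missing exposure minutes (노출(분)) with the value of the preceding row
## 3. Set 취급액 to 0 when 판매단가 > 취급액
## 4. Derive month / day / hour / minute / weekday from 방송일시
## 5. Flag public holidays
## 6. Delete rows whose 취급액 is 50000 or 0
## 7. Compute 판매량 = 취급액 / 판매단가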


def filling_missing_exposure(df):
    """Fill missing exposure minutes from the preceding row of the same broadcast.

    A row whose ``노출(분)`` is NaN receives the value of the row directly
    above it when both rows carry the same ``방송일시``. Fills chain through
    consecutive rows of one broadcast; if the first row of a broadcast is
    itself missing, it stays NaN.

    "Directly above" is positional, so the frame does not need a contiguous
    0..n-1 index. The earlier label-based ``i - 1`` lookup raised ``KeyError``
    as soon as rows had been dropped without resetting the index.

    Args:
        df: DataFrame with ``방송일시`` and ``노출(분)`` columns. Modified in place.

    Returns:
        The same DataFrame object, for chaining.
    """
    broadcast = df["방송일시"]
    # A new run starts whenever the timestamp differs from the row above it.
    run_id = broadcast.ne(broadcast.shift()).cumsum()
    # Forward-fill inside each run only, so values never leak across broadcasts.
    filled = df["노출(분)"].groupby(run_id.values).ffill()
    # Assign by position so gaps or duplicates in the index labels do not matter.
    df["노출(분)"] = filled.values
    return df

def custom(price, total):
    """Zero out a sales amount that is smaller than the unit price.

    Args:
        price: Unit price (판매단가).
        total: Sales amount (취급액).

    Returns:
        0 when ``price > total``; otherwise ``total`` unchanged.
    """
    return 0 if price > total else total
    
def handle_date(df):
    """Parse ``방송일시`` and add the calendar parts derived from it.

    The ``방송일시`` column is converted with ``pd.to_datetime`` and the
    columns ``month``, ``day``, ``hour``, ``minute`` and ``weekday`` are
    added from it.

    Args:
        df: DataFrame with a ``방송일시`` column. Modified in place.

    Returns:
        The same DataFrame object.
    """
    df["방송일시"] = pd.to_datetime(df["방송일시"])
    stamp_parts = df["방송일시"].dt
    for part in ("month", "day", "hour", "minute", "weekday"):
        # Plain arrays are assigned positionally, independent of index labels.
        df[part] = getattr(stamp_parts, part).values
    return df

def deleting_data(df):
    """Remove unusable rows and the two product-code columns, in place.

    Rows whose ``취급액`` equals 50000 or 0 are dropped, then the columns
    ``마더코드`` and ``상품코드`` are removed.

    Args:
        df: DataFrame containing ``취급액``, ``마더코드`` and ``상품코드``.

    Returns:
        The same (mutated) DataFrame object.
    """
    amount = df["취급액"]
    unwanted_rows = df.index[(amount == 50000) | (amount == 0)]
    df.drop(index=unwanted_rows, inplace=True)
    df.drop(columns=["마더코드", "상품코드"], inplace=True)
    return df
    

def handle_holiday(df):
    """Add a 0/1 ``holiday`` column based on the broadcast date.

    A row gets 1 when the calendar date of ``방송일시`` appears in the
    hard-coded ``hol`` list (2019 dates plus 2020-01-01; presumably the
    Korean public holidays for the data period — confirm), otherwise 0.

    The check is vectorised. Unlike the previous row-wise ``strftime`` call
    it accepts a column that is not yet parsed to datetimes, and a missing
    timestamp (NaT) yields 0 instead of raising.

    Args:
        df: DataFrame with a ``방송일시`` column. Modified in place.

    Returns:
        The same DataFrame object.
    """
    hol = ['2019-01-01','2019-02-04','2019-02-05','2019-02-06','2019-03-01','2019-05-05','2019-05-06'
           ,'2019-05-12','2019-06-06','2019-08-15','2019-09-12','2019-09-13','2019-09-14','2019-10-03','2019-10-09'
           ,'2019-12-25','2020-01-01']
    # Reduce every timestamp to its YYYY-MM-DD text; NaT becomes NaN and never matches.
    broadcast_day = pd.to_datetime(df["방송일시"]).dt.strftime("%Y-%m-%d")
    df["holiday"] = broadcast_day.isin(hol).astype("int64")

    return df

    
def pre_processing(df): 
    """Run the full cleaning pipeline on the raw sales frame.

    Steps, in order:
      1. Drop rows whose ``상품군`` is ``"무형"``.
      2. Fill missing ``노출(분)`` (``filling_missing_exposure``).
      3. Set ``취급액`` to 0 where ``판매단가`` exceeds it (``custom``).
      4. Add month / day / hour / minute / weekday (``handle_date``).
      5. Add the ``holiday`` flag (``handle_holiday``).
      6. Drop unusable rows and code columns (``deleting_data``).
      7. Add ``판매량`` = ``취급액`` / ``판매단가``.

    Args:
        df: Raw sales DataFrame. It is not modified; work happens on a
            filtered copy.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # 1. Drop intangible products. Renumber immediately so the helpers below
    #    see a contiguous 0..n-1 index; row-neighbour lookups by label would
    #    otherwise hit the gaps left by this filter.
    df = df[df["상품군"] != "무형"].reset_index(drop=True)

    # 2. Fill missing exposure minutes.
    df = filling_missing_exposure(df)

    # 3. 판매단가 > 취급액 -> 취급액 becomes 0 (rows with 0 are removed in step 6).
    df["취급액"] = df.apply(lambda row: custom(row["판매단가"], row["취급액"]), axis=1)

    # 4. Add month, day, hour, minute, weekday columns.
    df = handle_date(df)

    # 5. Flag holidays.
    df = handle_holiday(df)

    # 6. Delete unusable rows / columns.
    df = deleting_data(df)

    # 7. Units sold.
    df["판매량"] = df["취급액"] / df["판매단가"]

    # Step 6 removed rows again, so renumber once more before returning.
    return df.reset_index(drop=True)

sales = pre_processing(sales)
sales.head()

Unnamed: 0,방송일시,노출(분),상품명,상품군,판매단가,취급액,month,day,hour,minute,weekday,holiday,판매량
0,2019-01-01 06:00:00,20.0,테이트 남성 셀린니트3종,의류,39900,2099000.0,1,1,6,0,1,1,52.61
1,2019-01-01 06:00:00,20.0,테이트 여성 셀린니트3종,의류,39900,4371000.0,1,1,6,0,1,1,109.55
2,2019-01-01 06:20:00,20.0,테이트 남성 셀린니트3종,의류,39900,3262000.0,1,1,6,20,1,1,81.75
3,2019-01-01 06:20:00,20.0,테이트 여성 셀린니트3종,의류,39900,6955000.0,1,1,6,20,1,1,174.31
4,2019-01-01 06:40:00,20.0,테이트 남성 셀린니트3종,의류,39900,6672000.0,1,1,6,40,1,1,167.22


# 3. 외부 데이터 가지고 오기

## 3-1 날씨 데이터

In [6]:
weather = pd.read_csv("./weathers_train.csv")
weather.head()

Unnamed: 0,방송일시,미세먼지(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소농도(ppm),일산화탄소농도(ppm),아황산가스농도(ppm),미세먼지 주의보 및 경보,평균기온(℃),최고기온(℃),최저기온(℃),season,강수량(mm),폭염주의보,한파주의보,대설주의보,호우주의보
0,2019-01-01,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,겨울,0.0,0,1,0,0
1,2019-01-02,33.08,21.36,0.01,0.04,0.68,0.0,0,-4.9,0.2,-8.8,겨울,0.0,0,1,0,0
2,2019-01-03,38.8,23.36,0.01,0.04,0.74,0.01,0,-3.5,3.2,-8.4,겨울,0.0,0,1,0,0
3,2019-01-04,59.44,40.36,0.0,0.06,1.06,0.01,0,-1.1,4.1,-6.2,겨울,0.0,0,1,0,0
4,2019-01-05,65.72,42.52,0.02,0.03,0.64,0.01,1,-2.8,1.1,-5.5,겨울,0.0,0,1,0,0


## 3-2 카테고리 데이터 

## 3-3 시청률 데이터

## 3-4 경제지표 데이터

# 4. Feature engineering 

## 4-1. 상품군 카테고리 데이터

In [11]:
# pd.get_dummies(sales["상품군"],prefix = "상품군")
sales = pd.concat([sales,pd.get_dummies(sales["상품군"],prefix = "상품군")],axis=1)
sales.head()



Unnamed: 0,방송일시,노출(분),상품명,상품군,판매단가,취급액,month,day,hour,minute,...,상품군_가전,상품군_건강기능,상품군_농수축,상품군_생활용품,상품군_속옷,상품군_의류,상품군_이미용,상품군_잡화,상품군_주방,상품군_침구
0,2019-01-01 06:00:00,20.0,테이트 남성 셀린니트3종,의류,39900,2099000.0,1,1,6,0,...,0,0,0,0,0,1,0,0,0,0
1,2019-01-01 06:00:00,20.0,테이트 여성 셀린니트3종,의류,39900,4371000.0,1,1,6,0,...,0,0,0,0,0,1,0,0,0,0
2,2019-01-01 06:20:00,20.0,테이트 남성 셀린니트3종,의류,39900,3262000.0,1,1,6,20,...,0,0,0,0,0,1,0,0,0,0
3,2019-01-01 06:20:00,20.0,테이트 여성 셀린니트3종,의류,39900,6955000.0,1,1,6,20,...,0,0,0,0,0,1,0,0,0,0
4,2019-01-01 06:40:00,20.0,테이트 남성 셀린니트3종,의류,39900,6672000.0,1,1,6,40,...,0,0,0,0,0,1,0,0,0,0


In [69]:
sales.groupby(["hour"]).count()["노출(분)"]

hour
0     1407
1     1498
2       70
6     1329
7     1460
8     1599
9     1553
10    1875
11    1823
12    1746
13    1649
14    1654
15    1592
16    1372
17    1350
18    1162
19    1746
20    2262
21    3013
22    3153
23    2066
Name: 노출(분), dtype: int64

In [32]:
sales["판매량"] = sales["취급액"]/sales["판매단가"]
# Calendar-date key (YYYY-MM-DD text) used to join the sales rows to the daily
# weather table. It was selected below without ever being created.
sales["방송날짜"] = sales["방송일시"].dt.strftime("%Y-%m-%d")
# NOTE(review): this narrows `sales`; cells that need 상품명 / 판매단가 / month
# must run before this one.
sales = sales[["판매량","취급액","weekday","방송일시","방송날짜","holiday"]]
# The weather file keeps its date under "방송일시" (assumed to be YYYY-MM-DD text
# as read by read_csv — confirm); rename it to the shared join key.
weather = weather[["방송일시","season"]].rename(columns={"방송일시": "방송날짜"})

__상품명이 같은데 판매단가가 다른 경우를 확인해서 price adjustment가 있는지 없는지를 확인__

In [16]:
# Number of rows per (product name, unit price) pair; a product that appears
# with more than one price had a price adjustment. (A dangling "." made this
# line a SyntaxError.)
sales.groupby(["상품명","판매단가"]).count()[["month"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,month
상품명,판매단가,Unnamed: 2_level_1
뉴트리원 다이어트 구미,59000,60
[가이거] 블랙 에디션 다이아몬드 워치(남성용),178000,3
[가이거] 블랙 에디션 다이아몬드 워치(여성용),178000,3
[맛있는 제주]손질 생선 3대세트+흑돼지구이(광어+갈치+고등어,59900,3
국내제조 오슬로 IH 프라이팬 세트,99000,12
...,...,...
효재 화원 자수 광목워싱 침구세트 킹,179000,3
히트융 극세사 퍼융기모 남성 동내의 패키지,49900,3
히트융 극세사 퍼융기모 남성 동내의 패키지,69900,42
히트융 극세사 퍼융기모 여성 동내의 패키지,49900,3


In [35]:
new_df = sales[["상품명","판매단가"]]
new_df

Unnamed: 0,상품명,판매단가
0,테이트 남성 셀린니트3종,39900
1,테이트 여성 셀린니트3종,39900
2,테이트 남성 셀린니트3종,39900
3,테이트 여성 셀린니트3종,39900
4,테이트 남성 셀린니트3종,39900
...,...,...
35374,일시불쿠첸압력밥솥 6인용,148000
35375,무이자쿠첸압력밥솥 10인용,178000
35376,일시불쿠첸압력밥솥 10인용,168000
35377,무이자쿠첸압력밥솥 6인용,158000


In [45]:

# Flag every broadcast whose unit price is below the first price seen for the
# same product name.
#   name_dict   : product name -> first observed price (never updated afterwards)
#   change_dict : product name -> most recent price that was below that first price
name_dict = {}
change_dict = {}
adjusted_flags = []
for product_name, product_price in zip(sales["상품명"], sales["판매단가"]):
    if product_name not in name_dict:
        # First sighting defines the reference price.
        name_dict[product_name] = product_price
        adjusted_flags.append(0.0)
    elif product_price >= name_dict[product_name]:
        adjusted_flags.append(0.0)
    else:
        change_dict[product_name] = product_price
        adjusted_flags.append(1.0)

# Write the column once instead of one `sales.loc[i, ...]` assignment per row
# while iterating over the same frame; values stay float (0.0 / 1.0) as before.
sales["price_adj여부"] = adjusted_flags


# Select 판매량 before aggregating: reducing every column first is wasted work
# and raises TypeError on text/datetime columns in pandas >= 2.0.
sales.groupby(["price_adj여부"])["판매량"].sum().iplot(kind="bar")
sales.groupby(["price_adj여부"])["판매량"].std().iplot(kind="bar")

In [52]:
# Select 판매량 before aggregating: reducing every column first is wasted work
# and raises TypeError on text/datetime columns in pandas >= 2.0.
sales.groupby(["price_adj여부"])["판매량"].mean().iplot(kind="bar")
sales.groupby(["price_adj여부"])["판매량"].std().iplot(kind="bar")

In [None]:
# NOTE(review): this cell held only the unfinished identifier "cha" (NameError);
# it most likely was meant to display change_dict — confirm or delete the cell.
change_dict

In [65]:
sales[sales["상품명"]=="에펨 여성 미들퍼부츠"][["판매단가","판매량"]].groupby(["판매단가"]).sum()["판매량"].iplot(kind="bar")

In [55]:
change_dict

{'천수봉명인 선재 전통 메주세트': 96000,
 '마리노블 밍크 롱코트': 399000,
 '에펨 여성 미들퍼부츠': 29800,
 '에펨 남성 미들퍼부츠': 39800,
 '보코 리버시블 무스탕': 69000,
 '메시제이 퀼팅코트+퍼베스트': 59000,
 '보코 폭스퍼구스다운 롱패딩': 159000,
 'K-SWISS 남성약기모팬츠3종': 49900,
 '보코 니트웨어 풀코디': 49000,
 '무이자 LG 울트라HD TV 55UK6800HNC': 1330000,
 '일시불 LG 울트라HD TV 65UK6800HNC': 1690000,
 '무이자 LG 울트라HD TV 65UK6800HNC': 1920000,
 '일시불 LG 울트라HD TV 70UK7400KNA': 2400000,
 '무이자 LG 울트라HD TV 70UK7400KNA': 2690000,
 '그렉노먼 남성 봄 컬렉션 티셔츠 5종': 49900,
 '그렉노먼 여성 봄 컬렉션 티셔츠 5종': 49900,
 '엔셀라두스 밴딩팬츠 3종': 29000,
 'CERINI by PAT 남성 어반 카라 티셔츠 4종': 59900,
 'K-SWISS 남성에어핏트랙수트': 79900,
 'K-SWISS 여성에어핏트랙수트': 79900,
 '쿠미투니카 퍼펙트클린 레이스 브라팬티': 89900,
 'NNF SS트레이닝 세트': 39000,
 '오모떼 리얼스킨 쉐이핑 브라팬티': 79000,
 'K-SWISS 남성이너티셔츠5종': 59900,
 'K-SWISS 여성이너티셔츠5종': 59900,
 'CERINI by PAT 남성 캐주얼 셔츠 4종': 59900,
 '스텔라테일러 컴포트 스트레치 팬츠 3종': 29800,
 '[안드레아바나]리얼카이만 악어 바디 숄더백': 158000,
 '[안드레아바나]리얼카이만 악어 테일 숄더백': 158000,
 '네페르티티 유기농 퀸 석류즙 6박스': 77000,
 '크로커다일 감탄 레이스 브라렛 패키지': 69900,
 '헤드 파이로라이트 드로즈 10종 패키지': 69000}

In [62]:
new_df = pd.merge(sales,weather,how = "inner", on = "방송날짜")

In [63]:
new_df["방송시간"] = new_df["방송일시"].apply(lambda x : datetime.datetime.strftime(x,"%H:%M"))
new_df.head()

Unnamed: 0,판매량,취급액,weekday,방송일시,방송날짜,holiday,season,방송시간
0,52.61,2099000.0,1,2019-01-01 06:00:00,2019-01-01,1,겨울,06:00
1,109.55,4371000.0,1,2019-01-01 06:00:00,2019-01-01,1,겨울,06:00
2,81.75,3262000.0,1,2019-01-01 06:20:00,2019-01-01,1,겨울,06:20
3,174.31,6955000.0,1,2019-01-01 06:20:00,2019-01-01,1,겨울,06:20
4,167.22,6672000.0,1,2019-01-01 06:40:00,2019-01-01,1,겨울,06:40


In [64]:
#쉬는날과 안쉬는날의 데이터로 만들어 봅니다.
def check_off(weekday, holiday):
    """Return 1 for a non-working day and 0 otherwise.

    A day counts as off when ``weekday`` is 5 or 6, or when ``holiday``
    equals 1.
    """
    is_weekend = weekday in (5, 6)
    return int(is_weekend or holiday == 1)
    
new_df["Day_off"] = new_df.apply(lambda x: check_off(x["weekday"],x["holiday"]) ,axis=1)
new_df.head()    

Unnamed: 0,판매량,취급액,weekday,방송일시,방송날짜,holiday,season,방송시간,Day_off
0,52.61,2099000.0,1,2019-01-01 06:00:00,2019-01-01,1,겨울,06:00,1
1,109.55,4371000.0,1,2019-01-01 06:00:00,2019-01-01,1,겨울,06:00,1
2,81.75,3262000.0,1,2019-01-01 06:20:00,2019-01-01,1,겨울,06:20,1
3,174.31,6955000.0,1,2019-01-01 06:20:00,2019-01-01,1,겨울,06:20,1
4,167.22,6672000.0,1,2019-01-01 06:40:00,2019-01-01,1,겨울,06:40,1


season별 주중 주말 데이터의 시간대별 판매량 및 취급액 차이

In [65]:
new_df["hour"] = new_df["방송시간"].apply(lambda x:x.split(":")[0] )
new_df.head()

Unnamed: 0,판매량,취급액,weekday,방송일시,방송날짜,holiday,season,방송시간,Day_off,hour
0,52.61,2099000.0,1,2019-01-01 06:00:00,2019-01-01,1,겨울,06:00,1,6
1,109.55,4371000.0,1,2019-01-01 06:00:00,2019-01-01,1,겨울,06:00,1,6
2,81.75,3262000.0,1,2019-01-01 06:20:00,2019-01-01,1,겨울,06:20,1,6
3,174.31,6955000.0,1,2019-01-01 06:20:00,2019-01-01,1,겨울,06:20,1,6
4,167.22,6672000.0,1,2019-01-01 06:40:00,2019-01-01,1,겨울,06:40,1,6


In [158]:
def visualize(df,season, y = "판매량"):
    """Plot hourly totals of ``y`` for one season, split by working day / day off.

    Two bar charts are drawn with ``iplot``: one for rows with
    ``Day_off == 0`` and one (blue) for rows with ``Day_off == 1``.

    Args:
        df: DataFrame with ``season``, ``Day_off``, ``hour`` and ``y`` columns.
        season: Value of the ``season`` column to keep.
        y: Name of the column to total per hour.

    Returns:
        tuple: ``(working_day_totals, day_off_totals)``, each a Series
        indexed by ``hour``.
    """
    in_season = df[df["season"] == season]

    # Select `y` before summing: summing every column first is wasted work and
    # raises TypeError on datetime columns in pandas >= 2.0. Each total is
    # computed once and reused for both the plot and the return value.
    working_totals = in_season[in_season["Day_off"] == 0].groupby("hour")[y].sum()
    day_off_totals = in_season[in_season["Day_off"] == 1].groupby("hour")[y].sum()

    working_totals.iplot(kind = "bar",title= "{}의 쉬지 않는날 시간별 {} 총합".format(season,y))
    day_off_totals.iplot(kind = "bar",colors = "blue",title= "{}의 쉬는날 시간별 {} 총합".format(season,y))

    return working_totals, day_off_totals
        

day_on, day_off = visualize(new_df,"가을",y="취급액")

In [154]:
day_on, day_off = visualize(new_df,"겨울",y="취급액")

In [49]:
# Rank hours by total, highest first. to_frame() labels the column with the
# Series' own name, so no hard-coded label has to match what visualize() returned
# (a mismatching label would produce an all-NaN column).
day_on.sort_values(ascending=False).to_frame()

Unnamed: 0_level_0,취급액
hour,Unnamed: 1_level_1
21,11062073000.0
20,9929959000.0
22,9537033000.0
23,8039051000.0
19,7879312000.0
18,7193680000.0
17,7053369000.0
16,6960652000.0
11,6888576000.0
10,6869511000.0


In [84]:
def visualize_ratio(df,season, y = "판매량"):
    """Plot each hour's share of ``y`` for one season, split by ``Day_off``.

    For rows with ``Day_off == 0`` and ``Day_off == 1`` separately, the hourly
    totals of ``y`` are divided by that group's grand total and drawn as bar
    charts with ``iplot``.

    NOTE(review): the chart titles read "주중/주말 ... 총합" although the bars
    are shares and the split is by ``Day_off``; titles are kept
    unchanged here.

    Args:
        df: DataFrame with ``season``, ``Day_off``, ``hour`` and ``y`` columns.
        season: Value of the ``season`` column to keep.
        y: Name of the column to aggregate.

    Returns:
        tuple: ``(working_day_totals, day_off_totals)`` — the absolute hourly
        totals (not the plotted shares), each a Series indexed by ``hour``.
    """
    in_season = df[df["season"] == season]

    # Select `y` before summing: summing every column first is wasted work and
    # raises TypeError on datetime columns in pandas >= 2.0. Each total is
    # computed once and reused.
    working_totals = in_season[in_season["Day_off"] == 0].groupby("hour")[y].sum()
    day_off_totals = in_season[in_season["Day_off"] == 1].groupby("hour")[y].sum()

    (working_totals / working_totals.sum()).iplot(kind = "bar",title= "{}의 주중 시간별 {} 총합".format(season,y))
    (day_off_totals / day_off_totals.sum()).iplot(kind = "bar",colors = "blue",title= "{}의 주말 시간별 {} 총합".format(season,y))

    return working_totals, day_off_totals
        

day_on, day_off = visualize_ratio(new_df,"겨울",y="취급액")

In [52]:
# Rank hours by total, highest first. to_frame() labels the column with the
# Series' own name; the previous hard-coded "판매량" label yields an all-NaN
# column whenever day_on was computed for a different column (e.g. 취급액).
day_on.sort_values(ascending=False).to_frame()

Unnamed: 0_level_0,판매량
hour,Unnamed: 1_level_1
17,100686.9
18,88702.23
16,87447.69
11,78470.99
19,77719.99
10,70340.79
13,60601.58
20,59892.5
9,59842.47
15,56922.43


In [139]:
new_df = new_df[new_df["season"].isin(["겨울","가을"])].reset_index(drop=True)

In [140]:
new_df.drop(new_df[new_df["취급액"]==0].index,axis=0,inplace=True)
new_df.drop(new_df[new_df["취급액"]==50000].index,axis=0,inplace=True)
new_df[new_df["취급액"]==0]

Unnamed: 0,판매량,취급액,weekday,방송일시,방송날짜,holiday,season,방송시간,Day_off,hour,Prime


In [141]:
def prime_or_not(season, hour, day_off):
    """Return 1 when ``hour`` is a prime-time slot, else 0.

    Prime hours are compared as strings:
      * days off (``day_off == 1``): "16", "17", "21", "22"
      * any other ``day_off`` value: "20", "21", "22"

    The rule is identical for "가을" and "겨울"; any other season returns
    ``None``.
    """
    if season not in ("가을", "겨울"):
        return None

    if day_off == 1:
        prime_hours = ("16", "17", "21", "22")
    else:
        prime_hours = ("20", "21", "22")
    return 1 if hour in prime_hours else 0

new_df["Prime"] = new_df.apply(lambda x : prime_or_not(x["season"],x["hour"],x["Day_off"]),axis=1)
new_df.head()

Unnamed: 0,판매량,취급액,weekday,방송일시,방송날짜,holiday,season,방송시간,Day_off,hour,Prime
0,52.61,2099000.0,1,2019-01-01 06:00:00,2019-01-01,1,겨울,06:00,1,6,0
1,109.55,4371000.0,1,2019-01-01 06:00:00,2019-01-01,1,겨울,06:00,1,6,0
2,81.75,3262000.0,1,2019-01-01 06:20:00,2019-01-01,1,겨울,06:20,1,6,0
3,174.31,6955000.0,1,2019-01-01 06:20:00,2019-01-01,1,겨울,06:20,1,6,0
4,167.22,6672000.0,1,2019-01-01 06:40:00,2019-01-01,1,겨울,06:40,1,6,0


In [156]:
import seaborn as sns 

# Select 취급액 before aggregating: mean()/std() over every column is wasted work
# and raises TypeError on text/datetime columns in pandas >= 2.0.
new_df.groupby(["season","Prime"])["취급액"].mean().iplot(kind="bar")
new_df.groupby(["season","Prime"])["취급액"].std().iplot(kind="bar",color = "blue")

In [165]:
# Summary statistics of 취급액 per (weekday, hour). The target is named .xlsx, so
# write a real workbook; to_csv put CSV text into a file Excel cannot open as
# a workbook. Requires an Excel writer engine such as openpyxl.
new_df.groupby(["weekday","hour"])["취급액"].describe().to_excel("연습.xlsx")