# 1. 데이터 로드

In [36]:
import pandas as pd
pd.set_option('display.float_format', '{:.2f}'.format) # always render floats with two decimals
pd.set_option("display.max_columns",100)

import numpy as np 

# DataFrame plotting helpers
# NOTE(review): plotly >= 4 moved `plotly.plotly` to the `chart_studio` package — confirm the installed version.
import cufflinks as cf 
import plotly.plotly as py 
import matplotlib.pyplot as plt 
import seaborn as sns 

# Silence every warning for the rest of the session
import warnings ; warnings.filterwarnings("ignore")

# Plot style; cufflinks is switched to offline mode
plt.style.use("ggplot")
cf.go_offline()

In [37]:
# Configure a Korean-capable font so Hangul labels render in matplotlib figures.
import matplotlib
from matplotlib import font_manager, rc 
import platform 

# Choose the font per operating system: the Windows font file only exists on
# Windows, so loading it unconditionally fails everywhere else.
_system_name = platform.system()
if _system_name == "Windows":
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
elif _system_name == "Darwin":
    font_name = "AppleGothic"
else:
    # Assumes the Nanum fonts are installed on this machine — TODO confirm.
    font_name = "NanumGothic"
rc("font", family=font_name)
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False

In [38]:
# Load the raw training data
path = "C:/Users/sk_jo/Desktop/Project/titanic/"


def load_dataset():
    """Read the training sales and view-rate workbooks stored under ``path``.

    Both workbooks take their header from the second spreadsheet row
    (``header=1``); the view-rate workbook also uses its first column as the
    index.

    Returns:
        tuple: ``(sales, view_rate)`` DataFrames, in that order.
    """
    sales_frame = pd.read_excel(path + "sales.xlsx", header=1)
    view_rate_frame = pd.read_excel(
        path + "view_rate.xlsx",
        header=1,
        index_col=0,
    )
    return sales_frame, view_rate_frame

# Load once and unpack both frames; calling load_dataset() twice would read
# both workbooks from disk twice.
sales, view_rate = load_dataset()

# 2. 데이터 전처리 

In [39]:
import datetime 
## TODO 
## 1.무형 상품은 추정 제외 
## 2.결측된 노출분 채우기 (직전의 값 집어넣기) 
## 3. 판매단가 > 취급액이면 취급액 1 
## 4. 취급액이 50000인 경우 1로 처리
## 해당 경우 판매가 이뤄지지 않아서 해당 값을 부여했다고 했기 때문에 안 팔리는 것은 안 팔릴 것이라고 예측을 해줘야 한다.
## 0일 경우 metric이 MAPE일 때 nan 값이 발생하기 때문에 1로 scaling
## 5. 시간 데이터 처리
### 



def filling_missing_exposure(df):
    """Fill missing exposure minutes from the row directly above.

    A missing ``노출(분)`` value is replaced by the value of the preceding
    row when both rows share the same ``방송일시``. Fills chain, so a run of
    missing rows for one broadcast all receive the first known value. The
    first row is never filled because it has no predecessor.

    The preceding row is found by position rather than by index label, so
    the function also works when the index has gaps (for example after rows
    were dropped) or is not a 0..n-1 range.

    Args:
        df: Frame with ``노출(분)`` and ``방송일시`` columns.

    Returns:
        The same frame object, updated in place.
    """
    # Nothing to fill: leave the column (and its dtype) untouched.
    if not df["노출(분)"].isna().any():
        return df

    exposure = df["노출(분)"].to_numpy(dtype=float, copy=True)
    broadcast_times = df["방송일시"].to_numpy()
    for pos in range(1, len(exposure)):
        # exposure[pos - 1] may itself have been filled on the previous step,
        # which is what lets a value propagate through consecutive gaps.
        if np.isnan(exposure[pos]) and broadcast_times[pos] == broadcast_times[pos - 1]:
            exposure[pos] = exposure[pos - 1]

    df["노출(분)"] = exposure
    return df

def custom(price, total):
    """Return 1 when the unit price exceeds the sales amount, else the amount.

    Implements preprocessing step 3: a sales amount smaller than the price of
    a single unit is replaced by 1.
    """
    return 1 if price > total else total
    
def handle_date(df):
    """Parse ``방송일시`` and add month/day/hour/minute/weekday columns.

    Implements preprocessing step 5. The frame is modified in place and also
    returned.
    """
    df["방송일시"] = pd.to_datetime(df["방송일시"])
    stamps = pd.DatetimeIndex(df["방송일시"])
    # Each new column is named after the DatetimeIndex attribute it copies.
    for part in ("month", "day", "hour", "minute", "weekday"):
        df[part] = getattr(stamps, part)

    return df

def deleting_data(df):
    """Replace the 50000 placeholder amount with 1 and drop the code columns.

    Values of 50000 in ``취급액`` become 1, then the ``마더코드`` and
    ``상품코드`` columns are removed. A new frame is returned; the input frame
    is left as it was.
    """
    replaced = df.replace({"취급액": 50000}, {"취급액": 1})
    return replaced.drop(columns=["마더코드", "상품코드"])
    

def handle_holiday(df, holidays=None):
    """Add a 0/1 ``holiday`` column marking broadcasts on a public holiday.

    Args:
        df: Frame with a ``방송일시`` column of broadcast timestamps.
        holidays: Optional iterable of ``"YYYY-MM-DD"`` strings. When omitted,
            the built-in list covering 2019-01-01 through 2020-01-01 is used.

    Returns:
        The same frame object with the ``holiday`` column added in place.
    """
    if holidays is None:
        holidays = ['2019-01-01','2019-02-04','2019-02-05','2019-02-06','2019-03-01','2019-05-05','2019-05-06'
                    ,'2019-05-12','2019-06-06','2019-08-15','2019-09-12','2019-09-13','2019-09-14','2019-10-03','2019-10-09'
                    ,'2019-12-25','2020-01-01']

    # Format every timestamp as a date string in one vectorized pass instead
    # of calling strftime row by row; missing timestamps simply map to 0.
    broadcast_dates = pd.to_datetime(df["방송일시"]).dt.strftime("%Y-%m-%d")
    df["holiday"] = broadcast_dates.isin(list(holidays)).astype("int64")

    return df

    
def pre_processing(df): 
    """Clean the raw training sales frame and derive calendar features.

    Steps, in order: drop intangible ("무형") products, fill missing exposure
    minutes, replace implausible sales amounts, add date parts, flag
    holidays, then replace the 50000 placeholder amount and drop the code
    columns.

    Args:
        df: Raw sales frame as loaded from the workbook.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # 1. Exclude intangible products. The index is renumbered immediately so
    #    that, for the exposure fill below, every row's predecessor is the row
    #    directly above it; gaps left by the drop would break that assumption.
    df = df.drop(df[df["상품군"]=="무형"].index,axis = 0).reset_index(drop=True)
    # 2. Fill missing exposure minutes from the previous row of the same broadcast.
    df = filling_missing_exposure(df)

    # 3. Unit price > sales amount  ->  sales amount becomes 1.
    df["취급액"] = df.apply(lambda x : custom(x["판매단가"], x["취급액"]),axis=1)
    # 4. Add month, day, hour, minute and weekday columns.
    df = handle_date(df)
    # 5. Flag public holidays.
    df = handle_holiday(df)

    # 6. Replace the 50000 placeholder amount and drop the code columns.
    df = deleting_data(df)

    # Renumber the rows before handing the frame back.
    df = df.reset_index(drop=True)

    return df 

# Run the whole cleaning pipeline on the training sales frame and preview the result.
sales = pre_processing(sales)
sales.head()

Unnamed: 0,방송일시,노출(분),상품명,상품군,판매단가,취급액,month,day,hour,minute,weekday,holiday
0,2019-01-01 06:00:00,20.0,테이트 남성 셀린니트3종,의류,39900,2099000.0,1,1,6,0,1,1
1,2019-01-01 06:00:00,20.0,테이트 여성 셀린니트3종,의류,39900,4371000.0,1,1,6,0,1,1
2,2019-01-01 06:20:00,20.0,테이트 남성 셀린니트3종,의류,39900,3262000.0,1,1,6,20,1,1
3,2019-01-01 06:20:00,20.0,테이트 여성 셀린니트3종,의류,39900,6955000.0,1,1,6,20,1,1
4,2019-01-01 06:40:00,20.0,테이트 남성 셀린니트3종,의류,39900,6672000.0,1,1,6,40,1,1


# 3. 외부 데이터 가지고 오기

## 3-1 날씨 데이터

출처 : 케이웨더 기상청 자료

In [40]:
# Date-only join key (YYYY-MM-DD) used to merge the daily weather table.
# Vectorized .dt.strftime replaces the per-row datetime.strftime call.
sales["방송날짜"] = pd.to_datetime(sales["방송일시"]).dt.strftime("%Y-%m-%d")

In [41]:
# Weather features for the training period, read from CSV.
weather = pd.read_csv("./data_train/weathers_train.csv")
# Rename the date column so it matches the "방송날짜" join key created on sales.
weather.rename(columns = {"방송일시":"방송날짜"},inplace=True)
weather.head()

Unnamed: 0,방송날짜,미세먼지(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소농도(ppm),일산화탄소농도(ppm),아황산가스농도(ppm),미세먼지 주의보 및 경보,평균기온(℃),최고기온(℃),최저기온(℃),season,강수량(mm),폭염주의보,한파주의보,대설주의보,호우주의보
0,2019-01-01,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,겨울,0.0,0,1,0,0
1,2019-01-02,33.08,21.36,0.01,0.04,0.68,0.0,0,-4.9,0.2,-8.8,겨울,0.0,0,1,0,0
2,2019-01-03,38.8,23.36,0.01,0.04,0.74,0.01,0,-3.5,3.2,-8.4,겨울,0.0,0,1,0,0
3,2019-01-04,59.44,40.36,0.0,0.06,1.06,0.01,0,-1.1,4.1,-6.2,겨울,0.0,0,1,0,0
4,2019-01-05,65.72,42.52,0.02,0.03,0.64,0.01,1,-2.8,1.1,-5.5,겨울,0.0,0,1,0,0


In [42]:
# NOTE(review): repeats the rename already done right after loading weather;
# the column is named "방송날짜" by then, so this call changes nothing.
weather.rename(columns = {"방송일시":"방송날짜"},inplace=True)

In [43]:
# Inner join on the date key: sales rows whose date has no weather record are dropped.
sales = sales.merge(weather,on ="방송날짜",how = "inner")
sales.shape

(37368, 29)

weather와 inner merge를 하므로, 날씨 데이터에 없는 2020년 1월 1일 방송 데이터가 사라진다!

In [44]:
# The string join key is only needed for the weather merge; remove it afterwards.
sales.drop(["방송날짜"],axis=1,inplace=True)

## 3-2 카테고리 데이터 

출처 : 쿠팡 검색엔진 서비스

In [45]:
# Product-name -> category lookup table. The first sheet column is read as the
# index and then discarded in favour of a fresh 0..n-1 index.
category = pd.read_excel("./data_train/category_train.xlsx",index_col=[0]).reset_index(drop=True)
category.head()

Unnamed: 0,상품명,상품군,cat_1,cat_2,cat_3
0,보루네오 루나 유로탑 멀티수납형 LED 침대 SS 슈퍼싱글,가구,결혼준비,가구/침구,침실가구
1,보루네오 루나 유로탑 멀티수납형 LED 침대 Q 퀸,가구,결혼준비,가구/침구,침실가구
2,보루네오 루나 유로탑 멀티수납형 LED 침대 K 킹,가구,결혼준비,가구/침구,침실가구
3,(일) 삼익가구 LED 제니비 서랍형 침대 SS,가구,결혼준비,가구/침구,침실가구
4,(무) 삼익가구 LED 제니비 서랍형 침대 SS,가구,결혼준비,가구/침구,침실가구


In [46]:
def category_extract(df,product_name):
    """Look up the ``cat_3`` category of a product by exact name.

    Args:
        df: Lookup table with ``상품명`` and ``cat_3`` columns.
        product_name: Product name to search for (exact match).

    Returns:
        The ``cat_3`` value of the first matching row, or ``''`` when no row
        has that product name.

    Only the "no match" case falls back to ``''``. Other problems, such as a
    missing column, now raise instead of being hidden by a bare ``except``.
    """
    matches = df.loc[df["상품명"] == product_name, "cat_3"]
    if matches.empty:
        return ''
    return matches.iloc[0]

# Attach each product's cat_3 category as "상품군-중" ('' when the lookup finds nothing).
# category_extract filters the whole category table once per sales row.
sales["상품군-중"] = sales.apply(lambda x : category_extract(category,x["상품명"]),axis=1)
sales.shape

(37368, 29)

## 3-3 시청률 데이터

출처 : 구글 검색 데이터 + 닐슨 데이터 

In [47]:
# View-rate features from a cp949-encoded CSV; the first column becomes the index.
view_rate = pd.read_csv("./data_train/view_rate_train.csv",encoding="cp949",index_col=[0])
# Keep only the timestamp and the three view-rate columns.
view_rate = view_rate[["방송일시","viewrate1","viewrate2","viewrate3"]]
view_rate["방송일시"] = pd.to_datetime(view_rate["방송일시"])
# Drop index labels 37368-37371 — presumably the 2020-01-01 rows that the inner
# weather merge removed from sales, so both frames keep 37368 rows. TODO confirm.
view_rate.drop([37368,37369,37370,37371],axis=0,inplace=True)
view_rate.tail()

Unnamed: 0,방송일시,viewrate1,viewrate2,viewrate3
37363,2019-12-31 23:20:00,0,0,0
37364,2019-12-31 23:40:00,0,0,0
37365,2019-12-31 23:40:00,0,0,0
37366,2019-12-31 23:40:00,0,0,0
37367,2019-12-31 23:40:00,0,0,0


In [48]:
# Compare the row counts of the two frames before the view-rate columns are copied over.
print(view_rate.shape)
print(sales.shape)

(37368, 4)
(37368, 29)


In [49]:
# Copy every view-rate column except the timestamp onto sales. `.values` strips
# the index, so rows are matched purely by position: both frames must have the
# same number of rows in the same order.
for column_name in  view_rate.columns.difference(["방송일시"]):
    sales.loc[:,column_name] = view_rate.loc[:,column_name].values
    
sales.head()

Unnamed: 0,방송일시,노출(분),상품명,상품군,판매단가,취급액,month,day,hour,minute,weekday,holiday,미세먼지(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소농도(ppm),일산화탄소농도(ppm),아황산가스농도(ppm),미세먼지 주의보 및 경보,평균기온(℃),최고기온(℃),최저기온(℃),season,강수량(mm),폭염주의보,한파주의보,대설주의보,호우주의보,상품군-중,viewrate1,viewrate2,viewrate3
0,2019-01-01 06:00:00,20.0,테이트 남성 셀린니트3종,의류,39900,2099000.0,1,1,6,0,1,1,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,겨울,0.0,0,1,0,0,니트,0,0,0
1,2019-01-01 06:00:00,20.0,테이트 여성 셀린니트3종,의류,39900,4371000.0,1,1,6,0,1,1,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,겨울,0.0,0,1,0,0,니트,0,0,0
2,2019-01-01 06:20:00,20.0,테이트 남성 셀린니트3종,의류,39900,3262000.0,1,1,6,20,1,1,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,겨울,0.0,0,1,0,0,니트,0,0,0
3,2019-01-01 06:20:00,20.0,테이트 여성 셀린니트3종,의류,39900,6955000.0,1,1,6,20,1,1,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,겨울,0.0,0,1,0,0,니트,0,0,0
4,2019-01-01 06:40:00,20.0,테이트 남성 셀린니트3종,의류,39900,6672000.0,1,1,6,40,1,1,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,겨울,0.0,0,1,0,0,니트,0,0,0


In [50]:
# Shape of sales after the view-rate columns were attached.
sales.shape

(37368, 32)

## 3-4 경제지표 데이터

출처 : 한국은행 경제통계시스템

### 개인 신용카드 사용액

In [51]:
# Monthly economic indicators, one entry per calendar month (1-12).
# 'ec_credit' holds the personal credit-card spending figures; the unit is not stated here.
economic_indicators = {'month':[1,2,3,4,5,6,7,8,9,10,11,12],
    'ec_credit' :[52039328,45466358,51678679,50723386,53425506,50508891,53730267,52485647,50893769,54017093,53814165, 55909956]}

### 소매 판매액 지수(경기 동행지표) 

In [52]:
# Retail sales index (a coincident economic indicator), one value per month.
economic_indicators["retail_index"] = [109.8,99.8,116.0,111.4,116.6,110.0,110.5,111.4,113.3,115.8,120.5,121.6]

### ns 홈쇼핑 분기 매출액(단위 백만원) 

In [53]:
# NS Home Shopping quarterly revenue (unit: million KRW); each quarter's figure
# is repeated for its three months so the list lines up with 'month'.
economic_indicators["sales_by_quarter"] = [117699,117699,117699,124570,124570,124570,124049,124049,124049,131669,131669,131669]

### 경기동행지수순환변동치 

In [54]:
# Cyclical component of the coincident composite index, one value per month.
economic_indicators["coincident_index"] = [100.2,99.8,99.7,99.7,99.9,99.9,99.8,99.9,100.0,100.0,99.9,100.2]

In [55]:
# Combine the data: build the indicator table and join it to sales on the month
# number, so every sales row receives the indicators of its calendar month.
ec_df = pd.DataFrame(economic_indicators)
sales = sales.merge(ec_df,on="month")

# 4. Feature engineering 

In [56]:
# Inspect column dtypes and non-null counts before feature engineering.
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37368 entries, 0 to 37367
Data columns (total 36 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   방송일시              37368 non-null  datetime64[ns]
 1   노출(분)             37368 non-null  float64       
 2   상품명               37368 non-null  object        
 3   상품군               37368 non-null  object        
 4   판매단가              37368 non-null  int64         
 5   취급액               37368 non-null  float64       
 6   month             37368 non-null  int64         
 7   day               37368 non-null  int64         
 8   hour              37368 non-null  int64         
 9   minute            37368 non-null  int64         
 10  weekday           37368 non-null  int64         
 11  holiday           37368 non-null  int64         
 12  미세먼지(㎍/㎥)         37368 non-null  float64       
 13  초미세먼지(㎍/㎥)        37368 non-null  float64       
 14  오존(ppm)           3736

## 4-1. 카테고리 데이터에 대한 처리 

- 상품군, season, 상품군-중에 대한 라벨 인코딩 작업 필요

In [57]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the three categorical columns. Each column gets its own
# encoder, kept under its own name so the learned classes stay available.

# Product group
encoder_category = LabelEncoder()
sales["상품군"] = encoder_category.fit_transform(sales["상품군"])
print("상품군 인코딩 클래스",encoder_category.classes_)

# Season
encoder_season = LabelEncoder()
sales["season"] = encoder_season.fit_transform(sales["season"])
print("season 인코딩 클래스",encoder_season.classes_)

# Mid-level product category
encoder_category_2 = LabelEncoder()
sales["상품군-중"] = encoder_category_2.fit_transform(sales["상품군-중"])
print("상품군-중 인코딩 클래스",encoder_category_2.classes_)


상품군 인코딩 클래스 ['가구' '가전' '건강기능' '농수축' '생활용품' '속옷' '의류' '이미용' '잡화' '주방' '침구']
season 인코딩 클래스 ['가을' '겨울' '봄' '여름']
상품군-중 인코딩 클래스 ['TV' '가구/침구' '가방' '가전디지털' '간편식' '거실가구' '건강식품' '건어물' '공기청정기' '긴소매' '남성속옷'
 '남성스포츠의류' '남성팬티' '남성화장품' '냉장고' '네일' '노트북' '니트' '다이어트식품' '두부/콩나물' '두유'
 '드레스룸' '러그/카페트/거실화' '런닝' '레깅스' '마스크' '머플러/스카프' '메이크업' '모자' '바디' '바지'
 '발열내의' '변기/비데용품' '보정속옷' '뷰티소품' '브라팬티' '브래지어' '블라우스' '설탕/소금/조미료' '세탁기/건조기'
 '세트류' '속옷' '속옷/잠옷' '수납가구' '수산물' '수전용품' '스킨케어' '스포츠/레저' '시계/쥬얼리' '신발'
 '아우터' '아이웨어' '언더셔츠' '에어컨' '에이컨' '여성속옷' '여성스포츠의류' '여행용품' '원피스' '음료제품'
 '의류관리기' '인테리어' '작물' '잡화' '장갑' '조미료' '주방가전' '주방용품' '주방잡화' '지갑/벨트'
 '차량용 생활용품' '청바지' '청소기' '청소용품' '침구세트' '침실가구' '커튼/블라인드' '트레이닝복' '티셔츠'
 '파우더룸' '패딩' '헤어' '헬스/건강용품']


## 4-2 요일별 prime time feature 생성

-> (month, day, hour, weekday)별 취급액 합계를 먼저 구한 뒤, 이를 (hour, weekday) 기준으로 평균 내어 요일별·시간대별 평균 매출(w_prime)을 prime time 지표로 사용한다.

In [60]:
# Display the hour x weekday table. hour_df is created by cell In [58], which
# appears below this cell: the cells were executed out of document order.
hour_df

Unnamed: 0,hour,weekday,w_prime
0,0,0,91471780.52
1,0,1,70469061.57
2,0,2,84716898.33
3,0,3,87790255.98
4,0,4,86329191.77
...,...,...,...
142,23,2,126400392.82
143,23,3,115897745.19
144,23,4,130068143.20
145,23,5,112152961.90


In [58]:
# Step 1: total sales amount of every individual (month, day, hour, weekday) slot.
slot_keys = ["month","day","hour","weekday"]
process_1 = sales.groupby(slot_keys)[["취급액"]].sum().reset_index()

# Step 2: average those slot totals per (hour, weekday) pair and call the result w_prime.
prime_keys = ["hour","weekday"]
hour_df = process_1.groupby(prime_keys)[["취급액"]].mean()
hour_df = hour_df.reset_index().rename(columns= {"취급액":"w_prime"})

# Step 3: attach w_prime to every sales row that shares the (hour, weekday) pair.
sales = sales.merge(hour_df, on=prime_keys)
sales.head()

Unnamed: 0,방송일시,노출(분),상품명,상품군,판매단가,취급액,month,day,hour,minute,weekday,holiday,미세먼지(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소농도(ppm),일산화탄소농도(ppm),아황산가스농도(ppm),미세먼지 주의보 및 경보,평균기온(℃),최고기온(℃),최저기온(℃),season,강수량(mm),폭염주의보,한파주의보,대설주의보,호우주의보,상품군-중,viewrate1,viewrate2,viewrate3,ec_credit,retail_index,sales_by_quarter,coincident_index,w_prime
0,2019-01-01 06:00:00,20.0,테이트 남성 셀린니트3종,6,39900,2099000.0,1,1,6,0,1,1,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,1,0.0,0,1,0,0,17,0,0,0,52039328,109.8,117699,100.2,47716826.92
1,2019-01-01 06:00:00,20.0,테이트 여성 셀린니트3종,6,39900,4371000.0,1,1,6,0,1,1,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,1,0.0,0,1,0,0,17,0,0,0,52039328,109.8,117699,100.2,47716826.92
2,2019-01-01 06:20:00,20.0,테이트 남성 셀린니트3종,6,39900,3262000.0,1,1,6,20,1,1,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,1,0.0,0,1,0,0,17,0,0,0,52039328,109.8,117699,100.2,47716826.92
3,2019-01-01 06:20:00,20.0,테이트 여성 셀린니트3종,6,39900,6955000.0,1,1,6,20,1,1,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,1,0.0,0,1,0,0,17,0,0,0,52039328,109.8,117699,100.2,47716826.92
4,2019-01-01 06:40:00,20.0,테이트 남성 셀린니트3종,6,39900,6672000.0,1,1,6,40,1,1,39.36,26.12,0.01,0.03,0.69,0.0,0,-5.0,-0.6,-8.2,1,0.0,0,1,0,0,17,0,0,0,52039328,109.8,117699,100.2,47716826.92


## 4-3 브랜드와 사이즈 (김찬영) 

In [59]:
# Save the intermediate training frame. The `encoding` keyword was removed from
# the call: DataFrame.to_excel deprecated it in pandas 1.5 and rejects it from
# pandas 2.0 on, and the Excel writers handle Unicode without it.
sales.to_excel("C:/Users/sk_jo/Desktop/sales_중간.xlsx")

----

# test 데이터 처리 

In [168]:
path = "C:/Users/sk_jo/Desktop/Project/titanic/"


def load_dataset():
    """Read the test-period sales workbook stored under ``path``.

    The header is taken from the second spreadsheet row (``header=1``).
    Unlike the training loader, only the sales frame is returned.
    """
    return pd.read_excel(path + "sales_test.xlsx", header=1)


sales = load_dataset()

import datetime 
## TODO 
## 1.무형 상품은 추정 제외 
## 2.결측된 노출분 채우기 (직전의 값 집어넣기) 
## 3. 판매단가 > 취급액이면 취급액 1 
## 4. 취급액이 50000인 경우 1로 처리
## 해당 경우 판매가 이뤄지지 않아서 해당 값을 부여했다고 했기 때문에 안 팔리는 것은 안 팔릴 것이라고 예측을 해줘야 한다.
## 0일 경우 metric이 MAPE일 때 nan 값이 발생하기 때문에 1로 scaling
## 5. 시간 데이터 처리
### 



def filling_missing_exposure(df):
    """Fill missing exposure minutes from the row directly above.

    A missing ``노출(분)`` value is replaced by the value of the preceding
    row when both rows share the same ``방송일시``. Fills chain, so a run of
    missing rows for one broadcast all receive the first known value. The
    first row is never filled because it has no predecessor.

    The preceding row is found by position rather than by index label, so
    the function also works when the index has gaps (for example after rows
    were dropped) or is not a 0..n-1 range.

    Args:
        df: Frame with ``노출(분)`` and ``방송일시`` columns.

    Returns:
        The same frame object, updated in place.
    """
    # Nothing to fill: leave the column (and its dtype) untouched.
    if not df["노출(분)"].isna().any():
        return df

    exposure = df["노출(분)"].to_numpy(dtype=float, copy=True)
    broadcast_times = df["방송일시"].to_numpy()
    for pos in range(1, len(exposure)):
        # exposure[pos - 1] may itself have been filled on the previous step,
        # which is what lets a value propagate through consecutive gaps.
        if np.isnan(exposure[pos]) and broadcast_times[pos] == broadcast_times[pos - 1]:
            exposure[pos] = exposure[pos - 1]

    df["노출(분)"] = exposure
    return df

def custom(price, total):
    """Return 1 when the unit price exceeds the sales amount, else the amount.

    Implements preprocessing step 3: a sales amount smaller than the price of
    a single unit is replaced by 1.
    """
    return 1 if price > total else total
    
def handle_date(df):
    """Parse ``방송일시`` and add month/day/hour/minute/weekday columns.

    Implements preprocessing step 5. The frame is modified in place and also
    returned.
    """
    df["방송일시"] = pd.to_datetime(df["방송일시"])
    stamps = pd.DatetimeIndex(df["방송일시"])
    # Each new column is named after the DatetimeIndex attribute it copies.
    for part in ("month", "day", "hour", "minute", "weekday"):
        df[part] = getattr(stamps, part)

    return df

def deleting_data(df):
    """Replace the 50000 placeholder amount with 1 and drop the code columns.

    Values of 50000 in ``취급액`` become 1, then the ``마더코드`` and
    ``상품코드`` columns are removed. A new frame is returned; the input frame
    is left as it was.
    """
    replaced = df.replace({"취급액": 50000}, {"취급액": 1})
    return replaced.drop(columns=["마더코드", "상품코드"])
    

def handle_holiday(df, holidays=None):
    """Add a 0/1 ``holiday`` column marking broadcasts on a public holiday.

    Args:
        df: Frame with a ``방송일시`` column of broadcast timestamps.
        holidays: Optional iterable of ``"YYYY-MM-DD"`` strings. When omitted,
            the single test-period holiday ``'2020-06-06'`` is used.

    Returns:
        The same frame object with the ``holiday`` column added in place.
    """
    if holidays is None:
        holidays = ['2020-06-06']

    # Format every timestamp as a date string in one vectorized pass instead
    # of calling strftime row by row; missing timestamps simply map to 0.
    broadcast_dates = pd.to_datetime(df["방송일시"]).dt.strftime("%Y-%m-%d")
    df["holiday"] = broadcast_dates.isin(list(holidays)).astype("int64")

    return df

    
def pre_processing(df): 
    """Clean the raw test sales frame and derive calendar features.

    Steps, in order: drop intangible ("무형") products, fill missing exposure
    minutes, add date parts, flag holidays, then replace the 50000
    placeholder amount and drop the code columns. The sales-amount correction
    applied to the training data is not part of this version.

    Args:
        df: Raw test sales frame as loaded from the workbook.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # 1. Exclude intangible products. The index is renumbered immediately so
    #    that, for the exposure fill below, every row's predecessor is the row
    #    directly above it; gaps left by the drop would break that assumption.
    df = df.drop(df[df["상품군"]=="무형"].index,axis = 0).reset_index(drop=True)
    # 2. Fill missing exposure minutes from the previous row of the same broadcast.
    df = filling_missing_exposure(df)

    # 3. Add month, day, hour, minute and weekday columns.
    df = handle_date(df)
    # 4. Flag public holidays.
    df = handle_holiday(df)

    # 5. Replace the 50000 placeholder amount and drop the code columns.
    df = deleting_data(df)

    # Renumber the rows before handing the frame back.
    df = df.reset_index(drop=True)

    return df 

# Run the cleaning pipeline on the test sales frame and preview the result.
sales = pre_processing(sales)
sales.head()

Unnamed: 0,방송일시,노출(분),상품명,상품군,판매단가,취급액,month,day,hour,minute,weekday,holiday
0,2020-06-01 06:20:00,20.0,잭필드 남성 반팔셔츠 4종,의류,59800,,6,1,6,20,0,0
1,2020-06-01 06:40:00,20.0,잭필드 남성 반팔셔츠 4종,의류,59800,,6,1,6,40,0,0
2,2020-06-01 07:00:00,20.0,잭필드 남성 반팔셔츠 4종,의류,59800,,6,1,7,0,0,0
3,2020-06-01 07:20:00,20.0,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,,6,1,7,20,0,0
4,2020-06-01 07:40:00,20.0,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,,6,1,7,40,0,0


In [169]:
# Row and column count of the cleaned test frame.
print(sales.shape)

(2716, 12)


# 기온 데이터

In [170]:
# Build a date-only (YYYY-MM-DD) join key on both frames. Vectorized
# .dt.strftime replaces the per-row datetime.strftime calls.
sales["방송날짜"] = pd.to_datetime(sales["방송일시"]).dt.strftime("%Y-%m-%d")
weather = pd.read_excel("./data_test/weathers_test.xlsx")
# The test weather file names its date column "측정일자".
weather.rename(columns = {"측정일자":"방송날짜"},inplace=True)
weather["방송날짜"] = pd.to_datetime(weather["방송날짜"]).dt.strftime("%Y-%m-%d")
# Inner join: sales rows whose date has no weather record are dropped.
sales = sales.merge(weather,on ="방송날짜",how = "inner")
print(sales.shape)
# The string join key is not needed once the weather columns are attached.
sales.drop(["방송날짜"],axis=1,inplace=True)

(2716, 29)


# 카테고리 데이터

In [171]:
# Product-name -> category lookup table for the test period. The first sheet
# column is read as the index and then discarded in favour of a fresh 0..n-1 index.
category = pd.read_excel("./data_test/category_test.xlsx",index_col=[0]).reset_index(drop=True)
category.head()

Unnamed: 0,상품명,상품군,cat_1,cat_2,cat_3
0,잭필드 남성 반팔셔츠 4종,의류,남성패션,셔츠,캐주얼
1,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,여성패션,속옷/잠옷,보정속옷
2,바비리스 퍼펙트 볼륨스타일러,이미용,가전디지털,이미용가전,헤어
3,램프쿡 자동회전냄비,주방,결혼준비,주방용품,주방용품
4,벨레즈온 심리스 원피스 4종 패키지,속옷,여성패션,속옷/잠옷,잠옷


In [172]:
def category_extract(df,product_name):
    """Look up the ``cat_3`` category of a product by exact name.

    Args:
        df: Lookup table with ``상품명`` and ``cat_3`` columns.
        product_name: Product name to search for (exact match).

    Returns:
        The ``cat_3`` value of the first matching row, or ``''`` when no row
        has that product name.

    Only the "no match" case falls back to ``''``. Other problems, such as a
    missing column, now raise instead of being hidden by a bare ``except``.
    """
    matches = df.loc[df["상품명"] == product_name, "cat_3"]
    if matches.empty:
        return ''
    return matches.iloc[0]

# Attach each product's cat_3 category as "상품군-중" ('' when the lookup finds nothing).
# category_extract filters the whole category table once per sales row.
sales["상품군-중"] = sales.apply(lambda x : category_extract(category,x["상품명"]),axis=1)
sales.shape

(2716, 29)

# 시청률 데이터

In [173]:
# Test-period view-rate features from a cp949-encoded CSV; the first column becomes the index.
view_rate = pd.read_csv("./data_test/view_rate_test.csv",encoding="cp949",index_col=[0])
# Keep only the timestamp and the three view-rate columns.
view_rate = view_rate[["방송일시","viewrate1","viewrate2","viewrate3"]]
view_rate["방송일시"] = pd.to_datetime(view_rate["방송일시"])
view_rate.tail()

Unnamed: 0,방송일시,viewrate1,viewrate2,viewrate3
2711,2020-07-01 00:10:00,0,0,0
2712,2020-07-01 00:10:00,0,0,0
2713,2020-07-01 00:10:00,0,0,0
2714,2020-07-01 01:20:00,0,0,0
2715,2020-07-01 01:40:00,0,0,0


In [174]:
# Copy every view-rate column except the timestamp onto sales. `.values` strips
# the index, so rows are matched purely by position: both frames must have the
# same number of rows in the same order.
for column_name in  view_rate.columns.difference(["방송일시"]):
    sales.loc[:,column_name] = view_rate.loc[:,column_name].values
    
sales.head()

Unnamed: 0,방송일시,노출(분),상품명,상품군,판매단가,취급액,month,day,hour,minute,weekday,holiday,미세먼지(㎍/㎥),오존(ppm),이산화질소농도(ppm),일산화탄소농도(ppm),아황산가스농도(ppm),초미세먼지(㎍/㎥),주의보 및 경보,평균기온(℃),최고기온(℃),최저기온(℃),season,강수량(mm),폭염주의보,한파주의보,대설주의보,호우주의보,상품군-중,viewrate1,viewrate2,viewrate3
0,2020-06-01 06:20:00,20.0,잭필드 남성 반팔셔츠 4종,의류,59800,,6,1,6,20,0,0,19,0.04,0.01,0.3,0.0,9,0,19.7,24.5,16.6,봄,0.4,0,0,0,0,캐주얼,0,0,0
1,2020-06-01 06:40:00,20.0,잭필드 남성 반팔셔츠 4종,의류,59800,,6,1,6,40,0,0,19,0.04,0.01,0.3,0.0,9,0,19.7,24.5,16.6,봄,0.4,0,0,0,0,캐주얼,0,0,0
2,2020-06-01 07:00:00,20.0,잭필드 남성 반팔셔츠 4종,의류,59800,,6,1,7,0,0,0,19,0.04,0.01,0.3,0.0,9,0,19.7,24.5,16.6,봄,0.4,0,0,0,0,캐주얼,0,0,0
3,2020-06-01 07:20:00,20.0,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,,6,1,7,20,0,0,19,0.04,0.01,0.3,0.0,9,0,19.7,24.5,16.6,봄,0.4,0,0,0,0,보정속옷,0,0,0
4,2020-06-01 07:40:00,20.0,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,,6,1,7,40,0,0,19,0.04,0.01,0.3,0.0,9,0,19.7,24.5,16.6,봄,0.4,0,0,0,0,보정속옷,0,0,0


In [175]:
# Shape of the test frame after the view-rate columns were attached.
sales.shape

(2716, 32)

# 경제 지표 데이터

-> 아직 안올라와서 올라오면 집어넣겠습니다.

# 카테고리 데이터 처리

In [176]:
# from sklearn.preprocessing import LabelEncoder

# encoding_list = {"상품군":['가구','가전','건강기능','농수축','생활용품','속옷','의류','이미용','잡화','주방','침구'],
#                  "season":['가을','겨울','봄','여름'],"상품군-중":['TV','가구/침구','가방','가전디지털','간편식','거실가구','건강식품',
#                                                         '건어물','공기청정기', '긴소매', '남성속옷', '남성스포츠의류', '남성팬티',
#                                                         '남성화장품', '냉장고', '네일', '노트북', '니트', '다이어트식품', 
#                                                         '두부/콩나물', '두유','드레스룸', '러그/카페트/거실화', '런닝',
#                                                         '레깅스', '마스크', '머플러/스카프', '메이크업', '모자', '바디' ,
#                                                         '바지', '발열내의', '변기/비데용품', '보정속옷', '뷰티소품',
#                                                         '브라팬티', '브래지어', '블라우스', '설탕/소금/조미료', '세탁기/건조기',
#                                                          '세트류' ,'속옷' ,'속옷/잠옷', '수납가구', '수산물', '수전용품',
#                                                         '스킨케어', '스포츠/레저', '시계/쥬얼리', '신발', '아우터', '아이웨어',
#                                                         '언더셔츠', '에어컨', '에이컨' ,'여성속옷', '여성스포츠의류', '여행용품',
#                                                         '원피스' ,'음료제품', '의류관리기', '인테리어' ,'작물' ,'잡화', '장갑',
#                                                         '조미료', '주방가전' ,'주방용품', '주방잡화', '지갑/벨트', '차량용 생활용품',
#                                                         '청바지', '청소기', '청소용품', '침구세트' ,'침실가구', '커튼/블라인드' ,
#                                                         '트레이닝복', '티셔츠', '파우더룸' ,'패딩', '헤어','헬스/건강용품']}
# for label in encoding_list.keys():
    
#     encoder = LabelEncoder()
#     encoder.fit(encoding_list[label])
    
#     for 
#     sales[label] = encoder.transform(sales[label])
    
# sales.info()

ValueError: y contains previously unseen labels: '캐주얼'

In [178]:
# Save the intermediate test frame. The `encoding` keyword was removed from the
# call: DataFrame.to_excel deprecated it in pandas 1.5 and rejects it from
# pandas 2.0 on, and the Excel writers handle Unicode without it.
sales.to_excel("C:/Users/sk_jo/Desktop/sales_test_중간.xlsx")