### 필수 라이브러리 & 데이터 호출

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the five input tables from data/input, in the same order as the
# target names on the left-hand side.
sales_train, shops, items, item_cat, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/input/sales_train.csv",      # -> sales_train
        "data/input/shops.csv",            # -> shops
        "data/input/items.csv",            # -> items
        "data/input/item_categories.csv",  # -> item_cat
        "data/input/test.csv",             # -> test
    )
)

In [4]:
# Preview the first five rows of the sales table.
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [5]:
shops.head()  # can be merged with the sales table on shop_id

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [6]:
items.head()  # can be merged with the sales table on item_id

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [7]:
item_cat.head()  # can be merged with the items table on item_category_id

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [8]:
test.head()  # test only holds shop_id / item_id for month 34; it must be combined with sales_train

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


### 컬럼명 변경

In [9]:
# Replace the original English column names with Korean labels.
# The ID columns get the same label in every table that carries them, so the
# tables can later be joined on those shared names.
sales_train = sales_train.rename(
    columns={
        "date": "날짜",
        "date_block_num": "월ID",
        "shop_id": "상점ID",
        "item_id": "상품ID",
        "item_price": "판매가",
        "item_cnt_day": "판매량",
    }
)

shops = shops.rename(
    columns={
        "shop_name": "상점명",
        "shop_id": "상점ID",
    }
)

items = items.rename(
    columns={
        "item_name": "상품명",
        "item_id": "상품ID",
        "item_category_id": "상품분류ID",
    }
)

item_cat = item_cat.rename(
    columns={
        "item_category_name": "상품분류명",
        "item_category_id": "상품분류ID",
    }
)

test = test.rename(
    columns={
        "shop_id": "상점ID",
        "item_id": "상품ID",
    }
)

In [10]:
# Preview the test table to check its column labels after the rename.
test.head()

Unnamed: 0,ID,상점ID,상품ID
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


### 데이터 다운 캐스팅

In [11]:

def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` to smaller dtypes to save memory.

    The frame is modified in place and is also returned, so both
    ``downcast(df)`` and ``df = downcast(df)`` work.

    Per-column rules:

    * ``bool`` columns are converted to ``int8``.
    * Non-numeric columns (object/string, period, datetime, timedelta,
      category, ...) are left untouched.
    * Integer columns, and numeric columns whose values are all whole
      numbers, go through ``pd.to_numeric(..., downcast="integer")``.
    * All remaining numeric columns go through
      ``pd.to_numeric(..., downcast="float")``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress; its columns are reassigned in place.
    verbose : bool, default True
        When True, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        column = df[col]
        dtype_name = column.dtype.name

        if dtype_name == "bool":
            df[col] = column.astype("int8")
            continue

        # Only real numeric columns may reach round()/to_numeric below.
        # Without this guard a datetime column can be turned into integer
        # nanoseconds and a category/string column raises inside to_numeric.
        if not pd.api.types.is_numeric_dtype(column.dtype):
            continue

        # A float column that only holds whole numbers is stored as an integer.
        # NaN never equals itself, so columns with missing values stay float.
        if dtype_name.startswith("int") or (column.round() == column).all():
            df[col] = pd.to_numeric(column, downcast="integer")
        else:
            df[col] = pd.to_numeric(column, downcast="float")

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame so the report never shows NaN.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print("{:.1f}% 압축됨".format(saved_pct))
    return df

In [12]:
# Compress every loaded table. downcast() reassigns columns on the frame it is
# given and returns that same object, so the return value is not needed here.
all_df = [sales_train, shops, items, item_cat, test]

for df in all_df:
    downcast(df)

54.2% 압축됨
38.5% 압축됨
54.2% 압축됨
39.8% 압축됨
70.8% 압축됨


### 데이터 병합 최종 데이터 형태 산출

In [13]:
# Build the training table by attaching shop, item and item-category
# attributes to each sales row. Left joins keep every row of sales_train.
train = (
    sales_train
    .merge(shops, on="상점ID", how="left")
    .merge(items, on="상품ID", how="left")
    .merge(item_cat, on="상품분류ID", how="left")
)

train.head()

Unnamed: 0,날짜,월ID,상점ID,상품ID,판매가,판매량,상점명,상품명,상품분류ID,상품분류명
0,02.01.2013,0,59,22154,999.0,1,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,03.01.2013,0,25,2552,899.0,1,"Москва ТРК ""Атриум""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
2,05.01.2013,0,25,2552,899.0,-1,"Москва ТРК ""Атриум""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
3,06.01.2013,0,25,2554,1709.05,1,"Москва ТРК ""Атриум""",DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил
4,15.01.2013,0,25,2555,1099.0,1,"Москва ТРК ""Атриум""",DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства
