### 필수 라이브러리 & 데이터 호출

In [None]:
import pandas as pd
import numpy as np

In [74]:
# Read the five raw CSV tables from data/input/. The names on the left are
# bound positionally to the paths listed below, in the same order.
sales_train, shops, items, item_cat, test = map(
    pd.read_csv,
    (
        "data/input/sales_train.csv",
        "data/input/shops.csv",
        "data/input/items.csv",
        "data/input/item_categories.csv",
        "data/input/test.csv",
    ),
)

In [None]:
def downcast(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Reduce the memory footprint of ``df`` by converting columns to smaller dtypes.

    The frame is modified in place (columns are reassigned on ``df`` itself)
    and the same object is returned for convenience.

    Conversion rules, selected by each column's dtype name:

    * ``object``  -> ``category`` when fewer than half of the rows hold
      distinct values; otherwise left untouched.
    * ``bool``    -> ``int8``.
    * ``int*``    -> smallest integer dtype ``pd.to_numeric`` can downcast to.
    * ``float*``  -> smallest integer dtype if every value is a whole number,
      otherwise the smallest float dtype ``pd.to_numeric`` can downcast to.

    Args:
        df: Frame to shrink. Mutated in place.
        verbose: When true, print memory usage before/after and the
            percentage saved.

    Returns:
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    n_rows = len(df)

    for col in df.columns:
        dtype_name = df[col].dtype.name

        if dtype_name == 'object':
            # A zero-row frame has no cardinality ratio to compute; skipping it
            # avoids a ZeroDivisionError (0 unique values / 0 rows).
            if n_rows and df[col].nunique() / n_rows < 0.5:
                df[col] = df[col].astype('category')

        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')

        elif dtype_name.startswith('int'):
            df[col] = pd.to_numeric(df[col], downcast='integer')

        elif dtype_name.startswith('float'):
            # NaN never equals its rounded self, so columns containing NaN
            # always take the float branch and keep a float dtype.
            if (df[col].round() == df[col]).all():
                df[col] = pd.to_numeric(df[col], downcast='integer')
            else:
                df[col] = pd.to_numeric(df[col], downcast='float')

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = (start_mem - end_mem) / start_mem * 100 if start_mem else 0.0
        print(f"메모리 사용량: {start_mem:.2f} MB -> {end_mem:.2f} MB")
        print(f"감소율: {reduction:.1f}%")

    return df


In [76]:
# Preview the first rows of the raw sales table.
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [77]:
# The raw dates are day-first strings such as "02.01.2013" (dd.mm.yyyy).
# An explicit format keeps parsing strict: dayfirst=True is only a preference,
# so a value that does not fit day-first could be read month-first instead
# of raising.
sales_train['date'] = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')

In [78]:
# Preview the shops table; shop_id is the key it can be merged on.
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [79]:
# Preview the items table; item_id is the key it can be merged on.
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [80]:
# Preview the item-category table; item_category_id is the key it can be merged on.
item_cat.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [81]:
# Author's note: test holds only the shop_id / item_id pairs for month 34
# and still has to be combined with sales_train.
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [82]:
# downcast() reassigns the converted columns on the frame it receives and
# returns that same object, so every table below is shrunk in place and the
# original variables need no rebinding.
df = [sales_train, shops, items, item_cat, test]
for frame in df:
    downcast(frame)

메모리 사용량: 134.39 MB -> 61.60 MB
감소율: 54.2%
메모리 사용량: 0.00 MB -> 0.00 MB
감소율: 38.5%
메모리 사용량: 0.51 MB -> 0.23 MB
감소율: 54.2%
메모리 사용량: 0.00 MB -> 0.00 MB
감소율: 39.8%
메모리 사용량: 4.90 MB -> 1.43 MB
감소율: 70.8%


### 컬럼명 변경

In [83]:
# Swap the English column names for Korean labels. Key columns receive the same
# label in every table (상점ID, 상품ID, 상품분류ID) so later merges can join on
# a shared name.
sales_train = sales_train.rename(
    columns={
        "date": "날짜",
        "date_block_num": "월ID",
        "shop_id": "상점ID",
        "item_id": "상품ID",
        "item_price": "판매가",
        "item_cnt_day": "판매량",
    }
)

shops = shops.rename(
    columns={
        "shop_name": "상점명",
        "shop_id": "상점ID",
    }
)

items = items.rename(
    columns={
        "item_name": "상품명",
        "item_id": "상품ID",
        "item_category_id": "상품분류ID",
    }
)

item_cat = item_cat.rename(
    columns={
        "item_category_name": "상품분류명",
        "item_category_id": "상품분류ID",
    }
)

test = test.rename(
    columns={
        "shop_id": "상점ID",
        "item_id": "상품ID",
    }
)

In [84]:
# Confirm the renamed columns on the test table.
test.head()

Unnamed: 0,ID,상점ID,상품ID
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


### 데이터 병합 및 최종 데이터 형태 산출

In [92]:
# Report the earliest and latest sale date, one labelled line each, after a
# blank line and a header.
print(
    "\n날짜 범위:",
    f"시작: {sales_train['날짜'].min()!s}",
    f"종료: {sales_train['날짜'].max()!s}",
    sep="\n",
)

날짜 컬럼 데이터 타입: datetime64[ns]

날짜 범위:
시작: 2013-01-01 00:00:00
종료: 2015-10-31 00:00:00


In [86]:
# Attach shop, item and category attributes to every sales / test row.
# Left joins keep every row of the left-hand table. validate="many_to_one"
# makes pandas raise a MergeError if a lookup table has duplicate keys, which
# would otherwise silently multiply rows.
train = (
    sales_train
    .merge(shops, on="상점ID", how="left", validate="many_to_one")
    .merge(items, on="상품ID", how="left", validate="many_to_one")
    .merge(item_cat, on="상품분류ID", how="left", validate="many_to_one")
)

test = (
    test
    .merge(shops, on="상점ID", how="left", validate="many_to_one")
    .merge(items, on="상품ID", how="left", validate="many_to_one")
    .merge(item_cat, on="상품분류ID", how="left", validate="many_to_one")
)

In [87]:
# Preview the merged training table (sales rows plus shop, item and category names).
train.head()

Unnamed: 0,날짜,월ID,상점ID,상품ID,판매가,판매량,상점명,상품명,상품분류ID,상품분류명
0,2013-01-02,0,59,22154,999.0,1,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,2013-01-03,0,25,2552,899.0,1,"Москва ТРК ""Атриум""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
2,2013-01-05,0,25,2552,899.0,-1,"Москва ТРК ""Атриум""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
3,2013-01-06,0,25,2554,1709.05,1,"Москва ТРК ""Атриум""",DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил
4,2013-01-15,0,25,2555,1099.0,1,"Москва ТРК ""Атриум""",DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства


In [88]:
# Preview the merged test table (ID pairs plus shop, item and category names).
test.head()

Unnamed: 0,ID,상점ID,상품ID,상점명,상품명,상품분류ID,상품분류명
0,0,5,5037,"Вологда ТРЦ ""Мармелад""","NHL 15 [PS3, русские субтитры]",19,Игры - PS3
1,1,5,5320,"Вологда ТРЦ ""Мармелад""",ONE DIRECTION Made In The A.M.,55,Музыка - CD локального производства
2,2,5,5233,"Вологда ТРЦ ""Мармелад""","Need for Speed Rivals (Essentials) [PS3, русск...",19,Игры - PS3
3,3,5,5232,"Вологда ТРЦ ""Мармелад""","Need for Speed Rivals (Classics) [Xbox 360, ру...",23,Игры - XBOX 360
4,4,5,5268,"Вологда ТРЦ ""Мармелад""","Need for Speed [PS4, русская версия]",20,Игры - PS4


In [89]:
# List the distinct month ids present in the training data.
train["월ID"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33],
      dtype=int8)

In [90]:
# Tag every test row with month id 34, the value right after the last training
# month (the training month ids listed above run from 0 to 33).
test["월ID"] = 34
test.head()

Unnamed: 0,ID,상점ID,상품ID,상점명,상품명,상품분류ID,상품분류명,월ID
0,0,5,5037,"Вологда ТРЦ ""Мармелад""","NHL 15 [PS3, русские субтитры]",19,Игры - PS3,34
1,1,5,5320,"Вологда ТРЦ ""Мармелад""",ONE DIRECTION Made In The A.M.,55,Музыка - CD локального производства,34
2,2,5,5233,"Вологда ТРЦ ""Мармелад""","Need for Speed Rivals (Essentials) [PS3, русск...",19,Игры - PS3,34
3,3,5,5232,"Вологда ТРЦ ""Мармелад""","Need for Speed Rivals (Classics) [Xbox 360, ру...",23,Игры - XBOX 360,34
4,4,5,5268,"Вологда ТРЦ ""Мармелад""","Need for Speed [PS4, русская версия]",20,Игры - PS4,34


In [None]:
# Persist the merged tables without the index column. The output directory is
# created first because to_csv fails when the parent directory does not exist.
import os

os.makedirs("data/output/rawData", exist_ok=True)
train.to_csv("data/output/rawData/train.csv", index = False)
test.to_csv("data/output/rawData/test.csv", index = False)