# Data fields
- ID - an Id that represents a (Shop, Item) tuple within the test set
- shop_id - unique identifier of a shop
- item_id - unique identifier of a product
- item_category_id - unique identifier of item category
- item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
- item_price - current price of an item
- date - date in format dd/mm/yyyy
- date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
- item_name - name of item
- shop_name - name of shop
- item_category_name - name of item category

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
items = pd.read_csv("datasets/items.csv") # supplemental information about the items/products.
item_categories = pd.read_csv("datasets/item_categories.csv") # supplemental information about the items categories.
shops = pd.read_csv("datasets/shops.csv") # supplemental information about the shops.
samples = pd.read_csv("datasets/sample_submission.csv.gz")

In [3]:
train_set = pd.read_csv("datasets/sales_train.csv.gz")
train_set.shape

(2935849, 6)

In [4]:
test_set = pd.read_csv("datasets/test.csv.gz")
test_set.shape

(214200, 3)

In [5]:
# train_set의 item_id를 item_category로 수합하고 일별 count를 월별로 바꿈
merged = pd.merge(train_set, items.loc[:,['item_id', 'item_category_id']], on='item_id').loc[:,['date_block_num', 'shop_id', 'item_category_id', 'item_price', 'item_cnt_day']]
grouped = merged.groupby(['shop_id', 'item_category_id', 'date_block_num', 'item_price'])['item_cnt_day'].sum().reset_index().rename(columns={'item_cnt_day':'item_cnt_month'})
# test_set의 item_id를 item_category로 수합
merged2 = pd.merge(test_set, items, on='item_id').loc[:,['ID', 'shop_id', 'item_id', 'item_category_id']]
grouped2 = merged2.groupby(['shop_id', 'item_category_id']).count()

In [6]:
# 상점과 아이템 카테고리 아이디를 범주형으로
grouped['shop_id'] = grouped.shop_id.astype('category')
grouped['item_category_id'] = grouped.item_category_id.astype('category')
# shop id와 item category id를 dummy 변수화
grouped_dm = pd.get_dummies(grouped, columns=['shop_id', 'item_category_id'])
grouped_dm

Unnamed: 0,date_block_num,item_price,item_cnt_month,shop_id_0,shop_id_1,shop_id_2,shop_id_3,shop_id_4,shop_id_5,shop_id_6,...,item_category_id_74,item_category_id_75,item_category_id_76,item_category_id_77,item_category_id_78,item_category_id_79,item_category_id_80,item_category_id_81,item_category_id_82,item_category_id_83
0,1,93.0,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,283.0,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,294.0,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,560.0,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,806.0,4.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1322.0,10.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1925.0,9.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,2060.0,14.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,2100.0,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,2231.0,5.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lm = LinearRegression()
X = pd.DataFrame(grouped_dm.loc[:, 'date_block_num']).join(grouped_dm.loc[:, 'shop_id_0':'item_category_id_83'])
Y = grouped_dm.loc[:, 'item_cnt_month']

lm.fit(X,Y)

preds = lm.predict(X)
rmse_rf = np.sqrt(mean_squared_error(Y, preds))

In [56]:
from sklearn.model_selection import KFold, cross_val_score

k_fold = KFold(n_splits = 50, shuffle = True, random_state = 0)
cv_lm = cross_val_score(lm, X, Y, cv=k_fold, scoring='neg_mean_squared_error').mean()

In [57]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

rf.fit(X, Y)

preds_rf = rf.predict(X)

In [58]:
k_fold_rf = KFold(n_splits = 5, shuffle = True, random_state = 0)
rmse_rf = np.sqrt(mean_squared_error(Y, preds_rf))
cv_rf = cross_val_score(rf, X, Y, cv=k_fold_rf, scoring='neg_mean_squared_error' ).mean()

In [59]:
rmse_lm = np.sqrt(mean_squared_error(Y, preds))
print(rmse_lm)
print(cv_lm)
print(rmse_rf)
print(cv_rf)

20.87904091008732
-8.960095610583777e+17
18.953934239931055
-416.171558120589


ValueError: No axis named item_category_id for object type <class 'pandas.core.frame.DataFrame'>