# Kaggle: Predict Future Sales

## Goal: predict total sales for every product and store in the next month.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import seaborn as sns

## Load Data

In [2]:
item_df = pd.read_csv('./future-sales/items.csv')
item_cat_df = pd.read_csv('./future-sales/item_categories.csv')
shop_df = pd.read_csv('./future-sales/shops.csv')
train_df = pd.read_csv("./future-sales/sales_train.csv")
test = pd.read_csv("./future-sales/test.csv")

## Check Missing value and outlier

In [3]:
item_df.isnull().sum(), item_cat_df.isnull().sum(), shop_df.isnull().sum()

(item_name           0
 item_id             0
 item_category_id    0
 dtype: int64,
 item_category_name    0
 item_category_id      0
 dtype: int64,
 shop_name    0
 shop_id      0
 dtype: int64)

In [4]:
train_df.date.unique()  # 2013.02.01 ~ 2015.10.13

array(['02.01.2013', '03.01.2013', '05.01.2013', ..., '28.10.2015',
       '25.10.2015', '13.10.2015'], dtype=object)

In [5]:
# Check train data's columns type
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


In [6]:
# convert date(str type) to datetime type
train_df["date"] = pd.to_datetime(train_df["date"], format="%d.%m.%Y")
print(train_df.date[0].strftime("%m"))
print(train_df.date[0].strftime("%Y-%m"))

01
2013-01


In [7]:
# add month, year column
train_df['year'] = train_df['date'].dt.strftime('%Y')
train_df['month'] = train_df['date'].dt.strftime('%m')

In [8]:
train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,year,month
0,2013-01-02,0,59,22154,999.0,1.0,2013,1
1,2013-01-03,0,25,2552,899.0,1.0,2013,1
2,2013-01-05,0,25,2552,899.0,-1.0,2013,1
3,2013-01-06,0,25,2554,1709.05,1.0,2013,1
4,2013-01-15,0,25,2555,1099.0,1.0,2013,1


In [9]:
# Check "item_cnt_day" column have a outlier 
print(train_df["item_cnt_day"].describe())
# min -22.

count    2.935849e+06
mean     1.242641e+00
std      2.618834e+00
min     -2.200000e+01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      2.169000e+03
Name: item_cnt_day, dtype: float64


In [10]:
len(test["item_id"].unique()), len(train_df["item_id"].unique())

(5100, 21807)

In [11]:
# 363 items are only in test set
only_test = set(test["item_id"].unique()) - set(train_df["item_id"].unique())
len(only_test)

363

In [12]:
# remove duplicated data from train_df
subset = ["date", "date_block_num", "shop_id", "item_id", "item_cnt_day"]
print(train_df.duplicated(subset=subset).value_counts())
train_df.drop_duplicates(subset=subset, inplace=True)

False    2935825
True          24
dtype: int64


In [13]:
# test set에 없는 상품에 대한 판매량은 예측할 필요 없다.
# test 데이터에 없는 상품 train 데이터 제거
print("Before train shape:", train_df.shape)
test_shops = test.shop_id.unique()
test_items = test.item_id.unique()

train_df = train_df[train_df.shop_id.isin(test_shops)]
train_df = train_df[train_df.item_id.isin(test_items)]
print("After train shape:", train_df.shape)
train_df

Before train shape: (2935825, 8)
After train shape: (1224429, 8)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,year,month
0,2013-01-02,0,59,22154,999.0,1.0,2013,01
10,2013-01-03,0,25,2574,399.0,2.0,2013,01
11,2013-01-05,0,25,2574,399.0,1.0,2013,01
12,2013-01-07,0,25,2574,399.0,1.0,2013,01
13,2013-01-08,0,25,2574,399.0,2.0,2013,01
...,...,...,...,...,...,...,...,...
2935844,2015-10-10,33,25,7409,299.0,1.0,2015,10
2935845,2015-10-09,33,25,7460,299.0,1.0,2015,10
2935846,2015-10-14,33,25,7459,349.0,1.0,2015,10
2935847,2015-10-22,33,25,7440,299.0,1.0,2015,10


In [14]:
print("number of shops:", len(train_df.shop_id.unique()))
print("number of items:", len(train_df.item_id.unique()))

number of shops: 42
number of items: 4716


In [15]:
train = pd.DataFrame.copy(train_df)

del train["date"]
del train["date_block_num"]
del train["item_price"]

train

Unnamed: 0,shop_id,item_id,item_cnt_day,year,month
0,59,22154,1.0,2013,01
10,25,2574,2.0,2013,01
11,25,2574,1.0,2013,01
12,25,2574,1.0,2013,01
13,25,2574,2.0,2013,01
...,...,...,...,...,...
2935844,25,7409,1.0,2015,10
2935845,25,7460,1.0,2015,10
2935846,25,7459,1.0,2015,10
2935847,25,7440,1.0,2015,10


In [16]:
# feature date, shop_id, item_id, item_price feature가 결과에 영향을 줄 것이라고 예측
# group train_df as monthly
# train_grp = train.groupby(["date_block_num", "shop_id", "item_id"], as_index=False)["item_cnt_day"].sum()
train_grp = train.groupby(["year", "month", "shop_id", "item_id"], as_index=False)["item_cnt_day"].sum()
monthly_group_df = pd.DataFrame(train_grp)
monthly_group_df

Unnamed: 0,year,month,shop_id,item_id,item_cnt_day
0,2013,01,2,33,1.0
1,2013,01,2,482,1.0
2,2013,01,2,491,1.0
3,2013,01,2,839,1.0
4,2013,01,2,1007,3.0
...,...,...,...,...,...
600154,2015,10,59,22087,6.0
600155,2015,10,59,22088,2.0
600156,2015,10,59,22091,1.0
600157,2015,10,59,22100,1.0


In [17]:
# test는 date_block_num이 34이다. 15년 11월을 예측
test

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268
...,...,...,...
214195,214195,45,18454
214196,214196,45,16188
214197,214197,45,15757
214198,214198,45,19648


In [18]:
test_id = test['ID']

del test["ID"]

In [19]:
# test['date_block_num'] = 34

# column 순서 변경
test['item_cnt_day'] = 0
test['year'] = 2015
test['month'] = 11

test = test[["shop_id", "item_id", "item_cnt_day", "year", "month"]]

test

Unnamed: 0,shop_id,item_id,item_cnt_day,year,month
0,5,5037,0,2015,11
1,5,5320,0,2015,11
2,5,5233,0,2015,11
3,5,5232,0,2015,11
4,5,5268,0,2015,11
...,...,...,...,...,...
214195,45,18454,0,2015,11
214196,45,16188,0,2015,11
214197,45,15757,0,2015,11
214198,45,19648,0,2015,11


In [20]:
print("Before grouping test shape:", test.shape)
test = pd.DataFrame(test.groupby(['year', 'month', 'shop_id', 'item_id'], as_index=False)["item_cnt_day"].sum())
print("After test shape:", test.shape)
test

Before grouping test shape: (214200, 5)
After test shape: (214200, 5)


Unnamed: 0,year,month,shop_id,item_id,item_cnt_day
0,2015,11,2,30,0
1,2015,11,2,31,0
2,2015,11,2,32,0
3,2015,11,2,33,0
4,2015,11,2,38,0
...,...,...,...,...,...
214195,2015,11,59,22162,0
214196,2015,11,59,22163,0
214197,2015,11,59,22164,0
214198,2015,11,59,22166,0


In [21]:
# train, test를 한판으로 합침
# 한판으로 만드는 이유는 같은 전처리를 적용하기 위해서 이다. 이미 나눠져 있기 때문에 한판으로 관리할 필요 없어 보인다.
merge_df = pd.concat([train, test])
merge_df

Unnamed: 0,shop_id,item_id,item_cnt_day,year,month
0,59,22154,1.0,2013,01
10,25,2574,2.0,2013,01
11,25,2574,1.0,2013,01
12,25,2574,1.0,2013,01
13,25,2574,2.0,2013,01
...,...,...,...,...,...
214195,59,22162,0.0,2015,11
214196,59,22163,0.0,2015,11
214197,59,22164,0.0,2015,11
214198,59,22166,0.0,2015,11


In [22]:
merge_df1 = merge_df.reset_index(drop=True)
merge_df1

Unnamed: 0,shop_id,item_id,item_cnt_day,year,month
0,59,22154,1.0,2013,01
1,25,2574,2.0,2013,01
2,25,2574,1.0,2013,01
3,25,2574,1.0,2013,01
4,25,2574,2.0,2013,01
...,...,...,...,...,...
1438624,59,22162,0.0,2015,11
1438625,59,22163,0.0,2015,11
1438626,59,22164,0.0,2015,11
1438627,59,22166,0.0,2015,11


## reduce item_id

In [23]:
merge_df1 = pd.merge(merge_df1, item_df, on='item_id', how='left')
merge_df1['month'] = merge_df1["month"].astype(int)
merge_df1['year'] = merge_df1['year'].astype(int)

In [24]:
merge_df_item_id = merge_df1.item_id

del merge_df1["item_id"]
del merge_df1["item_name"]

In [25]:
merge_df1
# del merge_df1["year"]

Unnamed: 0,shop_id,item_cnt_day,year,month,item_category_id
0,59,1.0,2013,1,37
1,25,2.0,2013,1,55
2,25,1.0,2013,1,55
3,25,1.0,2013,1,55
4,25,2.0,2013,1,55
...,...,...,...,...,...
1438624,59,0.0,2015,11,40
1438625,59,0.0,2015,11,40
1438626,59,0.0,2015,11,37
1438627,59,0.0,2015,11,54


In [26]:
train_set = merge_df1.loc[:1224428]
test_set = merge_df1[merge_df1['year'] == 2015]

del train_set["year"]
del test_set["year"]


train_set["item_cnt_mnt"] = train_set["item_cnt_day"]
del train_set['item_cnt_day']

test_set["item_cnt_mnt"] = test_set["item_cnt_day"]
del test_set["item_cnt_day"]

del test_set["item_cnt_mnt"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set["item_cnt_mnt"] = train_set["item_cnt_day"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set["item_cnt_mnt"] = test_set["item_cnt_day"]


In [27]:
train_x = train_set[["shop_id", "month", "item_category_id"]].values
train_y = train_set["item_cnt_mnt"].values
y = train_y
X = np.concatenate((np.ones((len(train_x), 1)), train_x), axis=1)

In [28]:
def compute_cost(X, y, theta):
    m = y.size
    predictions = x.dot(theta)
    sqErrors = (predictions - y)
    
    J = (1.0 / (2 * m)) * sqErrors.T.dot(sqErrors)
    return J

In [29]:
# multivariate linear regression
def minimize_gradient(X, y, theta, iterations = 10000, alpha=0.01):
    m = y.size
    cost_history = []
    theta_history = []
    
    for _ in range(iterations):
        predictions = train_x.dot(theta)

        for i in range(theta.size):
            partial_marginal = train_x[:, i]
            errors_xi = (predictions - train_y) * partial_marginal
            theta[i] = theta[i] - alpha * (1.0 / m) * errors_xi.sum()
        
        if _ % 100 == 0:
            theta_history.append(theta)
            cost_history.append(compute_cost(x, y, theta))

    return theta, np.array(cost_history), np.array(theta_history)

In [None]:
theta_initial = np.random.normal(size=(3,1))

theta, cost_history, theta_history = minimize_gradient(X, y, theta_initial)

test_set 에 item_category_id 제거

데이터가 너무 커서 올라가지 않음,

In [30]:
## Use Sklearn LinearRegression Model
train_x = train_set[["shop_id", "month", "item_category_id"]].values
train_y = train_set["item_cnt_mnt"].values

pred = pd.DataFrame.copy(test_set)

test_set = test_set.values

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(train_x, train_y)

y_test = model.predict(test_set)

y_test = y_test.astype(int)

pred["item_cnt_mnt"] = y_test

pred

Unnamed: 0,shop_id,month,item_category_id,item_cnt_mnt
762034,7,1,30,1
762035,15,1,75,1
762036,15,1,55,1
762037,15,1,55,1
762038,15,1,55,1
...,...,...,...,...
1438624,59,11,40,1
1438625,59,11,40,1
1438626,59,11,37,1
1438627,59,11,54,1
