In [1]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime, timedelta

pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Use 3 decimal places in output display
pd.set_option("display.precision", 3)


In [2]:
df_shops = pd.read_csv("shops.csv")
df_shops.describe()

Unnamed: 0,shop_id
count,60.0
mean,29.5
std,17.46425
min,0.0
25%,14.75
50%,29.5
75%,44.25
max,59.0


In [3]:
# get valid shop id
def valid_shop_id(id):
    if id == 0:
        return 57
    if id == 1:
        return 58
#    if id == 23:
#        return 24
#    if id == 11:
#        return 10
#    if id == 40:
#        return 39 
    return id

def shop_type(shop):
    #types = ['ТК', 'ТЦ', 'ТРК', 'ТРЦ']
    p = re.compile(r"(Т[РКЦ]+)")
    r = p.search(shop)
    if r is not None: return r.group(1) 
    if 'нлайн' in shop: return 'Онлайн'
    if 'нтернет' in shop: return 'Онлайн'
    return 'Магазин'

# get city
def shop_city(shop):
    p = re.compile(r"^([а-яА-Я\.]*)")
    r = p.search(shop)
    if r is not None: return r.group(1) 
    return 'Unknown'

#df_shops['valid_id'] = df_shops['shop_id'] 

In [4]:
df_shops.drop([0,1], inplace=True)

In [5]:
df_shops['city'] = df_shops['shop_name'].apply(lambda x: shop_city(x))
df_shops['type'] = df_shops['shop_name'].apply(lambda x: shop_type(x))
df_shops['mega'] = df_shops['shop_name'].str.contains('мега', case=False).astype(int)

In [6]:
df_shops.head()

Unnamed: 0,shop_name,shop_id,city,type,mega
2,"Адыгея ТЦ ""Мега""",2,Адыгея,ТЦ,1
3,"Балашиха ТРК ""Октябрь-Киномир""",3,Балашиха,ТРК,0
4,"Волжский ТЦ ""Волга Молл""",4,Волжский,ТЦ,0
5,"Вологда ТРЦ ""Мармелад""",5,Вологда,ТРЦ,0
6,"Воронеж (Плехановская, 13)",6,Воронеж,Магазин,0


In [7]:
df_categories = pd.read_csv("item_categories.csv")
df_categories.describe()

Unnamed: 0,item_category_id
count,84.0
mean,41.5
std,24.39262
min,0.0
25%,20.75
50%,41.5
75%,62.25
max,83.0


In [8]:
df_categories['category'] = df_categories['item_category_name'].str.split('[-(]', n=0).str[0].str.strip()
df_categories['digital'] = df_categories['item_category_name'].str.contains('цифра', case=False).astype(int)

In [9]:
df_items = pd.read_csv("items.csv")
df_items.describe()

Unnamed: 0,item_id,item_category_id
count,22170.0,22170.0
mean,11084.5,46.29075
std,6400.07207,15.94149
min,0.0,0.0
25%,5542.25,37.0
50%,11084.5,40.0
75%,16626.75,58.0
max,22169.0,83.0


In [10]:
federal_cities = ['Москва', 'СПб']
milioner_cities = ['Воронеж', 'Казань', 'Красноярск', 'Н.Новгород', 'Новосибирск', 'Омск', 'РостовНаДону', 'Самара', 'Уфа']
regional_caps = ['Адыгея', 'Калуга', 'Курск', 'Тюмень', 'Якутск', 'Ярославль', 'Вологда', 'Томск']
regional_cities = ['Балашиха', 'Волжский', 'Жуковский', 'Коломна', 'Сергиев', 'Сургут', 'Химки', 'Чехов', 'Мытищи']

def get_city_type(city):
    if city in federal_cities: return 'Federal' 
    if city in milioner_cities: return 'Milioner' 
    if city in regional_caps: return 'RegionalCenter' 
    if city in regional_cities: return 'Regional' 
    return 'Virtual'

def get_category(item_id):
    category_id = df_items.loc[df_items['item_id'] == item_id]['item_category_id'].values[0]
    return df_categories.loc[df_categories['item_category_id'] == category_id]['category'].values[0]

def get_category_digitality(item_id):
    category_id = df_items.loc[df_items['item_id'] == item_id]['item_category_id'].values[0]
    return df_categories.loc[df_categories['item_category_id'] == category_id]['digital'].values[0]

def get_shop_type(shop_id):
    return df_shops.loc[df_shops['shop_id'] == shop_id]['type'].values[0]

def get_shop_megality(shop_id):
    return df_shops.loc[df_shops['shop_id'] == shop_id]['mega'].values[0]

def get_shop_city(shop_id):
    return df_shops.loc[df_shops['shop_id'] == shop_id]['city'].values[0]

In [11]:
df_sales = pd.read_csv("sales_train.csv")
df_sales.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.22706,890.85323,1.24264
std,9.42299,16.22697,6324.29735,1729.79963,2.61883
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


In [12]:
df_sales['shop_id'] = df_sales['shop_id'].apply(lambda x: valid_shop_id(x))
df_sales['revenue'] = df_sales['item_cnt_day'] * df_sales['item_price']

In [13]:
#df_monthly_sales = df_sales.groupby(['date_block_num','shop_id','item_id'])[['item_cnt_day', 'revenue']].sum().reset_index()#.to_frame(name='item_cnt_month').reset_index()
df_monthly_sales = df_sales.groupby(['date_block_num','shop_id','item_id'])['item_cnt_day'].sum().to_frame(name='item_cnt_month').reset_index()
df_monthly_sales.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,2,27,1.0
1,0,2,33,1.0
2,0,2,317,1.0
3,0,2,438,1.0
4,0,2,471,2.0


In [14]:
df_monthly_sales.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
count,1609124.0,1609124.0,1609124.0,1609124.0
mean,14.66479,33.09288,10680.99027,2.2672
std,9.54232,16.46666,6238.8831,8.64988
min,0.0,2.0,0.0,-22.0
25%,6.0,21.0,5045.0,1.0
50%,14.0,31.0,10497.0,1.0
75%,23.0,48.0,16060.0,2.0
max,33.0,59.0,22169.0,2253.0


In [15]:
df_monthly_sales['item_cnt_month'] = np.clip(df_monthly_sales['item_cnt_month'], 0, 20)

In [16]:
df_monthly_sales.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
count,1609124.0,1609124.0,1609124.0,1609124.0
mean,14.66479,33.09288,10680.99027,2.02281
std,9.54232,16.46666,6238.8831,2.57796
min,0.0,2.0,0.0,0.0
25%,6.0,21.0,5045.0,1.0
50%,14.0,31.0,10497.0,1.0
75%,23.0,48.0,16060.0,2.0
max,33.0,59.0,22169.0,20.0


In [17]:
df = df_monthly_sales.pivot_table(index=['shop_id','item_id'], columns=['date_block_num'], values='item_cnt_month', fill_value=0)
df.reset_index(inplace=True)
df.head()

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,2,27,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,30,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,31,0,4,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2,32,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,2,33,1,0,0,0,0,0,0,0,...,0,1,0,1,1,0,1,0,1,0


In [18]:
df_test = pd.read_csv("test.csv")
df_test.describe()

Unnamed: 0,ID,shop_id,item_id
count,214200.0,214200.0,214200.0
mean,107099.5,31.64286,11019.39863
std,61834.35817,17.56193,6252.64459
min,0.0,2.0,30.0
25%,53549.75,16.0,5381.5
50%,107099.5,34.5,11203.0
75%,160649.25,47.0,16071.5
max,214199.0,59.0,22167.0


In [19]:
df_test = pd.merge(df_test, df, on=['shop_id','item_id'], how='left')
df_test = df_test.fillna(0)
df_test.head()

Unnamed: 0,ID,shop_id,item_id,0,1,2,3,4,5,6,...,24,25,26,27,28,29,30,31,32,33
0,0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,1,5,5320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,5,5268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
df_test.drop(['ID'], axis=1, inplace=True)

In [21]:
%%time
df_test['shop_type'] = df_test['shop_id'].apply(get_shop_type)

Wall time: 1min 19s


In [22]:
%%time
df_test['shop_mega'] = df_test['shop_id'].apply(lambda x: get_shop_megality(x))

Wall time: 1min 17s


In [23]:
%%time
df_test['category'] = df_test['item_id'].apply(lambda x: get_category(x))

Wall time: 2min 39s


In [24]:
%%time
df_test['digital'] = df_test['item_id'].apply(lambda x: get_category_digitality(x))

Wall time: 2min 39s


In [25]:
%%time
df_test['city'] = df_test['shop_id'].apply(lambda x: get_shop_city(x))

Wall time: 1min 21s


In [26]:
%%time
df_test['city_type'] = df_test['city'].apply(lambda x: get_city_type(x))

Wall time: 65.8 ms


In [27]:
df_test.head()

Unnamed: 0,shop_id,item_id,0,1,2,3,4,5,6,7,...,30,31,32,33,shop_type,shop_mega,category,digital,city,city_type
0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,1.0,0.0,ТРЦ,0,Игры,0,Вологда,RegionalCenter
1,5,5320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,ТРЦ,0,Музыка,0,Вологда,RegionalCenter
2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,1.0,ТРЦ,0,Игры,0,Вологда,RegionalCenter
3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,ТРЦ,0,Игры,0,Вологда,RegionalCenter
4,5,5268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,ТРЦ,0,Игры,0,Вологда,RegionalCenter


In [28]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score

date_ix = 0

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributes_names):
        self.attributes_names = attributes_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attributes_names].values # convert to NumPy array

class CycleTransformator( BaseEstimator, TransformerMixin):
    #Class Constructor 
    def __init__( self,  cycle_columns ):
        self._cycle_columns = cycle_columns
        self._cycle_stats = {}
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        for column in self._cycle_columns:
            self._cycle_stats[column] = { 'max': X[column].max(), 'min': X[column].min() }
        return self
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        self._df = pd.DataFrame(index=X.index)
        for column in self._cycle_columns:
            self._df[column+'_sin'] = np.sin(2*np.pi/(self._cycle_stats[column]['max']+1)*X[column].fillna(self._cycle_stats[column]['min']-1))
            self._df[column+'_cos'] = np.cos(2*np.pi/(self._cycle_stats[column]['max']+1)*X[column].fillna(self._cycle_stats[column]['min']-1))

        #self._df = self._df.reset_index(drop=True)
        #print('Cycle transform shape is {}'.format(self._df.values.shape))
        return self._df.values

In [44]:
num_attribs = [i for i in range(33)]
cat_attribs = ['city_type','shop_type', 'category','shop_mega','digital']
#num_attribs = ['item_id','shop_id','digital']
#cat_attribs = ['category']
date_attribs = ['month']

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
#    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(sparse=False)),
])
counted_pipeline = Pipeline([
    ('cycle_transformator', CycleTransformator(cycle_columns=date_attribs)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
#    ('counted_pipeline', counted_pipeline),
])

In [45]:
X_train = df_test.copy()
X_train_prepared = full_pipeline.fit_transform(X_train)
X_train_prepared.shape

(214200, 64)

In [46]:
#X_train_data = X_test[[i for i in range(33)]]
Y_train = X_train[33]

In [47]:
#X_train = np.concatenate((X_prepared, X_train_data), axis=1)
#X_train.shape

In [48]:
num_attribs = [i for i in range(1, 34)]
X_test = full_pipeline.transform(X_train)
X_test.shape

(214200, 64)

In [49]:
#X_test_all = np.concatenate((X_prepared, X_test_data), axis=1)
#X_test_all.shape

In [50]:
#X_train = df_test.drop([33], axis=1)
#Y_train = df_test[33].values
#X_test = df_test.drop([0], axis=1)
#Y_train = np.clip(Y_train, 0, 20)

In [51]:
%%time
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std Deviation:", scores.std())

# train
tree_reg = DecisionTreeRegressor(random_state=57)
tree_reg.fit(X_train_prepared, Y_train)
# predict
predictions = tree_reg.predict(X_train_prepared)

scores = cross_val_score(tree_reg, X_train_prepared, Y_train, cv=3,scoring="neg_mean_squared_error")
tree_rmse_scores = np.sqrt(-scores)

display_scores(tree_rmse_scores)
#print("Metric: %.2f" % CountScore(Y_train.values, predictions) )
print("R2-score: %.2f" % r2_score(predictions , Y_train) )

Scores: [1.34906669 1.0130688  0.84789479]
Mean: 1.0700100949798188
Std Deviation: 0.20852665798568643
R2-score: 0.42
Wall time: 10.4 s


In [52]:
from sklearn.linear_model import LinearRegression

# train
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, Y_train)
# predict
predictions = lin_reg.predict(X_train_prepared)

scores = cross_val_score(lin_reg, X_train_prepared, Y_train, cv=3, scoring="neg_mean_squared_error")
lin_rmse_scores = np.sqrt(-scores)

display_scores(lin_rmse_scores)
print("R2-score: %.2f" % r2_score(predictions , Y_train) )

Scores: [1.05319516 0.87580942 0.71794713]
Mean: 0.8823172348930833
Std Deviation: 0.1369417719186404
R2-score: -0.98


In [53]:
%%time
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=57, n_jobs=4, verbose=1)
print('Fitting...')
forest_reg.fit(X_train_prepared, Y_train)
# predict
print('Predicting...')
predictions = forest_reg.predict(X_train_prepared)

print('Cross validating...')
scores = cross_val_score(forest_reg, X_train_prepared, Y_train, cv=3, scoring="neg_mean_squared_error", n_jobs=4, verbose=1)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)
print("R2-score: %.2f" % r2_score(predictions , Y_train) )

Fitting...


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   31.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


Predicting...


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Cross validating...
Scores: [1.06783534 0.86472873 0.72798455]
Mean: 0.886849540593153
Std Deviation: 0.13962243649304915
R2-score: 0.19
Wall time: 3min 11s


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:  2.0min finished


In [None]:
#df_test[[10,22,33]].describe()

In [None]:
#df_test.loc[df_test[10] == 989]

In [None]:
#df_test.loc[df_test[22] == 772]

In [None]:
#df_test.loc[df_test[33] == 2253]

In [55]:
%%time
forest_reg.fit(X_train_prepared, Y_train)
df_test[34] = forest_reg.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   31.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s


Wall time: 1min 12s


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.9s finished


In [56]:
df_test[34].describe()

count   214200.00000
mean         0.26361
std          0.77134
min          0.00000
25%          0.02443
50%          0.07096
75%          0.18559
max         20.00000
Name: 34, dtype: float64

In [57]:
#df_test['34_scaled'] = (20*(df_test[34] - np.min(df_test[34]))/np.ptp(df_test[34]))   
#df_test['34_scaled'] = np.clip(df_test[34], 0, 20)

In [58]:
#df_test['34_scaled'].describe()

In [59]:
df_submission = pd.read_csv('sample_submission.csv')
#df_submission['item_cnt_month'] = df_test['34_scaled']
df_submission['item_cnt_month'] = df_test[34]
df_submission.to_csv('submission09_2.csv', index=False)
df_submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.23
1,1,0.05271
2,2,0.90548
3,3,0.09157
4,4,0.45158


submission09_1.csv
just now by Andrey Vest

Random Forest, merge test, clipped all, shop features + standard scale
1.04870

submission09_2.csv
just now by Andrey Vest

Random Forest, merge test, clipped all, shop & item categories features(city -> city type)
1.05350