## V9

In [None]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime, timedelta

# Render every float with five digits after the decimal point.
pd.set_option('display.float_format', '{:.5f}'.format)

# Use 3 decimal places in output display
# NOTE(review): the explicit float_format above presumably takes precedence
# over display.precision for floats - confirm which of the two is wanted.
pd.set_option("display.precision", 3)

CATEGORY_FREQ = 100

In [None]:
# get valid shop id
def valid_shop_id(id):
    """Return the shop id that `id` is remapped to; unlisted ids pass through unchanged."""
    # Remapped ids (presumably duplicate shop entries - verify against shops.csv).
    # The 23 -> 24 remap was already disabled in the original code and stays off.
    remapped = {0: 57, 1: 58, 11: 10, 40: 39}
    return remapped.get(id, id)

In [None]:
# Load the rows to predict from test.csv (read from the working directory).
df_test = pd.read_csv("test.csv")
df_test.describe()

In [None]:
# Load the raw sales history from sales_train.csv (read from the working directory).
df_sales = pd.read_csv("sales_train.csv")
df_sales.describe()

In [None]:
%%time
# 1. Remap shop ids through valid_shop_id.
# 2. Keep only rows with a positive item_cnt_day.
# 3. Collapse to one row per (date_block_num, shop_id, item_id):
#    counts are summed, prices are averaged.
df_sales['shop_id'] = df_sales['shop_id'].map(valid_shop_id)
df_sales = df_sales[df_sales['item_cnt_day'] > 0]
df_sales = (
    df_sales
    .groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg({'item_cnt_day': 'sum', 'item_price': 'mean'})
    .reset_index()
)
df_sales.head()

In [None]:
#Clip
# Limit the aggregated count to the [0, 20] range.
df_sales['item_cnt_day'] = df_sales['item_cnt_day'].clip(0, 20)

In [None]:
# Calendar month (1-12) derived from the running block number.
df_sales['month'] = df_sales['date_block_num']%12 + 1
# The test rows are all tagged as block 34; 34 % 12 + 1 == 11, matching the month set here.
df_test['month'] = 11
df_test['date_block_num'] = 34

In [None]:
# Show the last rows of the aggregated sales frame.
df_sales.tail()

In [None]:
# Summary statistics of the aggregated sales frame.
df_sales.describe()

In [None]:
# Column dtypes and non-null counts of the aggregated sales frame.
df_sales.info()

In [None]:
#df = df_sales.pivot_table(index=['shop_id','item_id'], columns=['date_block_num'], values='item_cnt_day', fill_value=np.nan).reset_index()
#df.head

In [None]:
#df.loc[(df['shop_id']==2)&(df['item_id']==27), ''.join(('item_prev', str(1)))]

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import r2_score
import datetime as dt
import holidays
from itertools import product
import re


# Constants consumed by the transformers and pipelines defined further down.
MAX_ITEM_FEATURES = 25  # handed to ItemTransformator as max_features
date_ix = 0
# First and last calendar day passed to DateTransformator.
start_date = dt.datetime(2013, 1, 1)
end_date = dt.datetime(2015, 11, 30)

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks the configured columns out of a DataFrame."""

    def __init__(self, attributes_names):
        self.attributes_names = attributes_names

    def fit(self, X, y=None):
        """Nothing is learned; returns self so the step can be chained."""
        return self

    def transform(self, X):
        """Return the selected columns of X as a NumPy array."""
        selected = X[self.attributes_names]
        return selected.values

class CycleTransformator(BaseEstimator, TransformerMixin):
    """Encode cyclic numeric columns (e.g. a month number) as sine/cosine pairs.

    For each configured column the period is ``max + 1`` of the values seen in
    ``fit``; missing values are replaced by ``min - 1`` before encoding.

    Parameters
    ----------
    cycle_columns : list of str
        Names of the columns of X to encode.
    """

    def __init__(self, cycle_columns):
        # scikit-learn's get_params()/clone()/repr read every constructor
        # argument back from an attribute of the same name, so it must be
        # stored under its public name (previously only `_cycle_columns`
        # existed and get_params() raised AttributeError).
        self.cycle_columns = cycle_columns
        # Old private name kept for any code that still reads it.
        self._cycle_columns = cycle_columns
        self._cycle_stats = {}

    def fit(self, X, y=None):
        """Record the min and max of every cyclic column of X; returns self."""
        # Rebuilt from scratch so a second fit never keeps stale columns.
        self._cycle_stats = {
            column: {'max': X[column].max(), 'min': X[column].min()}
            for column in self.cycle_columns
        }
        return self

    def transform(self, X, y=None):
        """Return an array with ``<column>_sin`` and ``<column>_cos`` for each column.

        The intermediate frame is kept on ``self._df`` so its column names can
        be read back after the transform.
        """
        self._df = pd.DataFrame(index=X.index)
        for column in self.cycle_columns:
            stats = self._cycle_stats[column]
            # Angle is computed once and shared by the sine and cosine features.
            angle = 2*np.pi/(stats['max']+1)*X[column].fillna(stats['min']-1)
            self._df[column+'_sin'] = np.sin(angle)
            self._df[column+'_cos'] = np.cos(angle)
        return self._df.values

class ItemTransformator(BaseEstimator, TransformerMixin):
    """TF-IDF features of the item name looked up by ``item_id``.

    Item names are read from ``items.csv`` in the working directory, tokenised
    with NLTK (Russian), stripped of punctuation and stop words, and stemmed.

    Parameters
    ----------
    max_features : int
        Passed to ``TfidfVectorizer(max_features=...)``.
    """

    def __init__(self, max_features):
        # Stored under its public name so get_params()/clone()/repr work;
        # previously the argument was not kept at all.
        self.max_features = max_features
        self._df_items = pd.read_csv("items.csv")
        self._snowball = SnowballStemmer(language="russian")
        # A set makes the per-token stop-word test a constant-time lookup.
        self._russian_stop_words = set(stopwords.words("russian"))
        # A bound method (unlike the former lambda) can be pickled with the estimator.
        self._vectorizer = TfidfVectorizer(tokenizer=self.__tokenize_sentence, max_features=max_features)

    def __tokenize_sentence(self, sentence: str):
        """Split a name into stemmed tokens, dropping punctuation and stop words."""
        tokens = word_tokenize(sentence, language="russian")
        return [
            self._snowball.stem(token)
            for token in tokens
            if token not in string.punctuation and token not in self._russian_stop_words
        ]

    def fit(self, X, y=None):
        """Learn the vocabulary from every name in items.csv (X is not used); returns self."""
        self._vectorizer.fit(self._df_items['item_name'])
        return self

    def transform(self, X, y=None):
        """Return the TF-IDF matrix (as produced by the vectorizer) for the items in X.

        X must have an ``item_id`` column. Rows whose id is not in items.csv get
        an empty name, i.e. an all-zero feature row, instead of making the
        vectorizer fail on NaN.
        """
        names = X[['item_id']].merge(
            self._df_items[['item_id', 'item_name']], how='left', on='item_id'
        )['item_name']
        return self._vectorizer.transform(names.fillna(''))


class DateTransformator(BaseEstimator, TransformerMixin):
    """Number of weekend days and Russian public holidays in each date block.

    Parameters
    ----------
    start_date, end_date : datetime
        First and last day (inclusive) of the calendar the counts are built from.
    """

    def __init__(self, start_date, end_date):
        # Stored under their public names so get_params()/clone()/repr work;
        # previously the arguments were consumed without being kept.
        self.start_date = start_date
        self.end_date = end_date
        self._ru_holidays = holidays.Russia()
        self._calendar = self._build_calendar()
        self._date_dict = {}

    def _build_calendar(self):
        """Return a one-column frame ("date") with every day from start_date to end_date."""
        n_days = (self.end_date - self.start_date + dt.timedelta(days=1)).days
        days = [self.start_date + dt.timedelta(days=offset) for offset in range(n_days)]
        return pd.Series(days).rename("date").to_frame()

    def __get_holydays(self, block_num):
        return self._date_dict['holyday'][block_num]

    def __get_weekends(self, block_num):
        return self._date_dict['weekend'][block_num]

    def fit(self, X, y=None):
        """Build the per-block weekend and holiday counts (X is not used); returns self."""
        # Rebuilt here so a changed start/end date (set_params) is honoured.
        calendar = self._build_calendar()
        calendar["bank_holiday"] = calendar["date"].apply(lambda day: self._ru_holidays.get(day))
        calendar["weekday"] = calendar["date"].apply(lambda day: day.isoweekday())
        # ISO weekdays 6 and 7 are Saturday and Sunday.
        calendar['weekend'] = calendar['weekday'].isin((6, 7)).astype(int)
        calendar['holyday'] = calendar['bank_holiday'].notna().astype(int)
        # Block 0 is January 2013; one block per calendar month.
        calendar['date_block_num'] = (calendar['date'].dt.year - 2013) * 12 + calendar['date'].dt.month - 1
        self._calendar = calendar
        self._date_dict = calendar[['date_block_num', 'weekend', 'holyday']].groupby('date_block_num').sum().to_dict()
        return self

    def transform(self, X, y=None):
        """Return a two-column array (holidays, weekends) for X['date_block_num'].

        Raises KeyError for a block outside the fitted calendar.
        """
        self._df = pd.DataFrame(index=X.index)
        self._df['holydays'] = X['date_block_num'].apply(self.__get_holydays)
        self._df['weekends'] = X['date_block_num'].apply(self.__get_weekends)
        return self._df.values

class _LagFeatureTransformator(BaseEstimator, TransformerMixin):
    """Shared implementation of the per-(shop, item) lag features.

    ``fit`` pivots one value column of X into a (shop_id, item_id) x date block
    table and derives, for every block ``b`` from 0 up to one past the last
    fitted block:

    * ``<prefix>_prev<b>``      - value in block ``b - 1``
    * ``<prefix>_prev_diff<b>`` - ``prev<b> - prev<b-1>``
    * ``<prefix>_mean3_<b>``    - row-wise mean of the 3 preceding blocks (NaN for b < 3)
    * ``<prefix>_mean6_<b>``    - row-wise mean of the 6 preceding blocks (NaN for b < 6)

    ``transform`` looks the prev/mean3/mean6 values up for each row of X.
    Subclasses only choose the value column and the prefix.
    """

    _KEY_COLS = ['shop_id', 'item_id']
    _BLOCK_COL = 'date_block_num'
    _value_col = None  # column of X that is lagged; set by subclasses
    _prefix = None     # prefix of the generated column names; set by subclasses

    def __init__(self):
        self._sales = None

    @staticmethod
    def _trailing_mean(monthly, block, window, missing):
        """Row-wise mean (NaNs skipped) of the `window` blocks before `block`."""
        if block < window:
            return missing
        return monthly[list(range(block - window, block))].mean(axis=1)

    def fit(self, X, y=None):
        """Build the lag table from X and keep it on ``self._sales``; returns self."""
        prefix = self._prefix
        n_blocks = int(X[self._BLOCK_COL].max()) + 1
        # Pivot X directly. The former Cartesian (block x shop x item) grid was
        # merged in and then discarded again by pivot_table (all-NaN groups are
        # dropped), so it only cost memory; for prices its columns were also
        # mislabelled, which threw away almost every real row.
        monthly = X.pivot_table(index=self._KEY_COLS, columns=self._BLOCK_COL, values=self._value_col)
        # Guarantee one column per block even if a block has no rows at all.
        monthly = monthly.reindex(columns=range(n_blocks))

        missing = pd.Series(np.nan, index=monthly.index)
        lagged = {
            prefix + '_prev0': missing,
            prefix + '_mean3_0': missing,
            prefix + '_mean6_0': missing,
        }
        # Runs one block past the data so the month to predict has features too
        # (previously its columns were never created -> KeyError in transform).
        for block in range(1, n_blocks + 1):
            suffix = str(block)
            previous = monthly[block - 1]
            lagged[prefix + '_prev' + suffix] = previous
            lagged[prefix + '_prev_diff' + suffix] = previous - lagged[prefix + '_prev' + str(block - 1)]
            # Means are taken per row; the former np.nanmean call had no axis
            # and wrote one global scalar into every row.
            lagged[prefix + '_mean3_' + suffix] = self._trailing_mean(monthly, block, 3, missing)
            lagged[prefix + '_mean6_' + suffix] = self._trailing_mean(monthly, block, 6, missing)

        # Single concat instead of ~140 column inserts into a wide frame.
        self._sales = pd.concat([monthly, pd.DataFrame(lagged)], axis=1).reset_index()
        return self

    def transform(self, X):
        """Return an (n_rows, 3) float array: prev, mean3, mean6 for each row of X.

        X needs ``date_block_num``, ``shop_id`` and ``item_id``. X itself is not
        modified. Rows with an unseen (shop, item) pair or a block without
        fitted columns get NaN.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self._sales is None:
            raise RuntimeError('transform() called before fit()')
        prefix = self._prefix
        keys = X[[self._BLOCK_COL] + self._KEY_COLS].reset_index(drop=True)
        features = np.full((len(keys), 3), np.nan)
        available = set(self._sales.columns)
        # One vectorised merge per block instead of a Python loop over rows.
        for block, rows in keys.groupby(self._BLOCK_COL):
            suffix = str(int(block))
            source = [prefix + '_prev' + suffix, prefix + '_mean3_' + suffix, prefix + '_mean6_' + suffix]
            if source[0] not in available:
                continue
            lookup = self._sales[self._KEY_COLS + source]
            # Left merge keeps the order of `rows`; (shop_id, item_id) is unique in the lookup.
            matched = rows[self._KEY_COLS].merge(lookup, on=self._KEY_COLS, how='left')
            features[rows.index.to_numpy()] = matched[source].to_numpy()
        return features


class ValueTransformator(_LagFeatureTransformator):
    """Lag features of the monthly item count: item_prev, item_mean3, item_mean6."""

    _value_col = 'item_cnt_day'
    _prefix = 'item'


class PriceTransformator(_LagFeatureTransformator):
    """Lag features of the monthly mean price: price_prev, price_mean3, price_mean6."""

    _value_col = 'item_price'
    _prefix = 'price'
    
class ShopTransformator(BaseEstimator, TransformerMixin):
    """Look up the city and shop type parsed from the shop name in shops.csv.

    shops.csv is read from the working directory when the object is created;
    shops 0 and 1 are removed from the lookup.
    """

    # Compiled once instead of on every call.
    _TYPE_PATTERN = re.compile(r"(Т[РКЦ]+)")
    # `+` (not `*`): an empty match used to count as a city, which made the
    # 'Unknown' fallback unreachable.
    _CITY_PATTERN = re.compile(r"^([а-яА-Я\.]+)")

    def __init__(self):
        df_shops = pd.read_csv("shops.csv", index_col='shop_id')
        df_shops = df_shops.drop([0, 1])
        df_shops['city'] = df_shops['shop_name'].apply(self.__shop_city)
        df_shops['type'] = df_shops['shop_name'].apply(self.__shop_type)
        self.__shops = df_shops[['city', 'type']].to_dict()

    def __shop_type(self, shop):
        """Return the matched type abbreviation, 'Онлайн' for online shops, else 'Магазин'."""
        found = self._TYPE_PATTERN.search(shop)
        if found is not None:
            return found.group(1)
        if 'нлайн' in shop or 'нтернет' in shop:
            return 'Онлайн'
        return 'Магазин'

    def __shop_city(self, shop):
        """Return the leading run of Cyrillic letters/dots of the name, or 'Unknown' if there is none."""
        found = self._CITY_PATTERN.search(shop)
        if found is not None:
            return found.group(1)
        return 'Unknown'

    def fit(self, X, y=None):
        """Nothing is learned from X; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a two-column array (city, type) for X['shop_id'].

        Raises KeyError for a shop id that is not in the lookup.
        """
        self._df = pd.DataFrame(index=X.index)
        self._df['city'] = X['shop_id'].apply(lambda shop_id: self.__shops['city'][shop_id])
        self._df['type'] = X['shop_id'].apply(lambda shop_id: self.__shops['type'][shop_id])
        return self._df.values

class CategoriesTransformator(BaseEstimator, TransformerMixin):
    """Look up a coarse category name and a 'digital' flag for each item_id.

    items.csv and item_categories.csv are read from the working directory when
    the object is created.
    """

    def __init__(self):
        df_items = pd.read_csv("items.csv")
        df_categories = pd.read_csv("item_categories.csv")
        # merge(on=...) discards the index of both frames, so item_id is made
        # the index *after* the merge. Before, the lookup was keyed by row
        # position, which only equals item_id when ids are 0..n-1 in file order.
        df_categories = df_items.merge(df_categories, how='left', on='item_category_id').set_index('item_id')
        # Coarse category: the text before the first '-' or '(' of the category name.
        df_categories['category'] = df_categories['item_category_name'].str.split('[-(]', n=0).str[0].str.strip()
        # na=False: an item without a category name counts as not digital
        # instead of breaking the int conversion.
        df_categories['digital'] = df_categories['item_category_name'].str.contains('цифра', case=False, na=False).astype(int)
        self.__categories = df_categories[['category', 'digital']].to_dict()

    def fit(self, X, y=None):
        """Nothing is learned from X; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a two-column array (category, digital) for X['item_id'].

        Raises KeyError for an item id that is not in items.csv.
        """
        self._df = pd.DataFrame(index=X.index)
        self._df['category'] = X['item_id'].apply(lambda item_id: self.__categories['category'][item_id])
        self._df['digital'] = X['item_id'].apply(lambda item_id: self.__categories['digital'][item_id])
        return self._df.values


In [None]:
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):
    """Print a set of regression metrics for the given targets and predictions.

    Printed, in order: explained variance, mean squared log error, R2, MAE,
    MSE and RMSE, each rounded to 4 decimals. Returns None.
    """
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    # The log error is rejected by scikit-learn (ValueError) when negative
    # values are present, which unclipped regression output can contain.
    # Report it as unavailable rather than losing the whole report.
    try:
        mean_squared_log_error = round(metrics.mean_squared_log_error(y_true, y_pred), 4)
    except ValueError:
        mean_squared_log_error = 'n/a'
    print('explained_variance: ', round(explained_variance,4))
    print('mean_squared_log_error: ', mean_squared_log_error)
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [None]:
from sklearn.metrics import make_scorer

def rmse(actual, predict):
    """Root of the mean squared difference between `predict` and `actual`."""
    errors = np.array(predict) - np.array(actual)
    return np.sqrt((errors ** 2).mean())

# rmse wrapped as a scikit-learn scorer and declared as a loss (greater_is_better=False).
rmse_score = make_scorer(rmse, greater_is_better = False)

In [None]:
#num_attribs = ['item_cnt_prev_month','item_cnt_prev_diff','prev_itemQ','item_cnt_prev_year']
#num_attribs = ['prev_month','prev_diff', 'prev_itemQ', 'mean3', 'mean6', 'weekends', 'holydays', 'prev_item_price', 'prev_shop_price', 'prev_item_month', 'prev_shop_month']
#num_attribs = ['prev_month', 'prev_itemQ', 'mean3', 'mean6', 'weekends', 'holydays', 'prev_item_price', 'prev_shop_price', 'prev_item_month', 'prev_shop_month']
#l12_cols = item_labels.columns.values
#num_attribs = np.concatenate([num_attribs,l12_cols])
#num_attribs = ['digital']
#cat_attribs = ['shop_cluster', 'category_cluster']
#cat_attribs = ['city_cluster','shop_cluster', 'category_cluster']
#cat_attribs = ['city_cluster','shop_cluster', 'category_cluster', 'shop_type', 'subcategory', 'category', 'city', 'shop_mega', 'digital']
#cat_attribs = ['shop_type', 'category', 'city', 'shop_mega', 'digital']
#cat_attribs = ['shop_type', 'category', 'city']
#cat_attribs = ['city_cluster','shop_cluster', 'category_cluster', 'city', 'shop_type', 'category']
#cat_attribs = ['shop_cluster', 'category']
#num_attribs = ['item_id','shop_id','digital']
#cat_attribs = ['category']
# Columns handed to CycleTransformator (see counted_pipeline below).
date_attribs = ['month']

#num_pipeline = Pipeline([
#    ('selector', DataFrameSelector(num_attribs)),
##    ('imputer', SimpleImputer(strategy="median")),
#    ('std_scaler', StandardScaler()),
#])
#cat_pipeline = Pipeline([
#    ('selector', DataFrameSelector(cat_attribs)),
#    ('cat_encoder', OneHotEncoder(sparse=False)),
#])
# One small pipeline per feature family. All of them are constructed here, but
# only the ones listed in full_pipeline further down are actually fitted.
# NOTE(review): `sparse=` is presumably renamed in newer scikit-learn releases -
# confirm against the installed version before enabling the encoder pipelines.
shop_pipeline = Pipeline([
    ('shop_transformator', ShopTransformator()),
    ('shop_encoder', OneHotEncoder(sparse=False)),
])

categories_pipeline = Pipeline([
    ('cat_transformator', CategoriesTransformator()),
    ('cat_encoder', OneHotEncoder(sparse=False)),
])

item_pipeline = Pipeline([
    ('item_transformator', ItemTransformator(max_features=MAX_ITEM_FEATURES)),
])
date_pipeline = Pipeline([
    ('date_transformator', DateTransformator(start_date=start_date, end_date=end_date)),
    ('date_scaler', StandardScaler()),
])
value_pipeline = Pipeline([
    ('value_transformator', ValueTransformator()),
    ('value_scaler', StandardScaler()),
])
price_pipeline = Pipeline([
    ('price_transformator', PriceTransformator()),
    ('price_scaler', StandardScaler()),
])
counted_pipeline = Pipeline([
    ('cycle_transformator', CycleTransformator(cycle_columns=date_attribs)),
])

# Feature matrix = horizontal concatenation of the enabled pipelines.
# Currently only value_pipeline is enabled; the rest are commented out.
#all_features_pipeline = FeatureUnion(transformer_list=[
full_pipeline = FeatureUnion(transformer_list=[
#    ('shop_pipeline',      shop_pipeline),
#    ('catgories_pipeline', categories_pipeline),
#    ('item_pipeline',      item_pipeline),
#    ('date_pipeline',      date_pipeline),
    ('value_pipeline',     value_pipeline),
#    ('price_pipeline',     price_pipeline),
#    ('counted_pipeline',   counted_pipeline),
])

#full_pipeline = FeatureUnion(transformer_list=[
#    ('all_features_pipeline', all_features_pipeline),
#    ('std_scaler', StandardScaler()),
#])


In [None]:
#from sklearn.model_selection import TimeSeriesSplit

#df_sells_in_month = df_sells_in_month.dropna()

#X_train_data = df_sells_in_month[df_sells_in_month['date_block_num'] < 24].copy()
#X_test_data = df_sells_in_month[df_sells_in_month['date_block_num'] > 23].copy()
#X_train = X_train_data[['item_cnt_prev_month','revenue_prev', 'item_cnt_prev_diff','revenue_prev', 'prev_itemQ', 'prev_revenueQ', 'shop_type', 'shop_mega', 'shop_cluster', 'city', 'city_cluster', 'category', 'subcategory', 'digital', 'category_cluster', 'month']]
#X_test = X_test_data[['item_cnt_prev_month','revenue_prev', 'prev_itemQ', 'prev_revenueQ', 'shop_type', 'shop_mega', 'shop_cluster', 'city', 'city_cluster', 'category', 'subcategory', 'digital', 'category_cluster', 'month']]

#X_train = X_train_data[['item_cnt_prev_month','revenue_prev', 'prev_itemQ', 'prev_revenueQ', 'shop_cluster', 'city_cluster', 'category_cluster', 'month']]
#X_test = X_test_data[['item_cnt_prev_month','revenue_prev', 'prev_itemQ', 'prev_revenueQ', 'shop_cluster', 'city_cluster', 'category_cluster', 'month']]

# X_all is another name for df_sales (no copy is made).
X_all = df_sales
# Time-based split: blocks 0-23 are used for training, blocks 24 and later for testing.
X_train = df_sales[df_sales['date_block_num'] < 24]
X_test = df_sales[df_sales['date_block_num'] > 23]


# Targets: the (clipped) monthly count of each row.
Y_train = X_train['item_cnt_day'].copy()
Y_test = X_test['item_cnt_day'].copy()



In [None]:
# The pipeline is fitted on every month (X_all), then applied separately to the
# train rows, the test rows and the rows to predict (df_test).
full_pipeline.fit(X_all)
X_train_prepared = full_pipeline.transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)
X_pred_prepared = full_pipeline.transform(df_test)
X_train_prepared.shape

In [None]:
#from sklearn.preprocessing import PolynomialFeatures

#poly = PolynomialFeatures(degree=2)
#X_train_prepared = poly.fit_transform(X_train_prepared)
#X_test_prepared = poly.transform(X_test_prepared)
#X_train_prepared.shape

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

def display_scores(scores):
    """Print the scores followed by their mean and standard deviation."""
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Std Deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
%%time

from sklearn.tree import DecisionTreeRegressor

# Decision tree with default settings and a fixed random_state.
# train
tree_reg = DecisionTreeRegressor(random_state=57)
tree_reg.fit(X_train_prepared, Y_train)

# predict (on the training rows; gives the in-sample score printed below)
predictions = tree_reg.predict(X_train_prepared)

#scores = cross_val_score(tree_reg, X_train_prepared, Y_train, cv=tscv, scoring="r2")
#print('%s: %f (%f)' % ('Tree: ', scores.mean(), scores.std()))
#tree_rmse_scores = np.sqrt(-scores)
#display_scores(tree_rmse_scores)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# Hold-out evaluation on the test split.
print('Testing...')
Y_pred = tree_reg.predict(X_test_prepared)
#Y_pred = np.clip(Y_pred, 0, 20)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))
#regression_results(Y_pred , Y_test)

In [None]:
%%time
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the prepared features.
# train
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, Y_train)
# predict (on the training rows; gives the in-sample score printed below)
predictions = lin_reg.predict(X_train_prepared)

#scores = cross_val_score(lin_reg, X_train_prepared, Y_train, cv=tscv, scoring="r2")
#print('%s: %f (%f)' % ('LinReg: ', scores.mean(), scores.std()))
#lin_rmse_scores = np.sqrt(-scores)
#display_scores(lin_rmse_scores)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# Hold-out evaluation on the test split.
print('Testing...')
Y_pred = lin_reg.predict(X_test_prepared)
#Y_pred = np.clip(Y_pred, 0, 20)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))
#regression_results(Y_test, Y_pred)

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor

# Random forest: 100 trees, fixed random_state, 3 parallel jobs, progress output on.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=57, n_jobs=3, verbose=1)
print('Fitting...')
forest_reg.fit(X_train_prepared, Y_train)
# predict (on the training rows; gives the in-sample score printed below)
print('Predicting...')
predictions = forest_reg.predict(X_train_prepared)

#print('Cross validating...')
#scores = cross_val_score(forest_reg, X_train_prepared, Y_train, cv=tscv, scoring="r2", n_jobs=4, verbose=1)
#print('%s: %f (%f)' % ('Forest: ', scores.mean(), scores.std()))
#forest_rmse_scores = np.sqrt(-scores)
#display_scores(forest_rmse_scores)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# Hold-out evaluation on the test split.
print('Testing...')
Y_pred = forest_reg.predict(X_test_prepared)
#Y_pred = np.clip(Y_pred, 0, 20)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))

In [None]:
# Per-feature importances of the fitted random forest (requires the forest cell to have run).
feature_importances = forest_reg.feature_importances_

In [None]:
# Names of the columns produced by full_pipeline, in FeatureUnion order.
# Only value_pipeline is enabled there, and ValueTransformator.transform returns
# item_prev, item_mean3, item_mean6 in that order. The previous version of this
# cell referenced num_attribs / l12_cols (never defined in this notebook) and
# read `_df` from counted_pipeline, which is not part of full_pipeline and is
# therefore never run.
# When more pipelines are enabled in full_pipeline, append their column names
# here in the same order.
attributes = ['item_prev', 'item_mean3', 'item_mean6']
if len(attributes) != len(feature_importances):
    raise ValueError(
        'expected %d feature names, got %d - update `attributes` to match full_pipeline'
        % (len(feature_importances), len(attributes))
    )
sorted(zip(feature_importances, attributes), reverse=True)

In [None]:
%%time
# Lasso
from sklearn.linear_model import Lasso
from sklearn.model_selection import TimeSeriesSplit

alpha = 0.1
lasso = Lasso(alpha=alpha)

lasso.fit(X_train_prepared, Y_train)
# predict (on the training rows; gives the in-sample score printed below)
predictions = lasso.predict(X_train_prepared)

# `tscv` was used here without being defined anywhere in the notebook (its
# import is commented out), so the cell stopped with a NameError. The training
# rows come from a groupby that sorts by date_block_num first, so an
# order-preserving time-series split is the matching cross-validation scheme.
tscv = TimeSeriesSplit()
scores = cross_val_score(lasso, X_train_prepared, Y_train, cv=tscv, scoring="r2")
print('%s: %f (%f)' % ('Lasso: ', scores.mean(), scores.std()))
#lin_rmse_scores = np.sqrt(-scores)
#display_scores(lin_rmse_scores)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# Hold-out evaluation on the test split.
print('Testing...')
Y_pred = lasso.predict(X_test_prepared)
#Y_pred = np.clip(Y_pred, 0, 20)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))

In [None]:
%%time
# ElasticNet
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import TimeSeriesSplit

# `alpha` is the value set in the Lasso cell.
enet = ElasticNet(alpha=alpha, l1_ratio=0.8)
enet.fit(X_train_prepared, Y_train)
# predict (on the training rows; gives the in-sample score printed below)
predictions = enet.predict(X_train_prepared)

# `tscv` was used here without being defined anywhere in the notebook (its
# import is commented out), so the cell stopped with a NameError. The training
# rows come from a groupby that sorts by date_block_num first, so an
# order-preserving time-series split is the matching cross-validation scheme.
tscv = TimeSeriesSplit()
scores = cross_val_score(enet, X_train_prepared, Y_train, cv=tscv, scoring="r2")
print('%s: %f (%f)' % ('Enet: ', scores.mean(), scores.std()))
#lin_rmse_scores = np.sqrt(-scores)
#display_scores(lin_rmse_scores)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# Hold-out evaluation on the test split.
print('Testing...')
Y_pred = enet.predict(X_test_prepared)
#Y_pred = np.clip(Y_pred, 0, 20)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))

In [None]:
%%time
from xgboost import XGBRegressor

# Gradient-boosted trees with default settings and a fixed random_state.
# This is the model used for the final prediction cell further down.
boost_reg = XGBRegressor(random_state=57, verbosity=1)
print('Fitting...')
boost_reg.fit(X_train_prepared, Y_train)
# predict (on the training rows; gives the in-sample score printed below)
print('Predicting...')
predictions = boost_reg.predict(X_train_prepared)

#print('Cross validating...')
#scores = cross_val_score(forest_reg, X_train_prepared, Y_train, cv=tscv, scoring="r2", n_jobs=4, verbose=1)
#print('%s: %f (%f)' % ('Forest: ', scores.mean(), scores.std()))
#forest_rmse_scores = np.sqrt(-scores)
#display_scores(forest_rmse_scores)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# Hold-out evaluation on the test split.
print('Testing...')
Y_pred = boost_reg.predict(X_test_prepared)
#Y_pred = np.clip(Y_pred, 0, 20)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))

In [None]:
%%time
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression with default settings, 3 parallel jobs.
# train
neighbor_reg = KNeighborsRegressor(n_jobs=3)
print('Fitting...')
neighbor_reg.fit(X_train_prepared, Y_train)
# predict (on the training rows; gives the in-sample score printed below)
print('Predicting...')
predictions = neighbor_reg.predict(X_train_prepared)

#print('Cross validating...')
#scores = cross_val_score(neighbor_reg, X_train_prepared, Y_train, cv=tscv, scoring="r2", n_jobs=4, verbose=1)
#print('%s: %f (%f)' % ('NeighborReg: ', scores.mean(), scores.std()))
#lin_rmse_scores = np.sqrt(-scores)
#display_scores(lin_rmse_scores)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# Hold-out evaluation on the test split.
print('Testing...')
Y_pred = neighbor_reg.predict(X_test_prepared)
#Y_pred = np.clip(Y_pred, 0, 20)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))

In [None]:
#df_test['34_scaled'] = (20*(df_test[34] - np.min(df_test[34]))/np.ptp(df_test[34]))   
#df_test['34'] = np.clip(df_test[34], 0, 20)

In [None]:
#X_prepared = full_pipeline.transform(X_pred)
#Y_pred = enet.predict(X_pred_prepared)
# Final prediction for the df_test rows, using the XGBoost model.
# Note that this overwrites the Y_pred left behind by the evaluation cells.
Y_pred = boost_reg.predict(X_pred_prepared)

In [None]:
df_sells_in_month_34['item_cnt_month'] = 0
df_sells_in_month_34.loc[df_sells_in_month_34['prev_month'] > 0, 'item_cnt_month'] = np.clip(Y_pred, 0, 20)
df_sells_in_month_34['item_cnt_month'].describe()

In [None]:
df_test = pd.merge(df_test, df_sells_in_month_34, on=['shop_id','item_id'], how='left')
df_test = df_test.fillna(0)
df_test.head()

In [None]:
# Write the submission file: one row per test ID with its predicted monthly count.
df_test[['ID', 'item_cnt_month']].to_csv('submission111_1.csv', index=False)

In [None]:
#df_submission = pd.read_csv('sample_submission.csv')
#df_submission['item_cnt_month'] = df_test['34_scaled']
#df_submission['item_cnt_month'] = np.clip(df_prediction['item_cnt_month'], 0, 20)
#df_submission['item_cnt_month'] = np.clip(Y_pred, 0, 20)
#df_submission.to_csv('submission107_1.csv', index=False)
#df_submission.head()

In [None]:
# Smallest raw (unclipped) prediction.
np.min(Y_pred)

submission107_2.csv
a few seconds ago by Andrey Vest

RandomForest, merge test after prediction, previous , -cluster features + item features, clip only targets
1.14185