# CatBoost Implementation Avito

by: Traci

This notebook builds off several public kernels on kaggle. 

I removed the lightgbm implementation and inserted a CatBoost model. Since there are no public CatBoost kernels that utilizes the tdidf sparse matrices, I applied truncated SVD on them to output 10 columns in `dataframe` format (components). This process, also known as latent semantic analysis (LSA), essentially reduce the number of rows while preserving the similarity structure among columns.

Note: Choosing 10 is not a good number here. Although it does explain 9% of the variance, to build a competitve CatBoost model requires 200-500 componenents. I chose 10 purely for experimentating and speed.

Data can be downloaded from https://www.kaggle.com/c/avito-demand-prediction.


In [1]:
#Initially forked from Bojan's kernel here: https://www.kaggle.com/tunguz/bow-meta-text-and-dense-features-lb-0-2242/code
#improvement using kernel from Nick Brook's kernel here: https://www.kaggle.com/nicapotato/bow-meta-text-and-dense-features-lgbm
#Used oof method from Faron's kernel here: https://www.kaggle.com/mmueller/stacking-starter?scriptVersionId=390867
#Used some text cleaning method from Muhammad Alfiansyah's kernel here: https://www.kaggle.com/muhammadalfiansyah/push-the-lgbm-v19
#Forked From - https://www.kaggle.com/him4318/avito-lightgbm-with-ridge-feature-v-2-0

import time
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import random
random.seed(2018)
from scipy import sparse
print("Data:\n",os.listdir("data"))

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from catboost import CatBoostRegressor
from sklearn.decomposition import TruncatedSVD

# Gradient Boosting
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold

# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

NFOLDS = 5
SEED = 2018
VALID = True
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None, seed_bool = True):
        if(seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
        
def get_oof(clf, x_train, y, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
    
def cleanName(text):
    try:
        textProc = text.lower()
        # textProc = " ".join(map(str.strip, re.split('(\d+)',textProc)))
        #regex = re.compile(u'[^[:alpha:]]')
        #textProc = regex.sub(" ", textProc)
        textProc = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', textProc)
        textProc = " ".join(textProc.split())
        return textProc
    except: 
        return "name error"
    
    
def rmse(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power((y - y0), 2)))

Data:
 ['aggregated_features.csv', 'aggregated_features_v3.csv', 'aggregated_features_v5.csv', 'periods_test.csv', 'periods_train.csv', 'target_encoded.csv', 'test.csv', 'train.csv']




In [31]:
print("\nData Load Stage")
training = pd.read_csv('data/train.csv', index_col = "item_id", parse_dates = ["activation_date"])#.sample(1000)
traindex = training.index
testing = pd.read_csv('data/test.csv', index_col = "item_id", parse_dates = ["activation_date"])#.sample(1000)
testdex = testing.index

ntrain = training.shape[0]
ntest = testing.shape[0]

kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

y = training.deal_probability.copy()
training.drop("deal_probability",axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))


Data Load Stage
Train shape: 1503424 Rows, 16 Columns
Test shape: 508438 Rows, 16 Columns


In [3]:
print("Combine Train and Test")
df = pd.concat([training,testing],axis=0)
del training, testing
gc.collect()
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

Combine Train and Test

All Data shape: 2011862 Rows, 16 Columns


## Add in aggregated features

In [4]:
df = df.reset_index()
gp = pd.read_csv('data/aggregated_features.csv') 
df = df.merge(gp, on='user_id', how='left')
del gp
gc.collect()
df = df.set_index('item_id')

In [None]:
df["price"] = np.log(df["price"]+0.001)
df["price"].fillna(df.price.mean(),inplace=True)
df["image_top_1"].fillna(-999,inplace=True)

df["avg_days_up_user"] = np.log(df["avg_days_up_user"]+0.001)
df["avg_days_up_user"].fillna(-999,inplace=True)
df["avg_times_up_user"] = np.log(df["avg_times_up_user"]+0.001)
df["avg_times_up_user"].fillna(-999,inplace=True)
df["n_user_items"] = np.log(df["n_user_items"]+0.001)
df["n_user_items"].fillna(-999,inplace=True)

In [6]:
print("\nCreate Time Variables")
df["Weekday"] = df['activation_date'].dt.weekday


Create Time Variables


In [7]:
df.drop(["activation_date","image"],axis=1,inplace=True)

## Label encoding of categorical features

In [34]:
print("\nEncode Variables")
categorical = ["user_id","region","city","parent_category_name","category_name","user_type","image_top_1","param_1","param_2","param_3"]
print("Encoding :",categorical)

# Encoder:
lbl = preprocessing.LabelEncoder()
for col in categorical:
    df[col].fillna('Unknown')
    df[col] = lbl.fit_transform(df[col].astype(str))
    


Encode Variables
Encoding : ['user_id', 'region', 'city', 'parent_category_name', 'category_name', 'user_type', 'image_top_1', 'param_1', 'param_2', 'param_3']


## Text feature engineering

In [9]:
print("\nText Features")

# Feature Engineering 

# Meta Text Features
textfeats = ["description", "title"]
df['desc_punc'] = df['description'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

df['title'] = df['title'].apply(lambda x: cleanName(x))
df["description"]   = df["description"].apply(lambda x: cleanName(x))

for cols in textfeats:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('missing') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
    #df[cols + '_num_char'] = df[cols].apply(lambda comment: len(str(comment)))
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100 # Count Unique Words
    df[cols + '_num_letters'] = df[cols].apply(lambda comment: len(comment)) # Count number of Letters
    df[cols + '_num_alphabets'] = df[cols].apply(lambda comment: (comment.count(r'[a-zA-Z]'))) # Count number of Alphabets
    df[cols + '_num_alphanumeric'] = df[cols].apply(lambda comment: (comment.count(r'[A-Za-z0-9]'))) # Count number of AlphaNumeric
    df[cols + '_num_digits'] = df[cols].apply(lambda comment: (comment.count('[0-9]'))) # Count number of Digits
    
# Extra Feature Engineering
df['avg_len_words_title'] = df['title_num_letters'] / df['title_num_words']
df['avg_len_words_desc'] = df['description_num_letters'] / df['description_num_words']
df['title_desc_len_ratio'] = df['title_num_letters']/df['description_num_letters']


Text Features


In [10]:
df.head()

Unnamed: 0_level_0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,...,title_num_words,title_num_unique_words,title_words_vs_unique,title_num_letters,title_num_alphabets,title_num_alphanumeric,title_num_digits,avg_len_words_title,avg_len_words_desc,title_desc_len_ratio
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b912c3c6a6ad,884270,19,462,4,42,249,112,1217,кокоби(кокон для сна),"кокон для сна малыша,пользовались меньше месяц...",...,3,3,100.0,21,0,0,0,7.0,8.285714,0.362069
2dac0150717d,227908,17,1314,2,22,122,112,1217,стойка для одежды,"стойка для одежды, под вешалки. с бутика.",...,3,3,100.0,17,0,0,0,5.666667,5.857143,0.414634
ba83aefab5dc,576261,16,1290,0,2,84,112,1217,philips bluray,"в хорошем состоянии, домашний кинотеатр с blu ...",...,2,2,100.0,14,0,0,0,7.0,5.823529,0.141414
02996f1dd2ea,755087,21,950,4,42,38,112,1217,автокресло,продам кресло от0-25кг,...,1,1,100.0,10,0,0,0,10.0,7.333333,0.454545
7c90be56d2ab,944363,4,318,6,0,278,124,46,"ваз 2110, 2003",все вопросы по телефону.,...,3,3,100.0,14,0,0,0,4.666667,6.0,0.583333


## TD-IDF

In [10]:
print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
russian_stop = set(stopwords.words('russian'))

tfidf_para = {
    "stop_words": russian_stop,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    #"min_df":5,
    #"max_df":.9,
    "smooth_idf":False
}


def get_col(col_name): return lambda x: x[col_name]
##I added to the max_features of the description. It did not change my score much but it may be worth investigating
vectorizer = FeatureUnion([
        ('description',TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=17000,
            **tfidf_para,
            preprocessor=get_col('description'))),
        ('title',CountVectorizer(
            ngram_range=(1, 2),
            stop_words = russian_stop,
            #max_features=7000,
            preprocessor=get_col('title')))
    ])
    
start_vect=time.time()

#Fit my vectorizer on the entire dataset instead of the training rows
#Score improved by .0001
vectorizer.fit(df.to_dict('records'))

ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))

# Drop Text Cols
textfeats = ["description", "title"]
df.drop(textfeats, axis=1,inplace=True)


[TF-IDF] Term Frequency Inverse Document Frequency Stage
Vectorization Runtime: 0.04 Minutes


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [13]:
#sparse.save_npz("avito_ridge_final/ready_df.npz", ready_df)
ready_df = sparse.load_npz("avito_ridge_2219/ready_df.npz")

In [14]:
ready_df

<2011862x1430760 sparse matrix of type '<class 'numpy.float64'>'
	with 48687245 stored elements in Compressed Sparse Row format>

In [22]:
df.drop(['description', 'title'], axis=1, inplace=True)

## Ridge 

In [15]:
from sklearn.metrics import mean_squared_error
from math import sqrt

ridge_params = {'alpha':30.0, 'fit_intercept':True, 'normalize':False, 'copy_X':True,
                'max_iter':None, 'tol':0.001, 'solver':'auto', 'random_state':SEED}

#Ridge oof method from Faron's kernel
#I was using this to analyze my vectorization, but figured it would be interesting to add the results back into the dataset
#It doesn't really add much to the score, but it does help lightgbm converge faster
ridge = SklearnWrapper(clf=Ridge, seed = SEED, params = ridge_params)
ridge_oof_train, ridge_oof_test = get_oof(ridge, ready_df[:ntrain], y, ready_df[ntrain:])


Fold 0

Fold 1

Fold 2

Fold 3

Fold 4


In [16]:
rms = sqrt(mean_squared_error(y, ridge_oof_train))
print('Ridge OOF RMSE: {}'.format(rms))

Ridge OOF RMSE: 0.23033890453576883


In [17]:
print("Modeling Stage")

ridge_preds = np.concatenate([ridge_oof_train, ridge_oof_test])

df['ridge_preds'] = ridge_preds

Modeling Stage


## Build SVD

In [None]:
# df.to_pickle("catboost_df_250618.pkl")
#df = pd.read_pickle("avito_ridge_2219/df_avito_ridge_2219.pkl")

In [18]:
%%time
### SVD Components ###

n_comp = 10
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(ready_df)

Wall time: 1min 4s


In [19]:
# The variance of the training samples transformed by a projection to each component.
print(svd_obj.explained_variance_ratio_)

[0.03229687 0.01412938 0.00876879 0.00767534 0.0061171  0.00573157
 0.00469193 0.00395342 0.00355947 0.00336352]


In [20]:
# Percentage of variance explained by each of the selected components.
print(svd_obj.explained_variance_ratio_.sum())

0.09028738466667074


In [21]:
full_svd = pd.DataFrame(svd_obj.transform(ready_df))
full_svd.columns = ['svd_title_'+str(i+1) for i in range(n_comp)]

In [23]:
#full_svd.to_pickle("catboost_final/full_svd_10.pkl")
full_svd = pd.read_pickle("catboost_final/full_svd_10.pkl")

In [23]:
full_svd.head()

Unnamed: 0,svd_title_1,svd_title_2,svd_title_3,svd_title_4,svd_title_5,svd_title_6,svd_title_7,svd_title_8,svd_title_9,svd_title_10
0,0.000216,0.000158,0.002612,0.005279,0.001896,0.000215,0.006193,0.000535,0.004482,-0.003907
1,0.000223,3.3e-05,0.002005,0.001609,0.000503,0.000138,0.002931,0.0005,0.001759,-0.002339
2,0.003949,0.00159,0.009981,0.022516,0.006024,0.002404,0.031217,0.00023,0.019422,-0.015721
3,0.002847,0.002523,0.008241,0.023886,0.000561,0.003839,0.008381,-0.000661,0.007033,-0.007617
4,0.005443,0.006585,0.004811,0.015149,0.003455,0.005731,0.016032,-0.000956,0.019447,-0.021726


In [25]:
df = df.reset_index()

df = pd.concat([df, full_svd], axis=1)

In [26]:
df = df.set_index('item_id')

In [27]:
df.shape

(2011862, 45)

In [28]:
df.head(5)

Unnamed: 0_level_0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,item_seq_number,...,svd_title_1,svd_title_2,svd_title_3,svd_title_4,svd_title_5,svd_title_6,svd_title_7,svd_title_8,svd_title_9,svd_title_10
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b912c3c6a6ad,884270,19,462,4,42,249,112,1217,5.991467,2,...,0.000216,0.000158,0.002612,0.005279,0.001896,0.000215,0.006193,0.000535,0.004482,-0.003907
2dac0150717d,227908,17,1314,2,22,122,112,1217,8.006368,19,...,0.000223,3.3e-05,0.002005,0.001609,0.000503,0.000138,0.002931,0.0005,0.001759,-0.002339
ba83aefab5dc,576261,16,1290,0,2,84,112,1217,8.29405,9,...,0.003949,0.00159,0.009981,0.022516,0.006024,0.002404,0.031217,0.00023,0.019422,-0.015721
02996f1dd2ea,755087,21,950,4,42,38,112,1217,7.696213,286,...,0.002847,0.002523,0.008241,0.023886,0.000561,0.003839,0.008381,-0.000661,0.007033,-0.007617
7c90be56d2ab,944363,4,318,6,0,278,124,46,10.596635,3,...,0.005443,0.006585,0.004811,0.015149,0.003455,0.005731,0.016032,-0.000956,0.019447,-0.021726


In [30]:
del full_svd, ready_df
gc.collect()

695

In [29]:
df.dtypes

user_id                           int64
region                            int64
city                              int64
parent_category_name              int64
category_name                     int64
param_1                           int64
param_2                           int64
param_3                           int64
price                           float64
item_seq_number                   int64
user_type                         int64
image_top_1                       int64
avg_days_up_user                float64
avg_times_up_user               float64
n_user_items                    float64
Weekday                           int64
desc_punc                         int64
description_num_words             int64
description_num_unique_words      int64
description_words_vs_unique     float64
description_num_letters           int64
description_num_alphabets         int64
description_num_alphanumeric      int64
description_num_digits            int64
title_num_words                   int64


## Split it back to train and test

In [32]:
X = df.loc[traindex,:].copy()
print("Training Set shape",X.shape)
test = df.loc[testdex,:].copy()
print("Submission Set Shape: {} Rows, {} Columns".format(*test.shape))
#del df
gc.collect()

Training Set shape (1503424, 45)
Submission Set Shape: 508438 Rows, 45 Columns


631

## Get categorical features indices

In [35]:
# Prepare Categorical Variables
def column_index(df, query_cols):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]
categorical_features_pos = column_index(X,categorical)
categorical_features_pos

array([ 0,  1,  2,  3,  4, 10, 11,  5,  6,  7], dtype=int64)

In [36]:
df.dtypes

user_id                           int64
region                            int64
city                              int64
parent_category_name              int64
category_name                     int64
param_1                           int64
param_2                           int64
param_3                           int64
price                           float64
item_seq_number                   int64
user_type                         int64
image_top_1                       int64
avg_days_up_user                float64
avg_times_up_user               float64
n_user_items                    float64
Weekday                           int64
desc_punc                         int64
description_num_words             int64
description_num_unique_words      int64
description_words_vs_unique     float64
description_num_letters           int64
description_num_alphabets         int64
description_num_alphanumeric      int64
description_num_digits            int64
title_num_words                   int64


## Save/load dataframes

In [8]:
categorical_features_pos = np.array([ 0,  1,  2,  3,  4, 10, 11,  5,  6,  7], dtype="i8")
categorical_features_pos

array([ 0,  1,  2,  3,  4, 10, 11,  5,  6,  7], dtype=int64)

In [40]:
df.to_pickle("catboost_final/catboost_df_clean.pkl")
#df = pd.read_pickle("catboost_final/catboost_df_clean.pkl")

In [3]:
## Save and load for train-test sets
from scipy import sparse

#X.to_pickle("catboost_final/catboost_X.pkl")
X = pd.read_pickle("catboost_final/catboost_X.pkl")

#test.to_pickle("catboost_final/catboost_test.pkl")
test = pd.read_pickle("catboost_final/catboost_test.pkl")

#y.to_pickle('catboost_final/y.pkl')    #to save the dataframe, df to 123.pkl
y = pd.read_pickle('catboost_final/y.pkl')

In [4]:
# Training and Validation Set
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=2018)

## Modeling

In [5]:
cb_model = CatBoostRegressor(iterations=1000,
                             learning_rate=0.05,
                             depth=12,
                             eval_metric='RMSE',
                             random_seed = 23,
                             bagging_temperature = 0.2, #The higher the temperature the more aggressive bagging is.
                             od_type='Iter',
                             od_wait=50,
                             metric_period = 50,
                             save_snapshot = True
                             )

print ("Catboost's paramters are set!")

Catboost's paramters are set!


In [6]:
Valid = True

In [None]:
%%time
if VALID == True:    
    cb_model.fit(X_train, 
                 y_train,
                 eval_set=(X_valid,y_valid),
                 cat_features=categorical_features_pos,
                 use_best_model=True,
                 verbose=True)

else:
    cb_model = cb_model.fit(X,
                            y,
                            cat_features=categorical_features_pos
                            )

In [13]:
from sklearn.externals import joblib
# save model
joblib.dump(cb_model, 'catboost_final/cb_model.pkl')
# load model
#gbm_pickle = joblib.load('lgb.pkl')

['catboost_final/cb_model.pkl']

## Feature Importance

In [None]:
# Feature Importance
fea_imp = pd.DataFrame({'imp': cb_model.feature_importances_, 'col': X.columns})
fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
_ = fea_imp.plot(kind='barh', x='col', y='imp', figsize=(20, 10))
#plt.savefig('catboost_feature_importance.png')   
#fea_imp
%matplotlib inline
fea_imp

## Model Evaluation Stage

In [81]:
print("Model Evaluation Stage")
print(cb_model.get_params())
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_valid, cb_model.predict(X_valid))))

Model Evaluation Stage
{'save_snapshot': True, 'bagging_temperature': 0.2, 'eval_metric': 'RMSE', 'metric_period': 1, 'random_seed': 23, 'od_type': 'Iter', 'od_wait': 50, 'loss_function': 'RMSE', 'depth': 12, 'learning_rate': 0.2, 'iterations': 60}
RMSE: 0.2265019207057708


In [82]:
catpred = cb_model.predict(test)

In [44]:
catsub = pd.DataFrame(catpred,columns=["deal_probability"],index=testdex)
catsub['deal_probability'].clip(0.0, 1.0, inplace=True)
catsub.to_csv("catsub.csv",index=True,header=True) # Between 0 and 1
print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))
print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))

Model Evaluation Stage


