In [1]:
# Catboost for Avito Demand Prediction Challenge
# https://www.kaggle.com/c/avito-demand-prediction
# By Nick Brooks, April 2018

import time
# Wall-clock timestamp (seconds since the epoch) taken when this first cell runs.
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Viz
import seaborn as sns
import matplotlib.pyplot as plt

import json
# Load run configuration (the cells below read the "train_csv" and "test_csv" keys).
# A context manager closes the file handle deterministically instead of leaving
# it to the garbage collector; JSON is read as UTF-8 regardless of the locale.
with open("config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

In [2]:
print("\nData Load Stage")

# Read both splits keyed by item_id, parsing the activation date as a datetime.
training = pd.read_csv(
    config["train_csv"],
    index_col="item_id",
    parse_dates=["activation_date"],
)
testing = pd.read_csv(
    config["test_csv"],
    index_col="item_id",
    parse_dates=["activation_date"],
)

# Remember which rows belong to which split so the combined frame can be
# separated again after feature engineering.
traindex, testdex = training.index, testing.index

# Pull the target out of the training features.
y = training["deal_probability"].copy()
training = training.drop(columns="deal_probability")

print('Train shape: {} Rows, {} Columns'.format(training.shape[0], training.shape[1]))
print('Test shape: {} Rows, {} Columns'.format(testing.shape[0], testing.shape[1]))

# Combine Train and Test so every transformation is applied to both at once.
df = pd.concat([training, testing], axis=0)
del training, testing
gc.collect()
print('\nAll Data shape: {} Rows, {} Columns'.format(df.shape[0], df.shape[1]))


Data Load Stage
Train shape: 1503424 Rows, 16 Columns
Test shape: 508438 Rows, 16 Columns

All Data shape: 2011862 Rows, 16 Columns


In [3]:
print("Feature Engineering")
# Log-transform price (the small offset keeps a price of 0 finite), then mark
# missing values with the -999 sentinel.
# The results are assigned back to the frame: calling fillna(..., inplace=True)
# on a column selection acts on an intermediate object and does not reliably
# update `df` (it is a no-op under pandas Copy-on-Write).
df["price"] = np.log(df["price"] + 0.001).fillna(-999)
df["image_top_1"] = df["image_top_1"].fillna(-999)

print("\nCreate Time Variables")
# Day of week of the activation date (Monday=0 ... Sunday=6).
df["Weekday"] = df['activation_date'].dt.weekday

Feature Engineering

Create Time Variables


In [4]:
print("\nEncode Variables")

# Columns treated as categorical features.
categorical = [
    "region",
    "city",
    "parent_category_name",
    "category_name",
    "user_type",
    "image_top_1",
    "param_1",
    "param_2",
    "param_3",
]

# Free-text columns.
text = [
    "description",
    "title",
]

print("Encoding :", categorical)


Encode Variables
Encoding : ['region', 'city', 'parent_category_name', 'category_name', 'user_type', 'image_top_1', 'param_1', 'param_2', 'param_3']


In [5]:
def extract_text_features_as_numeric(df, columns):
    """Normalise text columns and derive simple numeric features from them.

    For every column name in ``columns`` the column is replaced by its
    lower-cased string form (missing values become ``'na'``) and four columns
    are added:

    * ``<col>_num_chars``        - number of characters
    * ``<col>_num_words``        - number of whitespace-separated words
    * ``<col>_num_unique_words`` - number of distinct words
    * ``<col>_words_vs_unique``  - unique words as a percentage of all words
      (NaN when the text contains no words, since that is 0 / 0)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns. It is modified in place.
    columns : iterable of str
        Names of the text columns to process.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient re-assignment.
    """
    for col in columns:
        # Fill missing values BEFORE casting to str: once cast, a missing value
        # is the literal string "nan"/"None" and fillna no longer matches it.
        cleaned = df[col].fillna('NA').astype(str)
        # Lowercase all text so that capitalized words don't get treated differently.
        cleaned = cleaned.str.lower()
        # Split once and reuse the tokens for both word counts.
        tokens = cleaned.str.split()

        df[col] = cleaned
        df[col + '_num_chars'] = cleaned.str.len()  # Count number of characters
        df[col + '_num_words'] = tokens.apply(len)  # Count number of words
        df[col + '_num_unique_words'] = tokens.apply(lambda words: len(set(words)))  # Count unique words
        # Share of unique words, in percent.
        df[col + '_words_vs_unique'] = df[col + '_num_unique_words'] / df[col + '_num_words'] * 100

    return df

In [6]:
# Lowercase each column listed in `text` and add its *_num_chars, *_num_words,
# *_num_unique_words and *_words_vs_unique feature columns.
df = extract_text_features_as_numeric(df, text)

In [7]:
# Remove text Variables: the raw text columns themselves are dropped here.
df = df.drop(columns=text)
# Remove Dead Variables: columns that are not passed on to the model.
df = df.drop(columns=["activation_date", "image", "user_id"])

In [8]:
# Encoder: map every categorical value to an integer bucket in [0, max_size).
max_size = 150000
lbl = preprocessing.LabelEncoder()  # not used by the hashing encoder below
for col in categorical:
    # Python's builtin hash() is salted per process for strings (PYTHONHASHSEED),
    # so it yields different codes on every kernel restart, and it does not map
    # missing values to a single code. pandas' hash uses a fixed key, so the
    # encoding is reproducible and all missing values share one bucket.
    hashed = pd.util.hash_pandas_object(df[col], index=False)
    # Buckets are < max_size, so the cast from uint64 to int64 is lossless.
    df[col] = (hashed % max_size).astype("int64").to_numpy()

In [9]:
print("\nCatboost Modeling Stage")

# Separate the combined frame back into its two splits using the saved indexes.
X = df.loc[traindex].copy()
print("Training Set shape", X.shape)
test = df.loc[testdex].copy()
print("Submission Set Shape: {} Rows, {} Columns".format(test.shape[0], test.shape[1]))

# The combined frame is no longer needed; release it.
del df
gc.collect()


Catboost Modeling Stage
Training Set shape (1503424, 20)
Submission Set Shape: 508438 Rows, 20 Columns


146

In [10]:
# Training and Validation Set: hold out 10% of the rows, with a fixed seed so
# the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=23,
)

In [11]:
# Prepare Categorical Variables
def column_index(df, query_cols):
    """Return the positional indices of ``query_cols`` within ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column positions are looked up.
    query_cols : iterable of column labels
        Column names to locate, in the order the positions should be returned.

    Returns
    -------
    numpy.ndarray
        Integer positions, one per entry of ``query_cols``, in the same order.
        If a label occurs more than once in ``df.columns`` the first position
        is returned.

    Raises
    ------
    KeyError
        If any requested name is not a column of ``df``. (A sorted-search
        lookup would instead return the position of a neighbouring column or
        fail with an unrelated IndexError.)
    """
    query_cols = list(query_cols)

    # Label -> first position.
    positions = {}
    for pos, label in enumerate(df.columns):
        positions.setdefault(label, pos)

    missing = [label for label in query_cols if label not in positions]
    if missing:
        raise KeyError("Columns not found in DataFrame: {}".format(missing))

    return np.array([positions[label] for label in query_cols], dtype=np.intp)
# Integer column positions of the categorical features within X; handed to
# CatBoost as `cat_features` when the model is fitted.
categorical_features_pos = column_index(X,categorical)

In [12]:
# Train Model
print("Train CatBoost Decision Tree")
modelstart = time.time()

# loss_function and random_seed are deliberately not passed here.
cb_model = CatBoostRegressor(
    # Boosting setup
    iterations=900,
    learning_rate=0.08,
    depth=10,
    # Evaluation / logging
    eval_metric='RMSE',
    metric_period=50,
    # Overfitting detector
    od_type='Iter',
    od_wait=20,
)

Train CatBoost Decision Tree


In [13]:
# Fit on the training split and evaluate on the held-out validation split;
# use_best_model=True is passed together with the eval_set.
cb_model.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    cat_features=categorical_features_pos,
    use_best_model=True,
    verbose=True,
)

0:	learn: 0.2867581	test: 0.2879154	best: 0.2879154 (0)	total: 2.37s	remaining: 35m 32s


KeyboardInterrupt: 