# Lightgbm Implementation Avito w/ target encoding

By: Traci

This notebook builds off several public kernels on kaggle. 

Instead of using label encoding of categorical features, I used target encoding (aka mean encoding).

Data can be downloaded from https://www.kaggle.com/c/avito-demand-prediction.

More information can be found in readme.

In [1]:
#Initially forked from Bojan's kernel here: https://www.kaggle.com/tunguz/bow-meta-text-and-dense-features-lb-0-2242/code
#improvement using kernel from Nick Brook's kernel here: https://www.kaggle.com/nicapotato/bow-meta-text-and-dense-features-lgbm
#Used oof method from Faron's kernel here: https://www.kaggle.com/mmueller/stacking-starter?scriptVersionId=390867
#Used some text cleaning method from Muhammad Alfiansyah's kernel here: https://www.kaggle.com/muhammadalfiansyah/push-the-lgbm-v19
#Forked From - https://www.kaggle.com/him4318/avito-lightgbm-with-ridge-feature-v-2-0

import time
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import random
random.seed(2018)
print("Data:\n",os.listdir("data"))

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold

# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

NFOLDS = 5
SEED = 2018
VALID = True
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None, seed_bool = True):
        if(seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
        
def get_oof(clf, x_train, y, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
    
def cleanName(text):
    try:
        textProc = text.lower()
        # textProc = " ".join(map(str.strip, re.split('(\d+)',textProc)))
        #regex = re.compile(u'[^[:alpha:]]')
        #textProc = regex.sub(" ", textProc)
        textProc = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', textProc)
        textProc = " ".join(textProc.split())
        return textProc
    except: 
        return "name error"
    
    
def rmse(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power((y - y0), 2)))

Data:
 ['aggregated_features.csv', 'aggregated_features_v3.csv', 'aggregated_features_v5.csv', 'periods_test.csv', 'periods_train.csv', 'target_encoded.csv', 'test.csv', 'train.csv']




In [11]:
testing = pd.read_csv('data/test.csv', index_col = "item_id", parse_dates = ["activation_date"])#.sample(1000)
testdex = testing.index

In [60]:
print("\nData Load Stage")
training = pd.read_csv('data/train.csv', index_col = "item_id", parse_dates = ["activation_date"])#.sample(1000)
traindex = training.index
testing = pd.read_csv('data/test.csv', index_col = "item_id", parse_dates = ["activation_date"])#.sample(1000)
testdex = testing.index

ntrain = training.shape[0]
ntest = testing.shape[0]

kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

y = training.deal_probability.copy()
#training.drop("deal_probability",axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))


Data Load Stage
Train shape: 1503424 Rows, 17 Columns
Test shape: 508438 Rows, 16 Columns


In [61]:
print("Combine Train and Test")
df = pd.concat([training,testing],axis=0)
#del training, testing
gc.collect()
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

Combine Train and Test


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  



All Data shape: 2011862 Rows, 17 Columns


## Add in aggregated features

In [62]:
df = df.reset_index()
gp = pd.read_csv('data/aggregated_features.csv') 
df = df.merge(gp, on='user_id', how='left')
del gp
gc.collect()
#df = df.set_index('item_id')

42

In [63]:
print("Feature Engineering")
df["price"] = np.log(df["price"]+0.001)
df["price"].fillna(df.price.mean(),inplace=True)
df["image_top_1"].fillna(-999,inplace=True)

df["avg_days_up_user"] = np.log(df["avg_days_up_user"]+0.001)
df["avg_days_up_user"].fillna(-999,inplace=True)
df["avg_times_up_user"] = np.log(df["avg_times_up_user"]+0.001)
df["avg_times_up_user"].fillna(-999,inplace=True)
df["n_user_items"] = np.log(df["n_user_items"]+0.001)
df["n_user_items"].fillna(-999,inplace=True)

Feature Engineering


  
  
  


In [64]:
print("\nCreate Time Variables")
df["Weekday"] = df['activation_date'].dt.weekday


Create Time Variables


In [65]:
df.drop(["activation_date","image"],axis=1,inplace=True)

## Text feature engineering

In [66]:
print("\nText Features")

# Feature Engineering 

# Meta Text Features
textfeats = ["description", "title"]
df['desc_punc'] = df['description'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

df['title'] = df['title'].apply(lambda x: cleanName(x))
df["description"]   = df["description"].apply(lambda x: cleanName(x))

for cols in textfeats:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('missing') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
    #df[cols + '_num_char'] = df[cols].apply(lambda comment: len(str(comment)))
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100 # Count Unique Words
    df[cols + '_num_letters'] = df[cols].apply(lambda comment: len(comment)) # Count number of Letters
    df[cols + '_num_alphabets'] = df[cols].apply(lambda comment: (comment.count(r'[a-zA-Z]'))) # Count number of Alphabets
    df[cols + '_num_alphanumeric'] = df[cols].apply(lambda comment: (comment.count(r'[A-Za-z0-9]'))) # Count number of AlphaNumeric
    df[cols + '_num_digits'] = df[cols].apply(lambda comment: (comment.count('[0-9]'))) # Count number of Digits
    
# Extra Feature Engineering
df['avg_len_words_title'] = df['title_num_letters'] / df['title_num_words']
df['avg_len_words_desc'] = df['description_num_letters'] / df['description_num_words']
df['title_desc_len_ratio'] = df['title_num_letters']/df['description_num_letters']


Text Features


In [67]:
df.head()

Unnamed: 0,item_id,category_name,city,deal_probability,description,image_top_1,item_seq_number,param_1,param_2,param_3,...,title_num_words,title_num_unique_words,title_words_vs_unique,title_num_letters,title_num_alphabets,title_num_alphanumeric,title_num_digits,avg_len_words_title,avg_len_words_desc,title_desc_len_ratio
0,b912c3c6a6ad,Товары для детей и игрушки,Екатеринбург,0.12789,"кокон для сна малыша,пользовались меньше месяц...",1008.0,2,Постельные принадлежности,,,...,3,3,100.0,21,0,0,0,7.0,8.285714,0.362069
1,2dac0150717d,Мебель и интерьер,Самара,0.0,"стойка для одежды, под вешалки. с бутика.",692.0,19,Другое,,,...,3,3,100.0,17,0,0,0,5.666667,5.857143,0.414634
2,ba83aefab5dc,Аудио и видео,Ростов-на-Дону,0.43177,"в хорошем состоянии, домашний кинотеатр с blu ...",3032.0,9,"Видео, DVD и Blu-ray плееры",,,...,2,2,100.0,14,0,0,0,7.0,5.823529,0.141414
3,02996f1dd2ea,Товары для детей и игрушки,Набережные Челны,0.80323,продам кресло от0-25кг,796.0,286,Автомобильные кресла,,,...,1,1,100.0,10,0,0,0,10.0,7.333333,0.454545
4,7c90be56d2ab,Автомобили,Волгоград,0.20797,все вопросы по телефону.,2264.0,3,С пробегом,ВАЗ (LADA),2110.0,...,3,3,100.0,14,0,0,0,4.666667,6.0,0.583333


## Target Encoding

In [68]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from itertools import product

class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5, target_type='regression', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode

        :param n_splits: the number of splits used in mean encoding

        :param target_type: str, 'regression' or 'classification'

        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """

        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}

        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None

        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))

    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()

        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()

        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)

        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values

        return nf_train, nf_test, prior, col_avg_y

    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)

        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new

    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()

        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits

        return X_new

In [69]:
cat_vars_orig = [ 'user_id','region', 'city', 'parent_category_name', 'category_name', 
            'user_type', 'image_top_1', 'param_1', 'param_2', 'param_3'
           ]

training = training.reset_index()
testing = testing.reset_index()

mean_encoder = MeanEncoder(categorical_features=cat_vars_orig, prior_weight_func={'k':5, 'f':1})

In [70]:
%%time
mean_encoded_train = mean_encoder.fit_transform(training, training['deal_probability'])

mean_encoded_test = mean_encoder.transform(testing)

is deprecated and will be removed in a future version


Wall time: 1min 53s


In [71]:
# Put target encoded features into main df
mean_coded_vars = list(set(mean_encoded_train.columns) - set(training.columns))

mean_coded_vars.append('item_id')
df = pd.merge(df, 
                     pd.concat([mean_encoded_train[mean_coded_vars], mean_encoded_test[mean_coded_vars]]),
                     how='left',
                     on='item_id'
                    )

In [72]:
df = df.set_index('item_id')

In [73]:
df.head()

Unnamed: 0_level_0,category_name,city,deal_probability,description,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,...,user_type_pred,user_id_pred,city_pred,parent_category_name_pred,region_pred,param_2_pred,image_top_1_pred,category_name_pred,param_3_pred,param_1_pred
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b912c3c6a6ad,Товары для детей и игрушки,Екатеринбург,0.12789,"кокон для сна малыша,пользовались меньше месяц...",1008.0,2,Постельные принадлежности,,,Личные вещи,...,0.149507,0.139056,0.123217,0.075798,0.121627,0.139056,0.082193,0.197845,0.139056,0.087689
2dac0150717d,Мебель и интерьер,Самара,0.0,"стойка для одежды, под вешалки. с бутика.",692.0,19,Другое,,,Для дома и дачи,...,0.149507,0.139056,0.140488,0.179256,0.137626,0.139056,0.171453,0.19164,0.139056,0.12602
ba83aefab5dc,Аудио и видео,Ростов-на-Дону,0.43177,"в хорошем состоянии, домашний кинотеатр с blu ...",3032.0,9,"Видео, DVD и Blu-ray плееры",,,Бытовая электроника,...,0.149507,0.140771,0.126286,0.175539,0.136941,0.139056,0.204237,0.17399,0.139056,0.124202
02996f1dd2ea,Товары для детей и игрушки,Набережные Челны,0.80323,продам кресло от0-25кг,796.0,286,Автомобильные кресла,,,Личные вещи,...,0.124325,0.139056,0.13583,0.075798,0.142291,0.139056,0.337333,0.197845,0.139056,0.33213
7c90be56d2ab,Автомобили,Волгоград,0.20797,все вопросы по телефону.,2264.0,3,С пробегом,ВАЗ (LADA),2110.0,Транспорт,...,0.149507,0.139056,0.136934,0.262835,0.145426,0.363538,0.319736,0.277869,0.374056,0.282412


In [74]:
del training, testing
gc.collect()

661

## Dropping original cat features

In [75]:
print("\nEncode Variables")
categorical = ["user_id","region","city","parent_category_name","category_name","user_type","image_top_1","param_1","param_2","param_3"]
print("Encoding :",categorical)

# # Encoder:
# lbl = preprocessing.LabelEncoder()
# for col in categorical:
#     df[col].fillna('Unknown')
#     df[col] = lbl.fit_transform(df[col].astype(str))
    


Encode Variables
Encoding : ['user_id', 'region', 'city', 'parent_category_name', 'category_name', 'user_type', 'image_top_1', 'param_1', 'param_2', 'param_3']


In [76]:
df.drop(categorical, axis=1, inplace=True)

##  Term Frequency Inverse Document Frequency Stage

In [77]:
print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
russian_stop = set(stopwords.words('russian'))

tfidf_para = {
    "stop_words": russian_stop,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    #"min_df":5,
    #"max_df":.9,
    "smooth_idf":False
}


def get_col(col_name): return lambda x: x[col_name]
##I added to the max_features of the description. It did not change my score much but it may be worth investigating
vectorizer = FeatureUnion([
        ('description',TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=17000,
            **tfidf_para,
            preprocessor=get_col('description'))),
        ('title',CountVectorizer(
            ngram_range=(1, 2),
            stop_words = russian_stop,
            #max_features=7000,
            preprocessor=get_col('title')))
    ])
    
start_vect=time.time()

#Fit my vectorizer on the entire dataset instead of the training rows
#Score improved by .0001
vectorizer.fit(df.to_dict('records'))

ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))


[TF-IDF] Term Frequency Inverse Document Frequency Stage
Vectorization Runtime: 12.82 Minutes


In [18]:
# Save/load sparse matrix contructed with tdidf

#sparse.save_npz("avito_ridge_final/ready_df.npz", ready_df)
ready_df = sparse.load_npz("avito_ridge_final/ready_df.npz")

In [78]:
# Drop Text Cols
textfeats = ["description", "title"]
df.drop(textfeats, axis=1,inplace=True)

In [79]:
ready_df

<2011862x1430760 sparse matrix of type '<class 'numpy.float64'>'
	with 48687245 stored elements in Compressed Sparse Row format>

## Ridge 

In [81]:
from sklearn.metrics import mean_squared_error
from math import sqrt

ridge_params = {'alpha':30.0, 'fit_intercept':True, 'normalize':False, 'copy_X':True,
                'max_iter':None, 'tol':0.001, 'solver':'auto', 'random_state':SEED}

#Ridge oof method from Faron's kernel
#I was using this to analyze my vectorization, but figured it would be interesting to add the results back into the dataset
#It doesn't really add much to the score, but it does help lightgbm converge faster
ridge = SklearnWrapper(clf=Ridge, seed = SEED, params = ridge_params)
ridge_oof_train, ridge_oof_test = get_oof(ridge, ready_df[:ntrain], y, ready_df[ntrain:])


Fold 0

Fold 1

Fold 2

Fold 3

Fold 4


In [82]:
rms = sqrt(mean_squared_error(y, ridge_oof_train))
print('Ridge OOF RMSE: {}'.format(rms))

Ridge OOF RMSE: 0.23033890453576603


In [83]:
print("Modeling Stage")

ridge_preds = np.concatenate([ridge_oof_train, ridge_oof_test])

df['ridge_preds'] = ridge_preds

Modeling Stage


In [84]:
df.drop('deal_probability', axis=1, inplace=True)

In [85]:
df.dtypes

item_seq_number                   int64
price                           float64
avg_days_up_user                float64
avg_times_up_user               float64
n_user_items                    float64
Weekday                           int64
desc_punc                         int64
description_num_words             int64
description_num_unique_words      int64
description_words_vs_unique     float64
description_num_letters           int64
description_num_alphabets         int64
description_num_alphanumeric      int64
description_num_digits            int64
title_num_words                   int64
title_num_unique_words            int64
title_words_vs_unique           float64
title_num_letters                 int64
title_num_alphabets               int64
title_num_alphanumeric            int64
title_num_digits                  int64
avg_len_words_title             float64
avg_len_words_desc              float64
title_desc_len_ratio            float64
user_type_pred                  float64


## Build Matrices

In [86]:
# Combine Dense Features with Sparse Text Bag of Words Features
X = hstack([csr_matrix(df.loc[traindex,:].values),ready_df[0:traindex.shape[0]]]) # Sparse Matrix
testing = hstack([csr_matrix(df.loc[testdex,:].values),ready_df[traindex.shape[0]:]])
tfvocab = df.columns.tolist() + tfvocab
for shape in [X,testing]:
    print("{} Rows and {} Cols".format(*shape.shape))
print("Feature Names Length: ",len(tfvocab))
#del df
gc.collect();

1503424 Rows and 1430795 Cols
508438 Rows and 1430795 Cols
Feature Names Length:  1430795


## Save/load matrices

In [2]:
## Save and load for train-test sets
from scipy import sparse

#sparse.save_npz("avito_ridge_final/targetenc_X.npz", X)
X = sparse.load_npz("avito_ridge_final/targetenc_X.npz")
#sparse.save_npz("avito_ridge_final/targetenc_testing.npz", testing)
testing = sparse.load_npz("avito_ridge_final/targetenc_testing.npz")

#y.to_pickle('avito_ridge_final/targetenc_y.pkl')    #to save the dataframe, df to 123.pkl
y = pd.read_pickle('avito_ridge_final/y.pkl')

In [3]:
import pickle
#with open("avito_ridge_final/targetenc_tfvocab.txt", "wb") as fp:   #Pickling 
#    pickle.dump(tfvocab, fp)

with open("avito_ridge_final/targetenc_tfvocab.txt", "rb") as fp:   # Unpickling
    tfvocab = pickle.load(fp)

In [4]:
del tfvocab
gc.collect()

128

## Modeling

In [5]:
VALID = True

In [6]:
print("\nModeling Stage")

# del ridge_preds,vectorizer,ready_df
# gc.collect();
    
print("Light Gradient Boosting Regressor")
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    # 'max_depth': 15,
    'num_leaves': 270,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'learning_rate': 0.016,
    'verbose': 0
}  


Modeling Stage
Light Gradient Boosting Regressor


In [7]:
%%time
if VALID == True:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.10, random_state=2018)
        
    # LGBM Dataset Formatting 
    lgtrain = lgb.Dataset(X_train, y_train#,
                    #feature_name=tfvocab,
                    #categorical_feature = categorical
                         )
    lgvalid = lgb.Dataset(X_valid, y_valid#,
                    #feature_name=tfvocab,
                    #categorical_feature = categorical
                         )
    #del X, X_train; gc.collect()
    
    # Go Go Go
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=20000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=50,
        verbose_eval=100
    )
#     print("Model Evaluation Stage")
#     print('RMSE:', np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid))))
    #del X_valid ; gc.collect()

else:
    # LGBM Dataset Formatting 
    lgtrain = lgb.Dataset(X, y,
#                     feature_name=tfvocab,
#                     categorical_feature = categorical
                         )
    #del X; gc.collect()
    # Go Go Go
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=1537,
        verbose_eval=100
    )

Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.222813	valid's rmse: 0.224003
[200]	train's rmse: 0.21726	valid's rmse: 0.21958
[300]	train's rmse: 0.214703	valid's rmse: 0.218283
[400]	train's rmse: 0.212768	valid's rmse: 0.217615
[500]	train's rmse: 0.211104	valid's rmse: 0.217161
[600]	train's rmse: 0.209709	valid's rmse: 0.216897
[700]	train's rmse: 0.208434	valid's rmse: 0.216722
[800]	train's rmse: 0.207273	valid's rmse: 0.216602
[900]	train's rmse: 0.206175	valid's rmse: 0.216511
[1000]	train's rmse: 0.20515	valid's rmse: 0.216462
[1100]	train's rmse: 0.204192	valid's rmse: 0.21643
[1200]	train's rmse: 0.203236	valid's rmse: 0.216405
[1300]	train's rmse: 0.202345	valid's rmse: 0.216384
[1400]	train's rmse: 0.201436	valid's rmse: 0.216367
[1500]	train's rmse: 0.200568	valid's rmse: 0.21635
Early stopping, best iteration is:
[1537]	train's rmse: 0.200251	valid's rmse: 0.216342
Wall time: 4h 10min 28s


In [8]:
from sklearn.externals import joblib
# save model
joblib.dump(lgb_clf, 'avito_ridge_final/lgb_clf_targetenc.pkl')
# load model
#gbm_pickle = joblib.load('lgb.pkl')

['avito_ridge_final/lgb_clf_targetenc.pkl']

In [35]:
lgb_clf

<lightgbm.basic.Booster at 0x1d6f559b080>

In [None]:
# # Feature Importance Plot
# f, ax = plt.subplots(figsize=[7,10])
# lgb.plot_importance(lgb_clf, max_num_features=100, ax=ax)
# plt.title("Light GBM Feature Importance")
# plt.savefig('feature_import.png')
# f

In [9]:
print("Model Evaluation Stage")
lgpred = lgb_clf.predict(testing) 

Model Evaluation Stage




In [None]:
#Mixing lightgbm with ridge. I haven't really tested if this improves the score or not
#blend = 0.95*lgpred + 0.05*ridge_oof_test[:,0]

In [12]:
lgsub = pd.DataFrame(lgpred,columns=["deal_probability"],index=testdex)
lgsub = lgsub['deal_probability'].clip(0.0, 1.0)
lgsub.to_csv("final_ridge_targetenc-216342-.csv",index=True,header=True)

In [None]:
#print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))
print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))