In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import pipeline,preprocessing,feature_extraction,metrics

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
sw = stopwords.words('russian')

from sklearn.base import BaseEstimator 
from sklearn.base import TransformerMixin

from sklearn.pipeline  import FeatureUnion
from sklearn.pipeline  import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer as TfIdf

from sklearn.model_selection import ShuffleSplit
import xgboost as xgb
import string

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_df = pd.read_csv("~/Kaggle/avito/train.csv")
test_df = pd.read_csv("~/Kaggle/avito/test.csv")

In [79]:
count = lambda l1,l2: sum([1 for x in l1 if x in l2])

def count_punctuation(aStr):
    return count(aStr,set(string.punctuation))


def reduce_category(seriesOld, min_percentage=0.03, new_cat='other'):
    series = seriesOld.copy()
    value_counts = series.value_counts()
    count = series.count()
    min_count = count * min_percentage

    select = series.apply(lambda val: not pd.isnull(val) and value_counts.get(val) < min_count)
    series.loc[select] = new_cat
    return series

def preprocess(dfOld: pd.DataFrame) -> pd.DataFrame:
    df = dfOld.copy()
    
    #df['description'] = dfOld['description'].replace(r"[.,\/#!$%\^&\*;:{}=\-_`~()]", " ")
    #df['city'] = dfOld['city'].replace(r"[.,\/#!$%\^&\*;:{}=\-_`~()]", " ")
    
     # isna feature
    #print(pd.isna(dfOld['param_3']))
    df['param_3_na'] = pd.isna(dfOld['param_3']).astype(np.float16)
    df['param_2_na'] = pd.isna(dfOld['param_2']).astype(np.float16)
    df['description_na'] = pd.isna(dfOld['description']).astype(np.float16)
    df['image_na'] = pd.isna(dfOld['image']).astype(np.float16)
    
    # description and title statistics
    df['description_len'] = dfOld['description'].fillna('').map(lambda x: len(str(x))).astype(np.float16) #Length
    df['description_wc'] = dfOld['description'].fillna('').map(lambda x: len(str(x).split(' '))).astype(np.float16) #Word Count
    df['description_punc'] = dfOld['description'].fillna('').map(lambda x: count_punctuation(x)).astype(np.float16) #punctuation Count
    df['description_sw'] = dfOld['description'].fillna('').map(lambda x: str(x).split(' ')).map(lambda aStr: count(aStr,set(sw))).astype(np.float16) # stopwords
    
    df['title_len'] = dfOld['title'].fillna('').map(lambda x: len(str(x))).astype(np.float16) #Lenth
    df['title_wc'] = dfOld['title'].fillna('').map(lambda x: len(str(x).split(' '))).astype(np.float16) #Word Count
    df['title_punc'] = dfOld['title'].fillna('').map(lambda x: count_punctuation(x)).astype(np.float16) #punctuation Count
    
    # description feature for tdidf
    df['description'] = (dfOld['title'].fillna('') + ' ' + dfOld['description'].fillna(''))
    df['description'] = df['description'].str.lower().replace(r"[^[:alpha:]]", " ")
    df['description'] = df['description'].str.replace(r"\\s+", " ")
    
    #reduce category
    df['param_3'] = reduce_category(df['param_3'], min_percentage=0.00005, new_cat='other_param')
    df['city'] = reduce_category(dfOld['city'], min_percentage=0.0003, new_cat='other_city')
    df['user_id'] = reduce_category(dfOld['user_id'], min_percentage=0.000025, new_cat='other_user')
    
    df['image'] = df['image'].fillna('').map(lambda x: 1 if len(str(x))>0 else 0)
    df['price'] = np.log1p(df['price'].fillna(0))
    df['image_top_1'] = np.log1p(df['image_top_1'].fillna(0))
    
    # extract activation date
    df['activation_date_wday'] = pd.to_datetime(df['activation_date']).dt.dayofweek
    df['activation_date_day'] = pd.to_datetime(df['activation_date']).dt.day
    df['activation_date_week'] = pd.to_datetime(df['activation_date']).dt.week
    
    #ex_col = ['item_id', 'user_id', 'deal_probability', 'title', 'param_1', 'param_2', 'param_3', 'activation_date', 'city']
    ex_col = ['item_id', 'deal_probability', 'title', 'activation_date']
    col = [c for c in df.columns if c not in ex_col]
    return df[col]

class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        #assert isinstance(X, pd.DataFrame)
        return X[self.key]
    
class FeatureInfo(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        #print('FeatureInfo ' + str(type(X)))
        return self
    def transform(self, X):
        print('FeatureInfo ' + str(X.shape) + str(type(X)))
        return X
    
class ArrayCaster(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, data):
        #print(data.shape)
        #print(np.transpose(np.matrix(data)).shape)
        return np.transpose(np.matrix(data))
    
class NaFiller(BaseEstimator, TransformerMixin):
    def __init__(self, filler):
        self.filler = filler
        
    def fit(self, x, y=None):
        return self

    def transform(self, data):
        return pd.Series(data).fillna(-1)

class MyLabelEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        unique = pd.Series(X).unique()
        self.unique_values  = unique[pd.notnull(unique)].tolist()
        #print(self.unique_values)
        return self
    def transform(self, dataOld):
        data = pd.Series(dataOld)
        mapper = lambda datum: self.unique_values.index(datum) if datum in self.unique_values else -1
        return data.map(mapper)
    
class LabelEncoders(BaseEstimator, TransformerMixin):
    def __init__(self, id):
        self.mappings = {}
        self.id = id
    def fit(self, X, y=None):
        name = X.name
        #print("MyLabelEncoders " + name + self.id)
        if (name not in self.mappings):
            self.mappings[name] = MyLabelEncoder()
        self.mappings[name].fit(X)
        return self
    def transform(self, X):
        result = self.mappings[X.name].transform(X)
        #print("LabelEncoders" + str(result.shape))
        return result
    
def on_field(key:str, *transformers) -> Pipeline:
    additional_steps = [ ("step" + str(ii), transformers[ii]) for ii in range(len(transformers)) ]
    steps =  [('selection', FeatureSelector(key=key)) ] + additional_steps
    print(steps)
    return Pipeline(steps)

def on_multiple_fields(keys, *transformers) -> Pipeline:
    return FeatureUnion(
        transformer_list=[
            ("pipeline" + str(keys[ii]), on_field(keys[ii], *transformers)) for ii in range(len(keys))
        ]
    )

<h1>Big data experiment</h1>

In [80]:


# shuffle
def split_train_valid_dataset(old_train_df, dealProb):
    ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=37)
    train_index, test_index = next(ss.split(train_df))

    train_X = old_train_df.iloc[train_index]
    train_Y = dealProb[train_index]

    valid_X = old_train_df.iloc[test_index]
    valid_Y = dealProb[test_index]
    
    return (train_X, train_Y, valid_X, valid_Y)


train_df_prep = preprocess(train_df)
deal_prob = train_df['deal_probability']
(train_X, train_Y, valid_X, valid_Y) = split_train_valid_dataset(train_df_prep, deal_prob)
#train_X_prep = preprocess(train_X)
#train_Y_prep = preprocess(train_Y)

In [81]:
train_X.iloc[1]

user_id                                                      d9b494f97062
region                                                Воронежская область
city                                                              Воронеж
parent_category_name                                          Личные вещи
category_name                                  Товары для детей и игрушки
param_1                                                           Игрушки
param_2                                                               NaN
param_3                                                               NaN
description             пинки пай my little pony пинки пай ждёт любящу...
price                                                             7.31389
item_seq_number                                                       309
user_type                                                         Company
image                                                                   1
image_top_1                           

In [82]:
vectorizer = pipeline.make_pipeline(
                pipeline.make_union(
                    on_field('description', TfIdf(max_features=3500, stop_words=sw, token_pattern='\w+', norm='l2', min_df=3, max_df=0.3, sublinear_tf=True, ngram_range=(1, 3))),
                    on_multiple_fields(['region', 'parent_category_name', 'category_name', 'user_type', 'param_3', 'param_2', 'param_1', 'city', 'user_id'], LabelEncoders(id="same"), ArrayCaster()),
                    on_multiple_fields(['item_seq_number', 'param_3_na', 'param_2_na', 'description_na', 'image_na', 'description_len', 'description_wc', 'description_punc', 'description_sw', 'title_len',  'title_wc', 'title_punc', 'price', 'image_top_1', 'activation_date_wday', 'activation_date_day', 'activation_date_week'], NaFiller(-1), ArrayCaster())
                ),
                #RandomForestRegressor()
)

vectorizer.fit(train_X)
train_X_transform = vectorizer.transform(train_X)
valid_X_transform = vectorizer.transform(valid_X)


[('selection', FeatureSelector(key='description')), ('step0', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=3500, min_df=3,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', '...гда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между'],
        strip_accents=None, sublinear_tf=True, token_pattern='\\w+',
        tokenizer=None, use_idf=True, vocabulary=None))]
[('selection', FeatureSelector(key='region')), ('step0', LabelEncoders(id='same')), ('step1', ArrayCaster())]
[('sele

<h2>Test with randomforestregressor</h2>

<h2>Test with xgboost </h2>

In [83]:
from xgboost import XGBRegressor
import xgboost

train_dmatrix = xgboost.DMatrix(data=train_X_transform, label=train_Y)
valid_dmatrix = xgboost.DMatrix(data=valid_X_transform, label=valid_Y)


params = {
    'objective': 'reg:logistic',
    'booster': 'gbtree',
    'eval': 'rmse',
    'eta': 0.05,
    'max_depth': 18,
    'min_child_weight': 11,
    'gamma': 0,
    'subsample': 0.8,
    'alpha': 0,
    'lambda': 2,
    'nrounds': 8000,
    'colsample_bytree': 0.7
}

#xgbclf = xgboost.train(params, train_dmatrix, early_stopping_rounds=50)
xgbclf = XGBRegressor(n_jobs=16, 
                      silent=False,
                      seed=37,
                      **params
                     )
xgbclf.fit(train_X_transform, train_Y)

XGBRegressor(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, eta=0.05, eval='rmse', gamma=0, lambda=2,
       learning_rate=0.1, max_delta_step=0, max_depth=18,
       min_child_weight=11, missing=None, n_estimators=100, n_jobs=16,
       nrounds=8000, nthread=None, objective='reg:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=37, silent=False, subsample=0.8)

In [84]:
from sklearn.metrics import mean_squared_error as mse

def count_mse(xgbclf, valid_X, valid_Y):
    valid_X_pred = xgbclf.predict(valid_X)
    error = mse(valid_Y, valid_X_pred)
    print(error)
    return error
count_mse(xgbclf, valid_X_transform, valid_Y)

0.20119690964


0.20119690964

<h2>Test with xgboost </h2>


In [36]:
import scipy.sparse as sp

xgbclf2 = XGBRegressor(n_jobs=16, silent=False)
all_X =  sp.vstack((train_X_t, test_X_t))
all_Y = sp.hstack((train_Y, test_Y)).T
all_X.shape

(1503424, 7015)

In [37]:
print(all_Y.shape)
xgbclf2.fit(all_X, all_Y)

(1503424, 1)


MemoryError: 

In [38]:
test_df_prep = preprocess(test_df)
test_df_prep_t = vectorizer.transform(test_df_prep)
test_df_prep_t_pred = xgbclf.predict(test_df_prep_t)

df_submission = pd.DataFrame()
df_submission['item_id'] = test_df['item_id']
df_submission['deal_probability'] = pd.Series(test_df_prep_t_pred)

df_submission.to_csv('submission.csv', index=False, index_label=False)


FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
LabelEncoders(508438,)
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
LabelEncoders(508438,)
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
LabelEncoders(508438,)
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
LabelEncoders(508438,)
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Series'>
FeatureInfo (508438,)<class 'pandas.core.series.Serie

In [41]:
test_df_prep_t_pred_series = pd.Series(test_df_prep_t_pred)
minn = test_df_prep_t_pred_series.min()
maxx = test_df_prep_t_pred_series.max()

In [42]:
test_df_prep_t_pred_series=test_df_prep_t_pred_series.apply(lambda x: (x-minn)/(maxx-minn))

In [43]:
test_df_prep_t_pred_series.describe()

count    508438.000000
mean          0.258402
std           0.136093
min           0.000000
25%           0.140774
50%           0.240963
75%           0.323774
max           1.000000
dtype: float64

In [44]:
df_submission = pd.DataFrame()
df_submission['item_id'] = test_df['item_id']
df_submission['deal_probability'] = test_df_prep_t_pred_series

df_submission.to_csv('submission.csv', index=False, index_label=False)


In [45]:
test_df_prep_t_pred_series.describe()

count    508438.000000
mean          0.258402
std           0.136093
min           0.000000
25%           0.140774
50%           0.240963
75%           0.323774
max           1.000000
dtype: float64

In [46]:
pd.Series(test_df_prep_t_pred).describe()

count    508438.000000
mean          0.146537
std           0.095903
min          -0.035600
25%           0.063625
50%           0.134243
75%           0.192612
max           0.669253
dtype: float64