In [11]:
import pandas as pd
import sklearn
import numpy as np
import xgboost
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from functools import reduce
import random
from rake_nltk import Rake

In [12]:
test_df = pd.read_csv('to_predict.csv')
train_df = pd.read_csv('all_data.csv')

drop = ['color', 'colour', 'lining', 'lining_material',
             'main_colour', 'model', 'n_sold', 
             'platform_height', 'shoe_size', 'shoe_width', 
             'size', 'vintage', 'year_of_manufacture']

### Feature Engineering


In [13]:
# converting location column to country only
def extract_country(row):
    if str(row.location) != 'other':
        row.location = row.location.split(",")[-1]
    return row

train_df = train_df.applymap(lambda x: x.lower() if type(x) == str else x)
test_df = test_df.applymap(lambda x: x.lower() if type(x) == str else x)

train_df.location = train_df.location.fillna('other')
train_df = train_df.apply(extract_country, axis=1)

test_df.location = test_df.location.fillna('other')
test_df = test_df.apply(extract_country, axis=1)

train_df.brand = train_df.brand.fillna('unbranded')
test_df.brand = test_df.brand.fillna('unbranded')

# chars = ['(',')','[',']','.',':']
# for c in chars:
#     train_df.replace(c,'',inplace=True)
#     test_df.replace(c,'',inplace=True)
    


Split material, and title column to several columns where each column represent the word found at the i'th place in the string occured on material/title

In [14]:
def split_by_and_add_cols(df, col_to_split):
    new_df = (df[col_to_split].str.split(' ', expand=True)
            .rename(columns=lambda x: f"{col_to_split}_{x+1}"))
    new_df = new_df.loc[:, new_df.isnull().mean() < .9]
    new_df = new_df.fillna(f"{col_to_split}_other")
    df = pd.merge(left=df, left_index=True,
                  right=new_df, right_index=True,
                  how='inner')
    df.drop(col_to_split,axis=1,inplace=True)
    return df

train_df = split_by_and_add_cols(train_df,'material')
train_df = split_by_and_add_cols(train_df,'title')
test_df = split_by_and_add_cols(test_df,'material')
test_df = split_by_and_add_cols(test_df,'title')
print('train data frame columns : \n', train_df.columns)
print('test data frame columns : \n', test_df.columns)

train data frame columns : 
 Index(['id', 'price', 'brand', 'style', 'heel_type', 'heel_height', 'width',
       'shoe_width', 'occasion', 'country_region_of_manufacture',
       'lining_material', 'upper_material', 'shoe_size', 'toe_shape', 'model',
       'year_of_manufacture', 'size', 'colour', 'color', 'main_colour',
       'lining', 'sole', 'vintage', 'closure', 'pattern', 'theme', 'fastening',
       'platform_height', 'location', 'n_sold', 'n_watchers', 'free_shipping',
       'longtime_member', 'same_day_shipping', 'fast_safe_shipping', 'returns',
       'feedback', 'condition', 'seller_notes', 'category', 'material_1',
       'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6',
       'title_7', 'title_8', 'title_9', 'title_10', 'title_11', 'title_12',
       'title_13', 'title_14'],
      dtype='object')
test data frame columns : 
 Index(['id', 'brand', 'style', 'heel_type', 'heel_height', 'width',
       'shoe_width', 'occasion', 'country_region_of_manufacture',

Using NLTK library to extract the most frequent keywords, and replace in a categorical column.

** Eventually I decided not using it as it didn't improved the score

In [None]:
# replaces with the first most frequent word
def assign_key_words(row, lst, col, nan_name):
    first = True
    if row[col] != nan_name:
        copy = row[col]
        for x in lst:
            if x[0] in copy and first:
                row[col] = x[0]
                first = False
            elif x[0] in copy:
                row[col] += f" {x[0]}"
    else:
        row[col] = nan_name
    return row


def use_nltk(df, cols):
    for col in cols:
        df[col] = df[col].fillna("other")
        all_d = list(df[col])
        all_d_string = "\n".join(all_d)
        r = Rake()
        r.extract_keywords_from_text(all_d_string)
        lst = []
        for x in r.get_word_frequency_distribution().most_common():
            if x[1] > df.shape[0] * 0.005:
                lst.append(x)
        df = df.apply(assign_key_words,axis=1,args=(lst, col, "other"))
    return df

#train_df = use_nltk(train_df,['occasion'])
#train_df.occasion.unique()
#train_df = split_by_and_add_cols(train_df,'occasion')
#train_df.columns
        

Noticing for example that for Birkenstock there are > 15 sheos in the data , but only 2 of them has country of manufacture which is Germany, those I filled for every shoe brand it's country_of_manufcature based on other sheos with same brand that there is country_of_manufacture, and if there were several manufacturing area's then I picked the most common.

In [15]:
def handle_country_of_manufacture(df):
    df.country_region_of_manufacture = df.country_region_of_manufacture.fillna('unknown')
    origin_df = df
    df = df[['brand','country_region_of_manufacture']]
    df = df[~df['country_region_of_manufacture'].str.contains('^unknown', case=False)]
    brand_groups = df.groupby(by='brand')
    for grp in brand_groups:
        curr_brand = grp[0]
        inner = grp[1].groupby(by='country_region_of_manufacture').count().\
        sort_values(by='brand',ascending=False).reset_index()
        country = inner.iloc[0]['country_region_of_manufacture']
        origin_df.loc[origin_df['brand'] == curr_brand,'country_region_of_manufacture'] = country
    return origin_df
train_df = handle_country_of_manufacture(train_df)
test_df = handle_country_of_manufacture(test_df)

### Label Encoder

In [17]:
le = LabelEncoder()
encode_df = pd.concat([train_df.drop('price',axis=1),test_df])
encode_vals = encode_df.drop(drop, axis=1).select_dtypes(exclude=np.number).fillna('other').stack().values
le.fit(encode_vals)

LabelEncoder()

### Preprocess and fit the train

In [18]:
train_df_dropped = train_df.drop(drop, axis=1)
df = train_df_dropped.select_dtypes(exclude=np.number).fillna('other')
train_df = pd.concat([df.apply(le.transform), train_df_dropped.select_dtypes(include=np.number).fillna(2)],axis=1)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(['price','id'],axis=1), train_df.price,
                                                    test_size=0.2, random_state=30)

### Tuning Parameters

In [None]:
xgb_reg = xgboost.XGBRegressor(n_jobs=-1)
parameters = {'nthread':[4], 
              'objective':['reg:squarederror'],
              'learning_rate': [.03, 0.05, .07], 
              'max_depth': [4, 5, 6, 7,9,11],
              'colsample_bytree': [0.4, 0.6, 0.8, 1.0],
              'min_child_weight': [1, 5, 10],
              'subsample': [0.7, 1.0],
              'colsample_bytree': [0.7],
              'n_estimators': [400 ,500,800,1000]}

xgb_grid = GridSearchCV(xgb_reg,
                        parameters,
                        cv = 3,
                        n_jobs = -1,
                        verbose=True)

xgb_grid.fit(X_train, np.log(y_train))

In [None]:
import pickle
xgb_best = xgb_grid.best_estimator_
# Save model
#xgb_best = pickle.load(open("xgb_log_reg.pickle",'rb'))

#print(xgb_best.get_xgb_params())
# fit and calcualte rmse
#xgb_best.fit(X_train, np.log(y_train))
#np.sqrt(sklearn.metrics.mean_squared_error(np.log(y_test),xgb_best.predict(X_test)))

Taking Best parameters and run the model using them

In [20]:
# Those are the parameters recieved by the gridsearch cv
parameters = {'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 
     'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.7,
     'gamma': 0, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 0.03, 
     'max_delta_step': 0, 'max_depth': 11, 'min_child_weight': 5, 
     'monotone_constraints': '()', 'n_jobs': -1, 'num_parallel_tree': 5, 
     'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 1.0,
     'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'nthread': 4,
             'n_estimators': 1000}

xgb_reg = xgboost.XGBRegressor(**parameters)
xgb_reg.fit(X_train, np.log(y_train))
np.sqrt(sklearn.metrics.mean_squared_error(np.log(y_test),xgb_reg.predict(X_test)))

0.4805672854156389

Feature Importance : 

In [21]:
# a glance at the feature importance
pd.DataFrame(zip(xgb_reg.feature_importances_,X_train.columns)).sort_values(0,ascending=False)

Unnamed: 0,0,1
14,0.318223,location
9,0.083241,sole
15,0.078643,condition
6,0.040107,country_region_of_manufacture
17,0.037677,category
36,0.032413,same_day_shipping
34,0.031364,free_shipping
37,0.028079,fast_safe_shipping
0,0.024834,brand
38,0.022443,returns


### Predict on test and save it

In [None]:
test_df_dropped = test_df.drop(drop, axis=1)
df = test_df_dropped.select_dtypes(exclude=np.number).fillna('other')
test_df = pd.concat([df.apply(le.transform), test_df_dropped
                    .select_dtypes(include=np.number).fillna(2)],axis=1)
# train model
xgb_reg = xgboost.XGBRegressor(**parameters)
xgb_reg.fit(train_df.drop(['price','id'], axis=1),np.log(train_df['price']))
# save test predictions to model01.csv using xgb_best hyper parameters
pd.DataFrame(zip(test_df.id, xgb_reg.predict(test_df.drop('id',axis=1))),
             columns=['id','price_pred']).to_csv('model05.csv',index=False)

### Utils

In [None]:
# Find subset of columns that works good with the model
def powerset(lst):
    return reduce(lambda result, x: result + [subset + [x] for subset in result], lst, [[]])
all_subsets = powerset(drop_cols)


def try_all():
    m = 10
    s = []
    t_df = pd.read_csv('real_test.csv')
    tr_df = pd.read_csv('all_train_data.csv')
    
    drop_cols = ['closure','color','colour','fastening','lining','lining_material','main_colour','model','n_sold',
              'n_watchers','platform_height','seller_notes','shoe_size','shoe_width','size','sole','theme',
             'title','vintage','width','year_of_manufacture','style']
    
    tr_df = tr_df.applymap(lambda x: x.lower() if type(x) == str else x)
    #train_df = train_df.applymap(lambda x: x.strip() if type(x) == str else x)

    t_df = t_df.applymap(lambda x: x.lower() if type(x) == str else x)
    #test_df = test_df.applymap(lambda x: x.strip() if type(x) == str else x)

    tr_df.location = tr_df.location.fillna('other')
    tr_df = tr_df.apply(extract_country, axis=1)

    t_df.location = t_df.location.fillna('other')
    t_df = t_df.apply(extract_country, axis=1)
    
    cols = powerset(drop_cols)
    arr = random.sample(cols,1000)
    for i,drop_cols in enumerate(arr):
        test_df = t_df
        train_df = tr_df
        le = LabelEncoder()
        encode_df = pd.concat([train_df.drop('price',axis=1),test_df])
        encode_vals = encode_df.drop(drop_cols, axis=1).select_dtypes(exclude=np.number).fillna('other').stack().values
        le.fit(encode_vals)
        train_df_dropped = train_df.drop(drop_cols, axis=1)
        df = train_df_dropped.select_dtypes(exclude=np.number).fillna('other')
        train_df = pd.concat([df.apply(le.transform), train_df_dropped.select_dtypes(include=np.number)
                              .fillna(2)],axis=1)
        X_train, X_test, y_train, y_test = train_test_split(train_df.drop(['price','id'],axis=1), train_df.price,
                                                        test_size=0.2, random_state=42)
        #xgb_clf = xgboost.XGBRegressor(n_jobs=-1)
        #xgb_clf.fit(X_train, np.log(y_train))
        xgb_best.fit(X_train, np.log(y_train))
        p = np.sqrt(sklearn.metrics.mean_squared_error(np.log(y_test),xgb_best.predict(X_test)))
        if p < m:
            m = p
            s = drop_cols
            print(m, s)
        if i % 10 == 0:
            print(i)
    return m,s

In [None]:
m,s = try_all()