In [1]:
import numpy as np 
import pandas as pd
import os
import gc
import re

# feature libraries
from scipy.sparse import hstack, csr_matrix
from sklearn import preprocessing

In [2]:
data_dir = '../input/kaggle_data'
feat_dir = '../input/features'

In [3]:
used_cols = ['item_id','image_top_1','item_seq_number', 'price', 'region', 'city', 'param_3', 'param_2', 'param_1', 
             'category_name','parent_category_name']

train = pd.read_csv(f'{data_dir}/train.csv', index_col="item_id", usecols=used_cols)
test = pd.read_csv(f'{data_dir}/test.csv', index_col="item_id", usecols=used_cols)

train_idx = train.index
test_idx = test.index

df = pd.concat([train, test], axis=0)

del train, test
gc.collect()

14

In [4]:
df.columns[df.isna().any()].tolist()

['param_1', 'param_2', 'param_3', 'price', 'image_top_1']

In [5]:
# impute price
for c in ['price','image_top_1']:
    df[c+'_missing'] = 0
    df[c+'_missing'] = np.where(df[c].isnull(), 1, df[c+'_missing'])   
    
for c in ['param_1', 'param_2', 'param_3']:
    df[c].fillna(value='none', inplace=True)
    
cat_cols = ['region','city','parent_category_name','category_name','param_1','param_2','param_3']

for cols in cat_cols:
    df[cols] = df[cols].astype(str)
    df[cols] = df[cols].str.lower()
    df[cols] = df[cols].replace(to_replace=' +', value=' ', regex=True)
    
lbl = preprocessing.LabelEncoder()
for col in cat_cols:
    df[col] = lbl.fit_transform(df[col])
    
def get_wo_nan_price(df):
    df_wo_nan = pd.DataFrame(index=df.index)
    df_wo_nan['price_wo_nan'] = df.groupby(['region','city','parent_category_name','category_name','param_1','param_2','param_3'])['price'].apply(lambda x: x.fillna(x.median()))
    print(df_wo_nan['price_wo_nan'].isnull().sum())
    df_wo_nan['price_wo_nan'] = df.groupby(['region','city','parent_category_name','category_name','param_1','param_2'])['price'].apply(lambda x: x.fillna(x.median()))
    print(df_wo_nan['price_wo_nan'].isnull().sum())
    df_wo_nan['price_wo_nan'] = df.groupby(['region','city','parent_category_name','category_name','param_1'])['price'].apply(lambda x: x.fillna(x.median()))
    print(df_wo_nan['price_wo_nan'].isnull().sum())
    df_wo_nan['price_wo_nan'] = df.groupby(['region','city','parent_category_name','category_name'])['price'].apply(lambda x: x.fillna(x.median()))
    print(df_wo_nan['price_wo_nan'].isnull().sum())
    df_wo_nan['price_wo_nan'] = df.groupby(['region','city','parent_category_name'])['price'].apply(lambda x: x.fillna(x.median()))
    print(df_wo_nan['price_wo_nan'].isnull().sum())
    df_wo_nan['price_wo_nan'] = df.groupby(['region','city'])['price'].apply(lambda x: x.fillna(x.median()))
    print(df_wo_nan['price_wo_nan'].isnull().sum())
    df_wo_nan['price_wo_nan'] = df.groupby(['region'])['price'].apply(lambda x: x.fillna(x.median()))
    print(df_wo_nan['price_wo_nan'].isnull().sum())
    return df_wo_nan

price_imputed = get_wo_nan_price(df)
df = df.merge(price_imputed, left_index=True, right_index=True, how='left')


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


13695
12308
8962
1989
584
7
0


In [15]:
df.drop(['region','city','parent_category_name','category_name','param_1','param_2','param_3','price',], axis=1, inplace=True)

In [7]:
for c in ['image_top_1','price_wo_nan','item_seq_number']:
    df[c].fillna(value=0, inplace=True)
    df[c] = (df[c] - np.mean(df[c]))/np.std(df[c])

In [8]:
df.rename(columns={'price_wo_nan': 'price'}, inplace=True)

In [9]:
feat_cols = list(df.columns)[0:]
feat_cols

['item_seq_number',
 'image_top_1',
 'price_missing',
 'image_top_1_missing',
 'price']

In [10]:
train_price = df.loc[train_idx,:]
train_price.head()

Unnamed: 0_level_0,item_seq_number,image_top_1,price_missing,image_top_1_missing,price
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
b912c3c6a6ad,-0.136324,-0.152807,0,0,-0.005147
2dac0150717d,-0.133284,-0.472375,0,0,-0.005101
ba83aefab5dc,-0.135073,1.894048,0,0,-0.005083
02996f1dd2ea,-0.085533,-0.367201,0,0,-0.005115
7c90be56d2ab,-0.136146,1.117376,0,0,-0.004443


In [11]:
test_price = df.loc[test_idx,:]
test_price.head()

Unnamed: 0_level_0,item_seq_number,image_top_1,price_missing,image_top_1_missing,price
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6544e41a8817,-0.124879,0.870621,1,0,-0.005135
65b9484d670f,-0.135967,-1.17219,0,1,-0.005101
8bab230b2ecd,-0.134,1.821235,0,0,-0.004888
8e348601fefc,-0.124163,-1.17219,0,1,-0.005074
8bd2fe400b89,-0.134,-0.158875,0,0,-0.005067


In [17]:
emb_cat_max = {}
for c in ['image_top_1_missing']:
    emb_cat_max[c] = max(train_price[c].max(), test_price[c].max())+1
emb_cat_max

{'image_top_1_missing': 2}

In [18]:
train_price.to_csv(f'{feat_dir}/train_numeric.csv', index=True, header=True)
test_price.to_csv(f'{feat_dir}/test_numeric.csv', index=True, header=True)