In [17]:
import numpy as np 
import pandas as pd
import os
import gc
import re

# feature libraries
from scipy.sparse import hstack, csr_matrix
from sklearn import preprocessing

In [18]:
# Relative locations of the raw competition CSVs (read from) and of the
# engineered feature files (written to).
data_dir, feat_dir = '../input/kaggle_data', '../input/features'

In [27]:
# Only the item_id key and the categorical attribute columns are loaded.
used_cols = [
    'item_id',
    'region', 'city',
    'parent_category_name', 'category_name',
    'user_type',
    'param_1', 'param_2', 'param_3',
]

# Read both splits the same way, keyed by item_id.
train, test = (
    pd.read_csv(f'{data_dir}/{split}.csv', index_col="item_id", usecols=used_cols)
    for split in ('train', 'test')
)

# Remember which item_ids belong to which split so the stacked frame can be
# separated again after the columns have been encoded.
train_idx, test_idx = train.index, test.index

# Stack train on top of test so that both splits are transformed together.
df = pd.concat([train, test], axis=0)

# The per-split frames are no longer needed; release them.
del train, test
gc.collect()

35

In [28]:
# List the columns that contain at least one missing value.
[name for name, has_na in df.isna().any().items() if has_na]

['param_1', 'param_2', 'param_3']

In [29]:
# Give missing param_* values an explicit 'missing' category.
# The result is assigned back instead of calling fillna(inplace=True) on
# `df[c]`: with pandas copy-on-write the in-place call only modifies a
# temporary, the NaNs survive, and astype(str) below would turn them into
# the literal category 'nan'.
param_cols = ['param_1', 'param_2', 'param_3']
df[param_cols] = df[param_cols].fillna('missing')

# re-label categorical data
cat_cols = ['region','city','parent_category_name','category_name','user_type','param_1','param_2','param_3']

# The encoder is refit for every column, so after the loop `lbl` only holds
# the classes of the last column in cat_cols.
lbl = preprocessing.LabelEncoder()
for col in cat_cols:
    # Normalise the text first so values that differ only by letter case or
    # by runs of spaces end up with the same integer code.
    normalised = (
        df[col]
        .astype(str)
        .str.lower()
        .str.replace(' +', ' ', regex=True)
    )
    # Replace the text with integer codes, fitted on train and test together.
    df[col] = lbl.fit_transform(normalised)


In [30]:
# Every column of the encoded frame is used as a feature.
feat_cols = df.columns.tolist()
feat_cols

['region',
 'city',
 'parent_category_name',
 'category_name',
 'param_1',
 'param_2',
 'param_3',
 'user_type']

In [31]:
# Re-check for missing values after encoding: list any column that still has one.
df.isna().any().pipe(lambda flags: flags[flags].index.tolist())

[]

In [32]:
# Select the training rows from the combined encoded frame by item_id.
# NOTE(review): label-based selection assumes item_id values do not repeat
# across train and test -- confirm.
train_le = df.loc[train_idx]
train_le.head()

Unnamed: 0_level_0,region,city,parent_category_name,category_name,param_1,param_2,param_3,user_type
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
b912c3c6a6ad,19,462,4,42,249,78,775,1
2dac0150717d,17,1314,2,22,122,78,775,1
ba83aefab5dc,16,1290,0,2,83,78,775,1
02996f1dd2ea,21,950,4,42,38,78,775,0
7c90be56d2ab,4,318,6,0,278,124,46,1


In [33]:
# Select the test rows from the combined encoded frame by item_id.
# NOTE(review): label-based selection assumes item_id values do not repeat
# across train and test -- confirm.
test_le = df.loc[test_idx]
test_le.head()

Unnamed: 0_level_0,region,city,parent_category_name,category_name,param_1,param_2,param_3,user_type
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6544e41a8817,4,318,4,10,110,198,74,1
65b9484d670f,19,993,8,5,119,78,775,1
8bab230b2ecd,12,151,0,2,318,78,775,1
8e348601fefc,18,1319,2,4,108,131,775,1
8bd2fe400b89,14,243,4,42,102,78,775,1


In [35]:
# Create the output directory if needed so a fresh run does not fail on the
# first write; this is a no-op when the directory already exists.
os.makedirs(feat_dir, exist_ok=True)

# Persist the label-encoded splits; the item_id index is written as the
# first column and the column names as the header row.
train_le.to_csv(f'{feat_dir}/train_le.csv', index=True, header=True)
test_le.to_csv(f'{feat_dir}/test_le.csv', index=True, header=True)