In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# Read the tab-separated training file (first row is the header), fill the
# gaps in the text columns with placeholder labels, and cast the columns
# that should be treated as categorical to strings.
dataset = (
    pd.read_csv('data/train.tsv', sep='\t', header=0)
    .fillna({
        'category_name': 'Other',
        'brand_name': 'missing',
        'item_description': 'None',
    })
    .astype({
        'category_name': str,
        'brand_name': str,
        'shipping': str,  # cast to str so it is handled as a categorical value
        'item_condition_id': str,
    })
)

# Features are every column except the target; the target is the price.
X = dataset.drop(columns=['price'])
Y = dataset['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Show the first ten rows of the training features.
X_train.head(10)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description
604635,604635,Enzo,3,Women/Shoes/Mules & Clogs,missing,0,Enzo Angiolini Mules/Clogs. Super Cute with so...
496799,496799,Black dress,3,Women/Dresses/Full-Length,missing,1,Xl long black dress Solid under with lace over...
1035231,1035231,2 items for Brittany,1,Electronics/Cell Phones & Accessories/Cables &...,missing,0,- Urban Decay Eyeshadow This has never been us...
628659,628659,Texas budle,2,Women/Tops & Blouses/T-Shirts,missing,0,Very cute!! No flaws
261459,261459,North face rain coat,3,Women/Athletic Apparel/Jackets,The North Face,0,Sea foam green rain coat Size 18 XL in youth I...
959361,959361,Michael kors crossbody bag,3,Women/Women's Handbags/Messenger & Crossbody,Michael Kors,0,"Used , the color is rose gold"
199415,199415,Cream studded wedges,3,Women/Shoes/Sandals,missing,0,The brand is Hot Rated purchased at Buckle. I ...
1460850,1460850,Pink and gold trinket box,3,Home/Home Décor/Home Décor Accents,missing,0,"5.75"" tall and 4.5"" wide. Heavy resin. The fel..."
568899,568899,True religion button shirt slimsize XXXL,3,Men/Tops/T-shirts,True Religion Brand Jeans,1,Please remember the brand name usually run sma...
1365752,1365752,WD Scorpio Blue 1TB 2.5 hard drive,3,"Electronics/Computers & Tablets/Drives, Storag...",Western Digital,1,Used like new condition Can be used on PS4 (yo...


In [None]:
%%time

from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Shared text preprocessor borrowed from a default-configured CountVectorizer.
# NOTE(review): with default settings this presumably only lowercases the text
# (strip_accents defaults to None), so accents are likely NOT stripped - confirm.
preprocessor = CountVectorizer().build_preprocessor()


def build_field_preprocessor(field):
    """Build a preprocessor that extracts and preprocesses one column of a row.

    The vectorizers are fed whole rows (positional sequences in the column
    order of ``X_train``), so each one needs a callable that picks out its own
    column before the shared ``preprocessor`` is applied.

    Parameters
    ----------
    field : str
        Name of a column in ``X_train``.

    Returns
    -------
    callable
        Function taking one row and returning the preprocessed value found
        at the position of ``field``.

    Raises
    ------
    ValueError
        If ``field`` is not one of the columns of ``X_train``.
    """
    # Resolve the position once, at build time, rather than on every row.
    column_position = X_train.columns.tolist().index(field)

    def preprocess_field(row):
        return preprocessor(row[column_position])

    return preprocess_field


# One vectorizer per column, combined into a single feature matrix. Every
# vectorizer receives the whole row and relies on its field preprocessor to
# select (and preprocess) the column it is responsible for.
vectorizer = FeatureUnion([
    # Item title: unigrams and bigrams, vocabulary capped at 50k terms.
    ('name', CountVectorizer(
        ngram_range=(1, 2),
        max_features=50_000,
        preprocessor=build_field_preprocessor('name')
    )),
    # '.+' matches the entire value, so each category path is one token.
    ('category_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_field_preprocessor('category_name')
    )),
    # Same whole-value tokenisation for the brand.
    ('brand_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_field_preprocessor('brand_name')
    )),
    # Raw string for the regex: '\d' in a normal string literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    ('shipping', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_field_preprocessor('shipping')
    )),
    ('item_condition_id', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_field_preprocessor('item_condition_id')
    )),
    # Free-text description: TF-IDF over 1- to 3-grams, capped at 100k terms
    # (same value as before, written with conventional digit grouping).
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=100_000,
        preprocessor=build_field_preprocessor('item_description')
    ))
], n_jobs=16)  # NOTE(review): only six transformers are listed, so 16 jobs is likely more than can be used - confirm

# Learn the vocabularies from the training rows and transform them in one pass;
# the trailing expression displays a summary of the resulting matrix.
X_train_transformed = vectorizer.fit_transform(X_train.values)
X_train_transformed