In [102]:
import pandas as pd
import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [198]:
train = pd.read_csv('./data/train.csv', index_col='id', encoding="ISO-8859-1")
test = pd.read_csv('./data/test.csv', index_col='id', encoding="ISO-8859-1")

products_description = pd.read_csv('./data/product_descriptions.csv')
products_attributes = pd.read_csv('./data/attributes.csv')

In [199]:
# merge datasets
train_merged = pd.merge(train, products_description, how='left', on='product_uid')
test_merged = pd.merge(test, products_description, how='left', on='product_uid')

In [202]:
train_merged.ix[0].product_description

'Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a "Z" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws'

In [226]:
import re

In [219]:
def addPeriod(matchobj):
    matched_group = matchobj.group(0)
    uppercase_index = re.search(r'[A-Z]', matched_group).start()
    return matched_group[:uppercase_index] + '. ' + matched_group[uppercase_index:]
re.sub(r'[a-z]+[A-Z][a-z]+', addPeriod, train_merged.ix[0].product_description)

'Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a "Z" at the end of the model number).Versatile connector for various 90 connections and home repair projects. Stronger than angled nailing or screw fastening alone. Help ensure joints are consistently straight and strong. Dimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steel. Galvanized for extra corrosion resistance. Install with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws'

In [221]:
'--'.join(train.search_term)

u"angle bracket--l bracket--deck over--rain shower head--shower only faucet--convection otr--microwave over stove--microwaves--emergency light--mdf 3/4--steele stake--briggs and stratton lawn mower--gas mowe--honda mower--hampton bay chestnut pull up shade--disposer--grill gazebo--door guards--metal plate cover gcfi--radiator grate--windows screens--1x1 rail decorative wood--4*8 beadboard paneling--4x8wood paneling--MDF 4x8--wainscot chair rail--wainscot plank paneling--lawn sprkinler--rainbird sprinkler--PLATFORM FOR WASHERS--samsung front load washer 3.7--upholstery washing machines with steam--CONCRETE & MASONRY CLEANER & ETCHER--concrete for ponds--flexlock for cracks--Belgium block pavers--ourdoor patio tile--insulation roll--6ft h bamboo fencing--balcony privacy screen--bamboo--privacy lattice panels--privacy panels--chalk paint--8 4616809045 9--shelf bracket--white 4shelves--6 teir shelving--hdx wire shelving--kitchen  cabinet finishes--kitchen wire shelf tiered--pantry rack--pl

In [231]:
preprocess_functions = [
    lambda x: x.replace('x', ' times '),
    lambda x: x.replace('/', ' by '),
    lambda x: x.replace("'", ' inches '),
    lambda x: x.replace('in.', ' inches '),
    lambda x: x.replace('ft', ' feet '),
    lambda x: x.replace('btu', ' british thermal unit '),
    lambda x: x.replace('mm', ' milimeters '),
    lambda x: x.replace('cc', ' cubic '),
    lambda x: x.replace('cfm', ' cubic feet per meter '),
    lambda x: x.replace('ga', ' gallons '),
    lambda x: x.replace('lbs', ' pounds '),
    lambda x: x.replace('*', ' times ')
]

transformed_search_terms = []

for transformer_func in preprocess_functions:
    transformed_search_terms.append(train['search_term'].apply(transformer_func))

In [235]:
meta = np.asarray(transformed_search_terms).T

In [236]:
meta

array([[u'angle bracket', u'angle bracket', u'angle bracket', ...,
        u'angle bracket', u'angle bracket', u'angle bracket'],
       [u'l bracket', u'l bracket', u'l bracket', ..., u'l bracket',
        u'l bracket', u'l bracket'],
       [u'deck over', u'deck over', u'deck over', ..., u'deck over',
        u'deck over', u'deck over'],
       ..., 
       [u'schlage lock siena half dummy knob with',
        u'schlage lock siena half dummy knob with',
        u'schlage lock siena half dummy knob with', ...,
        u'schlage lock siena half dummy knob with',
        u'schlage lock siena half dummy knob with',
        u'schlage lock siena half dummy knob with'],
       [u'zen garden  decor', u'zen garden  decor', u'zen garden  decor',
        ..., u'zen  gallons rden  decor', u'zen garden  decor',
        u'zen garden  decor'],
       [u'fine sheer curtain 63 inches', u'fine sheer curtain 63 inches',
        u'fine sheer curtain 63 inches', ...,
        u'fine sheer curtain 63 inches

In [232]:
transformed_search_terms[:50]

[id
 2                                       angle bracket
 3                                           l bracket
 9                                           deck over
 16                                   rain shower head
 17                                 shower only faucet
 18                                     convection otr
 20                               microwave over stove
 21                                         microwaves
 23                                    emergency light
 27                                            mdf 3/4
 34                                       steele stake
 35                     briggs and stratton lawn mower
 37                                           gas mowe
 38                                        honda mower
 48                 hampton bay chestnut pull up shade
 51                                           disposer
 65                                       grill gazebo
 69                                        door guards
 75   

In [233]:
train.search_term[:50]

id
2                               angle bracket
3                                   l bracket
9                                   deck over
16                           rain shower head
17                         shower only faucet
18                             convection otr
20                       microwave over stove
21                                 microwaves
23                            emergency light
27                                    mdf 3/4
34                               steele stake
35             briggs and stratton lawn mower
37                                   gas mowe
38                                honda mower
48         hampton bay chestnut pull up shade
51                                   disposer
65                               grill gazebo
69                                door guards
75                     metal plate cover gcfi
81                             radiator grate
85                            windows screens
88                   1x1 rail d

In [229]:
re.findall(r'[0-9]/[0-9]', 'mdf 3/4')

['3/4']

In [230]:
'mdf 3/4'.replace('/', ' by ')

'mdf 3 by 4'