In [68]:
from __future__ import division
from nltk.util import ngrams

import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import text
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.linear_model import LinearRegression

%matplotlib inline

In [18]:
def eval_metric(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(eval_metric, greater_is_better=False)

## Evaluation metric: RMSE

In [2]:
# load train and test set
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
X = train[['product_title', 'search_term']]
y = train.relevance

```
1. Preprocessing ( Sanitize strings, correct spelling mistakes )
2. Stemming ( Check to see if this helps )
```

In [55]:
pattern = re.compile(r'\b(' + r'|'.join(text.ENGLISH_STOP_WORDS) + r')\b\s*')

def filter_characters(char):
    return char == '\n' or 32 <= ord(char) <= 126

def sanitize(s):
    s = s.replace('ft.', 'feet')
    s = s.replace('cu.', 'cubic')
    s = s.replace('mm', 'milimeters')
    s = s.replace('oz.', 'ounces')
    s = s.replace('btu', 'british thermal unit')
    s = s.replace('otr', 'over the range')
    s = s.replace('lb.', 'pounds')
    s = s.replace('in.', 'inches')
    s = s.replace('&amp;', 'and')
    s = s.replace('sq.', 'square')
    s = s.replace('gal.', 'gallon')
    
    return s

def preprocess(s):
    s = filter(filter_characters, s)
    s = s.lower()
    s = sanitize(s)
    return pattern.sub('', s)

In [56]:
def generate_ngrams(sentence, ngram_range=3):
    possible_combinations = []
    for n in range(1, ngram_range + 1):
        ngram_tuples = ngrams(sentence.split(), n)
        for gram in ngram_tuples:
            term = ' '.join(gram)
            possible_combinations.append(term)
    return possible_combinations

def average_dice(row):
    search_term = row['search_term']
    product_title = row['product_title']
    
    product_title = product_title.lower()
    
    search_term_combinations = generate_ngrams(search_term, ngram_range=6) # generate possible combinations
    dice_scores = 0 # keep track of dice score
    product_title_count = len(set(product_title.split(' ')))
    
    for term in search_term_combinations:
        term = term.lower()
        term_words = term.split(' ')
        intersection_count = np.array([1 if w in product_title else 0 for w in term_words]).sum(axis=0)
        dice_scores += ((2 * intersection_count) / (product_title_count + len(set(term_words))))
    
    return dice_scores / 6.

In [57]:
X.loc[:, 'product_title'] = X.product_title.map(preprocess)
X.loc[:, 'search_term'] = X.search_term.map(preprocess)
X.loc[:, 'average_dice_score'] = X.apply(average_dice, axis=1)

In [58]:
X.head()

Unnamed: 0,product_title,search_term,average_dice_score
0,simpson strong-tie 12-gauge angle,angle bracket,0.122222
1,simpson strong-tie 12-gauge angle,l bracket,0.122222
2,behr premium textured deckover 1-gallon #sc-14...,deck,0.030303
3,delta vero 1-handle shower faucet trim kit chr...,rain shower head,0.1115
4,delta vero 1-handle shower faucet trim kit chr...,shower faucet,0.116162


## Train Test split

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1279)

## Count Vectorizer

In [75]:
train_corpus = X_train.apply(lambda x: '%s %s' %(x['search_term'], x['product_title']), axis=1)
test_corpus = X_test.apply(lambda x: '%s %s' %(x['search_term'], x['product_title']), axis=1)

In [76]:
count_vec = text.CountVectorizer(max_df=.95, min_df=3)
train_corpus = count_vec.fit_transform(train_corpus)
test_corpus = count_vec.transform(test_corpus)

In [77]:
clf = LinearRegression()
clf.fit(train_corpus, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [78]:
train_preds = clf.predict(train_corpus)
test_preds = clf.predict(test_corpus)

In [79]:
print 'RMSE on training set %f ' %(eval_metric(y_train, train_preds))
print 'RMSE on test set %f ' %(eval_metric(y_test, test_preds))

RMSE on training set 0.419057 
RMSE on test set 0.533445 


## TF-IDF Vectorizer

In [80]:
train_corpus = X_train.apply(lambda x: '%s %s' %(x['search_term'], x['product_title']), axis=1)
test_corpus = X_test.apply(lambda x: '%s %s' %(x['search_term'], x['product_title']), axis=1)

In [81]:
tfidf_vec = text.TfidfVectorizer(max_df=.95, min_df=3)
train_corpus = tfidf_vec.fit_transform(train_corpus)
test_corpus = tfidf_vec.transform(test_corpus)

In [82]:
clf = LinearRegression()
clf.fit(train_corpus, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [83]:
train_preds = clf.predict(train_corpus)
test_preds = clf.predict(test_corpus)

In [84]:
print 'RMSE on training set %f ' %(eval_metric(y_train, train_preds))
print 'RMSE on test set %f ' %(eval_metric(y_test, test_preds))

RMSE on training set 0.418427 
RMSE on test set 0.534408 
