In [292]:
from __future__ import division
from nltk.util import ngrams
from collections import Counter
from nltk.stem.porter import PorterStemmer
from difflib import SequenceMatcher as seq_matcher

import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import text
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

%matplotlib inline

In [18]:
def eval_metric(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(eval_metric, greater_is_better=False)

In [86]:
# pattern to remove stopwords
pattern = re.compile(r'\b(' + r'|'.join(text.ENGLISH_STOP_WORDS) + r')\b\s*')

In [166]:
# instantiate porter stemmer
stemmer = PorterStemmer()

## Evaluation metric: RMSE

In [144]:
# load train and test set
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## Query Expansion

In [109]:
def query_expansion(train, test, y, top_words=10):
    queries_ext_train = np.zeros(len(train)).astype(np.object)
    queries_ext_test = np.zeros(len(test)).astype(np.object)

    queries = train.search_term
    queries_test = test.search_term
    
    titles = train.product_title
    titles_test = test.product_title
    
    for q in np.unique(queries):
        q_mask = (queries == q).values
        q_test = (queries_test == q).values

        titles_q = titles[q_mask]
        y_q = y[q_mask]

        good_mask = (y_q > 2.).values
        titles_good = titles_q[good_mask]

        ext_q = str(q)

        for item in titles_good:
            ext_q += ' ' + str(item)
        
        ext_q = pattern.sub('', ext_q)
        c = [word for word, it in Counter(ext_q.split()).most_common(top_words)]
        c = ' '.join(c)

        queries_ext_train[q_mask] = c
        queries_ext_test[q_test] = c
    
    return queries_ext_train, queries_ext_test

```
1. Preprocessing ( Sanitize strings, correct spelling mistakes )
2. Stemming ( Check to see if this helps )
```

In [188]:
def filter_characters(char):
    return char == '\n' or 32 <= ord(char) <= 126

def sanitize(s):
    s = s.replace('ft.', 'feet')
    s = s.replace('cu.', 'cubic')
    s = s.replace('mm', 'milimeters')
    s = s.replace('oz.', 'ounces')
    s = s.replace('btu', 'british thermal unit')
    s = s.replace('otr', 'over the range')
    s = s.replace('lb.', 'pounds')
    s = s.replace('in.', 'inches')
    s = s.replace('&amp;', 'and')
    s = s.replace('sq.', 'square')
    s = s.replace('gal.', 'gallon')
    
    return s

def preprocess(s):
    s = filter(filter_characters, s)
    s = s.lower()
    s = sanitize(s)
    
    return pattern.sub('', s)

In [148]:
def create_set(x):
    x = x.lower() # lowecase string
    x = x.split(' ') # split on empty space will work on clever ways to do this later
    return set(x)

def Dice(row):
    product_title = row['product_title']
    search_term = row['search_term']
    
    product_title = create_set(product_title)
    search_term = create_set(search_term)

    return (2 * len(product_title & search_term)) / (len(product_title) + len(search_term))

def Jaccard(row):
    product_title = row['product_title']
    search_term = row['search_term']
    
    product_title = create_set(product_title)
    search_term = create_set(search_term)
    
    return len(product_title & search_term) / len(product_title | search_term)

def Overlap(row):
    product_title = row['product_title']
    search_term = row['search_term']
    
    product_title = create_set(product_title)
    search_term = create_set(search_term)
    
    return len(product_title & search_term) / min(len(product_title), len(search_term))

## Feature Engineering

In [185]:
def num_words_in_query(query):
    return len(query.split())

def num_words_in_title(title):
    return len(title.split())

def query_title_overlap(row):
    query = row['search_term']
    title = row['product_title']
    query_words = query.split()
    
    count_overlap = 0
    for word in query_words:
        if query in title:
            count_overlap += 1
    
    return count_overlap

def levenshtein_distance(row):
    query = row['search_term']
    title = row['product_title']
    
    return 1 - seq_matcher(None, query, title).ratio()

## Simple Model Based on Average Dice score

In [198]:
X = train[['product_title', 'search_term']]
y = train.relevance

In [199]:
X.loc[:, 'product_title'] = X.product_title.map(preprocess)
X.loc[:, 'search_term'] = X.search_term.map(preprocess)

In [200]:
test.loc[:, 'product_title'] = test.product_title.map(preprocess)
test.loc[:, 'search_term'] = test.search_term.map(preprocess)

In [201]:
## calculate average dice score
X.loc[:, 'dice_score'] = X.apply(Dice, axis=1)
test.loc[:, 'dice_score'] = test.apply(Dice, axis=1)

In [202]:
## calculate jaccard distance
X.loc[:, 'jaccard_distance'] = X.apply(Jaccard, axis=1)
test.loc[:, 'jaccard_distance'] = test.apply(Jaccard, axis=1)

In [203]:
## calculate overlap
X.loc[:, 'overlap'] = X.apply(Overlap, axis=1)
test.loc[:, 'overlap'] = test.apply(Overlap, axis=1)

In [204]:
## features
X.loc[:, 'num_words_in_query'] = X.search_term.map(num_words_in_query)
X.loc[:, 'num_words_in_title'] = X.product_title.map(num_words_in_title)
X.loc[:, 'query_title_overlap'] = X.apply(query_title_overlap, axis=1)
X.loc[:, 'one-edit-distance'] = X.apply(levenshtein_distance, axis=1)

## Train Test split

In [205]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1279)

## Linear Model

In [209]:
features = ['dice_score', 'jaccard_distance', 'overlap', 'num_words_in_query', \
            'num_words_in_title', 'query_title_overlap', 'one-edit-distance']

In [305]:
clf = LinearRegression()
etr = ExtraTreesRegressor(n_estimators=500, max_depth=7, min_samples_leaf=3, n_jobs=-1)
rf = RandomForestRegressor(n_estimators=250, max_depth=10, min_samples_leaf=3, min_samples_split=3, n_jobs=-1)
gbr = GradientBoostingRegressor()
xgbr = XGBRegressor(n_estimators=100, learning_rate=0.3, subsample=0.9)

In [306]:
# cross validation
scores = cross_val_score(xgbr, X_train[features], y_train, scoring=rmse_scorer, cv=5, n_jobs=-1)

In [307]:
print 'Average score %f and standard deviation %f ' %(np.mean(scores), np.std(scores))

Average score -0.495223 and standard deviation 0.003914 


In [308]:
xgbr.fit(X_train[features], y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.3, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9)

In [309]:
train_preds = xgbr.predict(X_train[features])
test_preds = xgbr.predict(X_test[features])

In [310]:
print 'RMSE on training set %f ' %(eval_metric(y_train, train_preds))
print 'RMSE on test set %f ' %(eval_metric(y_test, test_preds))

RMSE on training set 0.489262 
RMSE on test set 0.493011 


In [134]:
clf.fit(X[['dice_score']], y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [135]:
predictions = clf.predict(test[['dice_score']])

In [139]:
def cap_predictions(pred):
    if pred > 3.0:
        return 3.0
    elif pred < 1.0:
        return 1.0
    else:
        return pred
predictions = map(cap_predictions, predictions)

In [142]:
submissions = pd.read_csv('../data/sample_submission.csv')
submissions.loc[:, 'relevance'] = predictions

In [143]:
submissions.to_csv('../submissions/linear_model.csv', index=False)

## Count Vectorizer

In [75]:
train_corpus = X_train.apply(lambda x: '%s %s' %(x['search_term'], x['product_title']), axis=1)
test_corpus = X_test.apply(lambda x: '%s %s' %(x['search_term'], x['product_title']), axis=1)

In [76]:
count_vec = text.CountVectorizer(max_df=.95, min_df=3)
train_corpus = count_vec.fit_transform(train_corpus)
test_corpus = count_vec.transform(test_corpus)

In [77]:
clf = LinearRegression()
clf.fit(train_corpus, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [78]:
train_preds = clf.predict(train_corpus)
test_preds = clf.predict(test_corpus)

In [79]:
print 'RMSE on training set %f ' %(eval_metric(y_train, train_preds))
print 'RMSE on test set %f ' %(eval_metric(y_test, test_preds))

RMSE on training set 0.419057 
RMSE on test set 0.533445 


## TF-IDF Vectorizer

In [80]:
train_corpus = X_train.apply(lambda x: '%s %s' %(x['search_term'], x['product_title']), axis=1)
test_corpus = X_test.apply(lambda x: '%s %s' %(x['search_term'], x['product_title']), axis=1)

In [81]:
tfidf_vec = text.TfidfVectorizer(max_df=.95, min_df=3)
train_corpus = tfidf_vec.fit_transform(train_corpus)
test_corpus = tfidf_vec.transform(test_corpus)

In [82]:
clf = LinearRegression()
clf.fit(train_corpus, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [83]:
train_preds = clf.predict(train_corpus)
test_preds = clf.predict(test_corpus)

In [84]:
print 'RMSE on training set %f ' %(eval_metric(y_train, train_preds))
print 'RMSE on test set %f ' %(eval_metric(y_test, test_preds))

RMSE on training set 0.418427 
RMSE on test set 0.534408 


## Query Expansion

In [90]:
expanded_query_train, expanded_query_test = query_expansion(train, test, train.relevance)

In [92]:
train.loc[:, 'query'] = expanded_query_train

In [101]:
X = train[['product_title', 'query']]
y = train.relevance

In [102]:
X.loc[:, 'product_title'] = X.product_title.map(preprocess)
X.loc[:, 'query'] = X['query'].map(preprocess)

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=44)

In [104]:
train_corpus = X_train.apply(lambda x: '%s %s' %(x['query'], x['product_title']), axis=1)
test_corpus = X_test.apply(lambda x: '%s %s' %(x['query'], x['product_title']), axis=1)

In [105]:
tfidf_vec = text.TfidfVectorizer(max_df=.95, min_df=3)
train_corpus = tfidf_vec.fit_transform(train_corpus)
test_corpus = tfidf_vec.transform(test_corpus)

In [106]:
clf = LinearRegression()
clf.fit(train_corpus, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [107]:
train_preds = clf.predict(train_corpus)
test_preds = clf.predict(test_corpus)

In [108]:
print 'RMSE on training set %f ' %(eval_metric(y_train, train_preds))
print 'RMSE on test set %f ' %(eval_metric(y_test, test_preds))

RMSE on training set 0.413295 
RMSE on test set 0.555150 
