### Predict search relevance by count words

adopted from https://www.kaggle.com/omarelgabry/predict-search-relevance-by-count-words/notebook

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
import xgboost as xgb

In [4]:
train_df = pd.read_csv('../resources/train.csv', encoding='ISO-8859-1')
descriptions_df = pd.read_csv('../resources/product_descriptions.csv', encoding='ISO-8859-1')
test_df = pd.read_csv('../resources/test.csv', encoding='ISO-8859-1')

In [5]:
train_df.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [6]:
descriptions_df.head()

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they ..."
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...


In [7]:
test_df.head()

Unnamed: 0,id,product_uid,product_title,search_term
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668


### Feature Engineering

Convert product_title and product_description to count_words_in_title and count_words_in_description denoting the number of times a word in search_term appeared respectively. So, instead of dealing with strings, we have numbers that indicate the relevance between the search_term and product_title and product_description

In [8]:
# Step 1: Merge descriptions dataframe
train_df = pd.merge(train_df, descriptions_df, how='left', on='product_uid')
test_df = pd.merge(test_df, descriptions_df, how='left', on='product_uid')

In [9]:
train_df.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...


In [10]:
test_df.head()

Unnamed: 0,id,product_uid,product_title,search_term,product_description
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket,"Not only do angles make joints stronger, they ..."
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets,"Not only do angles make joints stronger, they ..."
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able,"Not only do angles make joints stronger, they ..."
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties,"Not only do angles make joints stronger, they ..."
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668,"Not only do angles make joints stronger, they ..."


In [13]:
# Step 2: Normalize the title and description using stemming algorithms, eg, SnowballStemmer
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

def str_stemmer(s):
    return ' '.join([stemmer.stem(word) for word in s.lower().split()])

train_df['search_term'] = train_df['search_term'].apply(str_stemmer)
train_df['product_title'] = train_df['product_title'].apply(str_stemmer)
train_df['product_description'] = train_df['product_description'].apply(str_stemmer)

test_df['search_term'] = test_df['search_term'].apply(str_stemmer)
test_df['product_title'] = test_df['product_title'].apply(str_stemmer)
test_df['product_description'] = test_df['product_description'].apply(str_stemmer)

In [14]:
train_df.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,"not onli do angl make joint stronger, they als..."
1,3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,"not onli do angl make joint stronger, they als..."
2,9,100002,behr premium textur deckov 1-gal. #sc-141 tugb...,deck over,3.0,behr premium textur deckov is an innov solid c...
3,16,100005,delta vero 1-handl shower on faucet trim kit i...,rain shower head,2.33,updat your bathroom with the delta vero single...
4,17,100005,delta vero 1-handl shower on faucet trim kit i...,shower on faucet,2.67,updat your bathroom with the delta vero single...


In [15]:
test_df.head()

Unnamed: 0,id,product_uid,product_title,search_term,product_description
0,1,100001,simpson strong-ti 12-gaug angl,90 degre bracket,"not onli do angl make joint stronger, they als..."
1,4,100001,simpson strong-ti 12-gaug angl,metal l bracket,"not onli do angl make joint stronger, they als..."
2,5,100001,simpson strong-ti 12-gaug angl,simpson sku abl,"not onli do angl make joint stronger, they als..."
3,6,100001,simpson strong-ti 12-gaug angl,simpson strong tie,"not onli do angl make joint stronger, they als..."
4,7,100001,simpson strong-ti 12-gaug angl,simpson strong tie hcc668,"not onli do angl make joint stronger, they als..."


In [19]:
# Step 3: Count how many times each word in search_term appeared in product_title and product_description

def count_words(strs):
    str_words, str_search = strs
    return sum(int(str_search.find(word) >= 0) for word in str_words.split())

train_df['count_words_in_title'] = train_df[['product_title', 'search_term']].apply(count_words, axis=1)
train_df['count_words_in_description'] = train_df[['product_description', 'search_term']].apply(count_words, axis=1)

test_df['count_words_in_title'] = test_df[['product_title', 'search_term']].apply(count_words, axis=1)
test_df['count_words_in_description'] = test_df[['product_description', 'search_term']].apply(count_words, axis=1)

In [20]:
train_df.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,count_words_in_title,count_words_in_description
0,2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,"not onli do angl make joint stronger, they als...",1,6
1,3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,"not onli do angl make joint stronger, they als...",0,3
2,9,100002,behr premium textur deckov 1-gal. #sc-141 tugb...,deck over,3.0,behr premium textur deckov is an innov solid c...,0,0
3,16,100005,delta vero 1-handl shower on faucet trim kit i...,rain shower head,2.33,updat your bathroom with the delta vero single...,2,7
4,17,100005,delta vero 1-handl shower on faucet trim kit i...,shower on faucet,2.67,updat your bathroom with the delta vero single...,3,6


In [21]:
test_df.head()

Unnamed: 0,id,product_uid,product_title,search_term,product_description,count_words_in_title,count_words_in_description
0,1,100001,simpson strong-ti 12-gaug angl,90 degre bracket,"not onli do angl make joint stronger, they als...",0,4
1,4,100001,simpson strong-ti 12-gaug angl,metal l bracket,"not onli do angl make joint stronger, they als...",0,3
2,5,100001,simpson strong-ti 12-gaug angl,simpson sku abl,"not onli do angl make joint stronger, they als...",1,4
3,6,100001,simpson strong-ti 12-gaug angl,simpson strong tie,"not onli do angl make joint stronger, they als...",1,1
4,7,100001,simpson strong-ti 12-gaug angl,simpson strong tie hcc668,"not onli do angl make joint stronger, they als...",1,1


In [22]:
# Step 4: Drop unnecessary colulmns
train_df.drop(['product_title', 'product_description', 'search_term'], inplace=True, axis=1)
test_df.drop(['product_title', 'product_description', 'search_term'], inplace=True, axis=1)

In [23]:
train_df.head()

Unnamed: 0,id,product_uid,relevance,count_words_in_title,count_words_in_description
0,2,100001,3.0,1,6
1,3,100001,2.5,0,3
2,9,100002,3.0,0,0
3,16,100005,2.33,2,7
4,17,100005,2.67,3,6


In [24]:
test_df.head()

Unnamed: 0,id,product_uid,count_words_in_title,count_words_in_description
0,1,100001,0,4
1,4,100001,0,3
2,5,100001,1,4
3,6,100001,1,1
4,7,100001,1,1


### Prepare train and test dataset

In [75]:
from sklearn.model_selection import train_test_split
X = train_df.drop(['id', 'relevance'], axis=1).copy()
y = train_df['relevance']
# X_test = test_df.drop('id', axis=1).copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=111)

In [76]:
X_train.head()

Unnamed: 0,product_uid,count_words_in_title,count_words_in_description
56570,168717,1,9
39832,141294,3,8
11168,107818,0,2
34989,134505,2,6
44967,148828,1,3


In [77]:
y_train.head()

56570    1.67
39832    3.00
11168    2.33
34989    2.67
44967    3.00
Name: relevance, dtype: float64

In [78]:
X_test.head()

Unnamed: 0,product_uid,count_words_in_title,count_words_in_description
68460,193280,3,12
68072,192447,2,26
44520,148172,2,6
40774,142612,2,2
24282,121293,1,2


In [79]:
from sklearn.metrics import mean_squared_error

def fmean_squared_error(ground_truth, predictions):
    fmean_squared_error_ = mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_

### Train with Linear Regression

In [80]:
lreg = LinearRegression()
lreg.fit(X_train, y_train)
y_pred = lreg.predict(X_test)
lreg_score = lreg.score(X_train, y_train)

In [81]:
fmean_squared_error(y_test, y_pred)

0.515326920349787

### Train with Xgboost

In [82]:
### Train with Xgboost
params = {'objective': 'reg:linear', 'max_depth': 10}
X_train_xgb = xgb.DMatrix(X_train, y_train)
X_test_xgb = xgb.DMatrix(X_test)

gbm = xgb.train(params, X_train_xgb, 20)
y_pred = gbm.predict(X_test_xgb)


In [83]:
fmean_squared_error(y_test, y_pred)

0.511492884138147

### Train with Random Forest

In [87]:
rfr = RandomForestRegressor(n_estimators=535, n_jobs=-1, random_state=2016, verbose=1)
rfr.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 535 out of 535 | elapsed:   14.7s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=535, n_jobs=-1,
           oob_score=False, random_state=2016, verbose=1, warm_start=False)

In [88]:
y_pred = rfr.predict(X_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 535 out of 535 | elapsed:    1.2s finished


In [89]:
fmean_squared_error(y_test, y_pred)

0.5678419707752179