In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [3]:
# English Snowball stemmer shared by the text-normalisation helper str_stemmer.
stemmer = SnowballStemmer('english')

# Raw inputs, read from the working directory. The train/test files are
# decoded as ISO-8859-1; the other two rely on the pandas default encoding.
df_train, df_test = (
    pd.read_csv(csv_name, encoding="ISO-8859-1")
    for csv_name in ('train.csv', 'test.csv')
)
df_attr, df_product_descrition = (
    pd.read_csv(csv_name)
    for csv_name in ('attributes.csv', 'product_descriptions.csv')
)


In [5]:
# Sanity check: (rows, columns) of every input table.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(df_train.shape)
print(df_test.shape)
print(df_attr.shape)
print(df_product_descrition.shape)
#Need to check NA values
# Number of distinct relevance values in the training labels; as the last
# expression of the cell it is what the notebook displays.
df_train.relevance.unique().shape

(74067, 5)
(166693, 4)
(2044803, 3)
(124428, 2)


(13,)

In [8]:
# Preview the first five rows of the test set.
df_test.head(n=5)

Unnamed: 0,id,product_uid,product_title,search_term
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668


In [9]:
# Number of distinct attribute names (a NaN name, if present, counts as one).
unique_attr = df_attr['name'].unique().size
# print(...) with a single argument behaves the same on Python 2 and 3.
print(unique_attr)

5411


In [10]:
# Stack train on top of test so the feature engineering is applied to both
# at once; ignore_index renumbers the rows 0..n-1.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(df_all.shape)

(240760, 5)


In [11]:
# Preview the combined train + test frame.
df_all.head(5)

Unnamed: 0,id,product_title,product_uid,relevance,search_term
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet


In [12]:
# Preview the product description table.
df_product_descrition.head(5)

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they ..."
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...


In [13]:
# Attach each product's description with a left join on product_uid, so
# every train/test row is kept.
df_all = df_all.merge(df_product_descrition, how='left', on='product_uid')

In [14]:
# Preview df_all with the product_description column attached.
df_all.head(5)

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,"Not only do angles make joints stronger, they ..."
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ..."
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,Update your bathroom with the Delta Vero Singl...
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,Update your bathroom with the Delta Vero Singl...


In [15]:
# Preview of df_all before any text normalisation is applied.
df_all.head(5)

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,"Not only do angles make joints stronger, they ..."
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ..."
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,Update your bathroom with the Delta Vero Singl...
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,Update your bathroom with the Delta Vero Singl...


In [16]:
# English stop words, loaded once. Kept as a set so the per-token membership
# test in remove_stop_words is a hash lookup rather than a list scan.
cachedStopWords = set(stopwords.words("english"))
def str_stemmer(s):
    """Lower-case ``s`` and stem each whitespace-separated token.

    The stemmed tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    stemmed_tokens = []
    for token in s.lower().split():
        stemmed_tokens.append(stemmer.stem(token))
    return " ".join(stemmed_tokens)
def remove_stop_words(s):
    """Return ``s`` without the tokens listed in ``cachedStopWords``.

    ``s`` is split on whitespace and the surviving tokens are re-joined with
    single spaces. Matching is exact and case-sensitive.
    """
    kept_tokens = [token for token in s.split() if token not in cachedStopWords]
    return " ".join(kept_tokens)

In [17]:
#stemming
# Lower-case and stem the three free-text columns, overwriting each in place.
for text_col in ('search_term', 'product_title', 'product_description'):
    df_all[text_col] = df_all[text_col].map(str_stemmer)

In [18]:
#remove stop words
# NOTE(review): this runs on already-stemmed text, so a stop word whose stem
# differs from its dictionary form is not matched (e.g. "onli" survives in
# the df_all preview) -- confirm whether filtering should precede stemming.
for text_col in ('search_term', 'product_title', 'product_description'):
    df_all[text_col] = df_all[text_col].map(remove_stop_words)

In [19]:
#creating features
# Number of whitespace-separated tokens left in the query.
query_token_counts = df_all['search_term'].map(lambda query: len(query.split()))
df_all['len_of_query'] = query_token_counts.astype(np.int64)

# Query, title and description joined with tabs into one string per row;
# the word-overlap features split it apart again on '\t'.
query_and_title = df_all['search_term'] + "\t" + df_all['product_title']
df_all['product_info'] = query_and_title + "\t" + df_all['product_description']

In [20]:
# Preview with the new len_of_query and product_info columns.
df_all.head(5)

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,len_of_query,product_info
0,2,simpson strong-ti 12-gaug angl,100001,3.0,angl bracket,"onli angl make joint stronger, also provid con...",2,angl bracket\tsimpson strong-ti 12-gaug angl\t...
1,3,simpson strong-ti 12-gaug angl,100001,2.5,l bracket,"onli angl make joint stronger, also provid con...",2,l bracket\tsimpson strong-ti 12-gaug angl\tonl...
2,9,behr premium textur deckov 1-gal. #sc-141 tugb...,100002,3.0,deck,behr premium textur deckov innov solid color c...,1,deck\tbehr premium textur deckov 1-gal. #sc-14...
3,16,delta vero 1-handl shower onli faucet trim kit...,100005,2.33,rain shower head,updat bathroom delta vero single-handl shower ...,3,rain shower head\tdelta vero 1-handl shower on...
4,17,delta vero 1-handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,updat bathroom delta vero single-handl shower ...,3,shower onli faucet\tdelta vero 1-handl shower ...


In [21]:
def str_common_word(str1, str2):
    """Count the tokens of ``str1`` that occur somewhere inside ``str2``.

    ``str1`` is split on whitespace and every token (duplicates included)
    contributes 1 when it appears in ``str2``. This is a substring test, not
    a whole-word match: the token "l" is counted for any ``str2`` containing
    the letter "l".
    """
    hits = 0
    for token in str1.split():
        if token in str2:
            hits += 1
    return hits

In [22]:
# Split product_info back into [query, title, description] once per row,
# then count how many query tokens occur in the title and in the description.
info_fields = df_all['product_info'].map(lambda info: info.split('\t'))
df_all['word_in_title'] = info_fields.map(
    lambda fields: str_common_word(fields[0], fields[1]))
df_all['word_in_description'] = info_fields.map(
    lambda fields: str_common_word(fields[0], fields[2]))

In [23]:
# Preview with the word_in_title / word_in_description counts.
df_all.head(5)

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,len_of_query,product_info,word_in_title,word_in_description
0,2,simpson strong-ti 12-gaug angl,100001,3.0,angl bracket,"onli angl make joint stronger, also provid con...",2,angl bracket\tsimpson strong-ti 12-gaug angl\t...,1,1
1,3,simpson strong-ti 12-gaug angl,100001,2.5,l bracket,"onli angl make joint stronger, also provid con...",2,l bracket\tsimpson strong-ti 12-gaug angl\tonl...,1,1
2,9,behr premium textur deckov 1-gal. #sc-141 tugb...,100002,3.0,deck,behr premium textur deckov innov solid color c...,1,deck\tbehr premium textur deckov 1-gal. #sc-14...,1,1
3,16,delta vero 1-handl shower onli faucet trim kit...,100005,2.33,rain shower head,updat bathroom delta vero single-handl shower ...,3,rain shower head\tdelta vero 1-handl shower on...,1,1
4,17,delta vero 1-handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,updat bathroom delta vero single-handl shower ...,3,shower onli faucet\tdelta vero 1-handl shower ...,3,2


In [24]:
# The text columns have been turned into numeric features; drop them so only
# ids, the target and the engineered features remain.
text_columns = ['search_term', 'product_title', 'product_description', 'product_info']
df_all = df_all.drop(text_columns, axis=1)

In [25]:
# Preview of the columns left after dropping the text fields.
df_all.head(5)

Unnamed: 0,id,product_uid,relevance,len_of_query,word_in_title,word_in_description
0,2,100001,3.0,2,1,1
1,3,100001,2.5,2,1,1
2,9,100002,3.0,1,1,1
3,16,100005,2.33,3,1,1
4,17,100005,2.67,3,3,2


In [26]:
# Undo the earlier concat: the training rows come first in df_all and the
# test rows follow. The boundary is taken from df_train instead of being
# hard-coded, so the cell keeps working if the input files change. df_train
# has the training row count both before and after this cell runs, so
# re-running the cell gives the same split.
n_train = len(df_train)
df_train = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:]
# Test ids are kept aside for the submission file.
id_test = df_test['id']

In [27]:
# Regression target: the relevance column of the training rows, as an array.
y_train = df_train.relevance.values

In [28]:
# Feature matrices: every remaining column except the row id and the target.
# NOTE(review): product_uid is still among the features -- confirm that an
# identifier column is meant to be a model input.
non_feature_cols = ['id', 'relevance']
x_train = df_train.drop(non_feature_cols, axis=1).values
x_test = df_test.drop(non_feature_cols, axis=1).values

In [29]:
# Shallow random forest used as the base learner of the bagging ensemble.
rf = RandomForestRegressor(
    n_estimators=15,
    max_depth=6,
    random_state=0,
)
# Bag 45 copies of the forest, with max_samples=0.1 controlling how much of
# the training data each copy draws. Both seeds are fixed for repeatability.
clf = BaggingRegressor(
    rf,
    n_estimators=45,
    max_samples=0.1,
    random_state=25,
)
# Last expression of the cell, so the fitted estimator's repr is displayed.
clf.fit(x_train, y_train)

BaggingRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=15, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.1, n_estimators=45, n_jobs=1, oob_score=False,
         random_state=25, verbose=0, warm_start=False)

In [30]:
# Predicted relevance for every test row, in the same order as x_test.
y_pred = clf.predict(X=x_test)

In [31]:
# Pair each test id with its predicted relevance and write the result to
# submission.csv without the index column.
submission = pd.DataFrame({"id": id_test, "relevance": y_pred})
submission.to_csv('submission.csv', index=False)