In [864]:
import time
start_time = time.time()

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn import pipeline, grid_search
from sklearn.metrics import mean_squared_error, make_scorer
from nltk.stem.porter import *
import re
import random
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
#stemmer = PorterStemmer()

e_strip_punc = re.compile(r"[^a-zA-z0-9]+")
e_split_words = re.compile(r"(\s[a-z]+)([A-Z][a-z]+)")

random.seed(2016)

#load datasets
df_train = pd.read_csv('./input/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('./input/test.csv', encoding="ISO-8859-1")
df_attr = pd.read_csv('./input/attributes.csv')
df_pro_desc = pd.read_csv('./input/product_descriptions.csv')
df_brand = df_attr[df_attr.name == "MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "brand"})
#df_material = df_attr[df_attr.name == "Material"][["product_uid", "value"]].rename(columns={"value": "material"})

num_train = df_train.shape[0]

In [865]:
def str_stemmer(s):
    s = s.replace("'","in.") # character
    s = s.replace("inches","in.") # whole word
    s = s.replace("inch","in.") # whole word
    s = s.replace(" in ","in. ") # no period
    s = s.replace(" in.","in.") # prefix space

    s = s.replace("''","ft.") # character
    s = s.replace(" feet ","ft. ") # whole word
    s = s.replace("feet","ft.") # whole word
    s = s.replace("foot","ft.") # whole word
    s = s.replace(" ft ","ft. ") # no period
    s = s.replace(" ft.","ft.") # prefix space

    s = s.replace(" pounds ","lb. ") # character
    s = s.replace(" pound ","lb. ") # whole word
    s = s.replace("pound","lb.") # whole word
    s = s.replace(" lb ","lb. ") # no period
    s = s.replace(" lb.","lb.") 
    s = s.replace(" lbs ","lb. ") 
    s = s.replace("lbs.","lb.") 

    s = s.replace("*"," xby ")
    s = s.replace(" by"," xby")
    s = s.replace("x0"," xby 0")
    s = s.replace("x1"," xby 1")
    s = s.replace("x2"," xby 2")
    s = s.replace("x3"," xby 3")
    s = s.replace("x4"," xby 4")
    s = s.replace("x5"," xby 5")
    s = s.replace("x6"," xby 6")
    s = s.replace("x7"," xby 7")
    s = s.replace("x8"," xby 8")
    s = s.replace("x9"," xby 9")

    s = s.replace(" sq ft","sq.ft. ") 
    s = s.replace("sq ft","sq.ft. ")
    s = s.replace("sqft","sq.ft. ")
    s = s.replace(" sqft ","sq.ft. ") 
    s = s.replace("sq. ft","sq.ft. ") 
    s = s.replace("sq ft.","sq.ft. ") 
    s = s.replace("sq feet","sq.ft. ") 
    s = s.replace("square feet","sq.ft. ") 

    s = s.replace(" gallons ","gal. ") # character
    s = s.replace(" gallon ","gal. ") # whole word
    s = s.replace("gallons","gal.") # character
    s = s.replace("gallon","gal.") # whole word
    s = s.replace(" gal ","gal. ") # character
    s = s.replace(" gal","gal") # whole word

    s = s.replace(" ounces","oz.")
    s = s.replace(" ounce","oz.")
    s = s.replace("ounce","oz.")
    s = s.replace(" oz ","oz. ")

    s = s.replace(" centimeters","cm.")    
    s = s.replace(" cm.","cm.")
    s = s.replace(" cm ","cm. ")

    s = s.replace(" milimeters","mm.")
    s = s.replace(" mm.","mm.")
    s = s.replace(" mm ","mm. ")
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
    return sum(int(str2.find(word)>=0) for word in str1.split())

#improved common word matching
def str_common_word_2(str1, str2):
    words, cnt = str1.split(),0
    for word in words:
        if len(word)>1:
            if str2.find(word)>=0:
                cnt+=1
    return cnt

def str_whole_word(str1, str2, i_):
    cnt = 0
    while i_ < len(str2):
        i_ = str2.find(str1, i_)
        if i_ == -1:
            return cnt
        else:
            cnt += 1
            i_ += len(str1)
    return cnt

#calculating n_grams
def n_grams_match(x, query, text, n_grams):
    q = x[query]
    t = x[text]
    c = 0
    for i in range(len(query) - n_grams + 1):
        sq = q[i:i+n_grams]
        c += t.count(sq)
    return c / (len(text) + len(query))

def n_gram_features(df_all):
    print("Calculating n-gram features")
    for i in range(3, 6):
        print("Starting n-grams", i)
        #df_all['n_grams_clean_{0}'.format(i)] = df_all.apply(n_grams_match, axis=1, query = 'clean_term', text='clean_brand', n_grams = i)
        #df_all['n_grams_stemmed_{0}'.format(i)] = df_all.apply(n_grams_match, axis=1, query = 'search_term', text='brand', n_grams = i)
        df_all['n_grams_clean_title_{0}'.format(i)] = df_all.apply(n_grams_match, axis=1, query = 'clean_term', text='clean_title', n_grams = i)
        df_all['n_grams_stemmed_title_{0}'.format(i)] = df_all.apply(n_grams_match, axis=1, query = 'search_term', text='product_title', n_grams = i)
    print("n-grams completed.")

#cleaning the text from unwanted punctuations and unecessary numbers
def clean_text(d):
    no_punc = e_strip_punc.sub(" ", d)
    words_split = e_split_words.sub(r"\1 \2", no_punc)
    return words_split.lower()

def fmean_squared_error(ground_truth, predictions):
    fmean_squared_error_ = mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_

RMSE = make_scorer(fmean_squared_error, greater_is_better=False)

In [866]:
#merge testing and training datasets together
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all = pd.merge(df_all, df_brand, how='left', on='product_uid')

In [867]:
#CLEANING PROCESS 
df_all['clean_title'] = df_all['product_title'].map(lambda x:clean_text(x))
df_all['clean_term'] = df_all['search_term'].map(lambda x:clean_text(x))
#df_all['clean_brand'] = df_all['brand'].map(lambda x:clean_text(str(x)))

In [868]:
#STEMMING PROCESS
df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(str(x)))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(str(x)))
df_all['brand'] = df_all['brand'].map(lambda x:str_stemmer(str(x)))

In [869]:
#NGRAMS CALCULATION
n_gram_features(df_all)

Calculating n-gram features
Starting n-grams 3
Starting n-grams 4
Starting n-grams 5
n-grams completed.


In [870]:
#IF STEMMER CHANGES NEED TO BE REPRODUCED AGAIN!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#takes a while to process
#switch (False if no pickle exist, True if the pickle exist already)
file=True

if not file:
    print ("There is >>> NOT << a pickle file for this feature!")
    df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))
    
    #function to write out big features (feature that take long to process)
    import pickle as pickle
    feature_name = "product_description"
    pickle_file = feature_name + ".pickle"
    def pickle_feature(feature):
        try:
            f = open(pickle_file, 'wb')
            save = {
                feature_name: feature,
            }
            pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
            f.close()
        except Exception as e:
            print ("unable to save data to", pickle_file, ':', e)
            raise
    pickle_feature(df_all[feature_name])
    
else:
    file = open('product_description.pickle','rb')
    print ("There is a pickle file for this feature!")
    data = pickle.load(file)
    df_all['product_description'] = data['product_description']
    del data    

There is a pickle file for this feature!


In [871]:
#length of query as a feature
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_title'] = df_all['product_title'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_brand'] = df_all['brand'].map(lambda x:len(x.split())).astype(np.int64)
#df_all['len_of_description'] = df_all['product_description'].map(lambda x:len(x.split())).astype(np.int64)

df_all['len_of_query_in_chars'] = df_all['search_term'].map(lambda x:len(x)).astype(np.int64)
df_all['len_of_title_in_chars'] = df_all['product_title'].map(lambda x:len(x)).astype(np.int64)
df_all['len_of_brand_in_chars'] =  df_all['brand'].map(lambda x:len(x)).astype(np.int64)
#df_all['len_of_description_in_chars'] = df_all['product_description'].map(lambda x:len(x)).astype(np.int64)

In [872]:
#another feature with the combination of all the following columns (search term, product title, description)
df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title']+"\t"+df_all['product_description']+"\t"+df_all['brand']

#common words as features
df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word_2(x.split('\t')[0],x.split('\t')[1]))
df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word_2(x.split('\t')[0],x.split('\t')[2]))
df_all['word_in_brand'] = df_all['product_info'].map(lambda x:str_common_word_2(x.split('\t')[0],x.split('\t')[3]))


#these two features check exact whole str matches
df_all['query_in_title'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[1],0))
df_all['query_in_description'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[2],0))

#how much of the brand is in the title/description
#df_all['brand_in_query'] =  df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[3],x.split('\t')[0],0))
#df_all['brand_in_title'] =  df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[3],x.split('\t')[1],0))
#df_all['brand_in_description'] =  df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[3],x.split('\t')[2],0))


#these two features (how much of the search query is in title && how much of the SQ is in description)
df_all['ratio_title'] = df_all['word_in_title']/df_all['len_of_query']
df_all['ratio_description'] = df_all['word_in_description']/df_all['len_of_query']
df_all['ratio_brand'] = df_all['word_in_brand']/df_all['len_of_brand'] #ERROR (replace lob by loq)

#ratio in terms of chars
df_all['ratio_title_to_chars'] = df_all['len_of_title_in_chars'] / df_all['len_of_query_in_chars'] 
df_all['ratio_title_query_words'] = df_all['len_of_query']/df_all['len_of_title']

#actual length of features
df_all['actual_lenght_of_query'] = df_all['len_of_query'] / df_all['len_of_query_in_chars']
df_all['actual_lenght_of_title'] = df_all['len_of_title'] / df_all['len_of_title_in_chars']

In [873]:
#BRAND RELATED FEATURES

#df_all['attr'] = df_all['search_term']+"\t"+df_all['brand']
#df_all['word_in_brand'] = df_all['attr'].map(lambda x:str_common_word_2(x.split('\t')[0],x.split('\t')[1]))
#df_all['query_in_brand'] = df_all['attr'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[1],0))
#df_all['ratio_brand'] = df_all['word_in_brand']/df_all['len_of_brand']
#df_all['ratio_brand_query_chars'] = df_all['len_of_query_in_chars']/df_all['len_of_brand_in_chars']
df_brand = pd.unique(df_all.brand.ravel())
d={}
i = 1
for s in df_brand:
    d[s]=i
    i+=1
df_all['brand_feature'] = df_all['brand'].map(lambda x:d[x])

In [874]:
#Basically get rids of all the content (text) features...Can be a drawback since we lost all the context feature. 
df_all = df_all.drop(['brand','search_term','product_title','product_description','product_info','clean_term','clean_title'],axis=1)

In [875]:
#pd.options.display.max_colwidth = 20
pd.options.display.max_columns = 30
print (df_all[:5])

   id  product_uid  relevance  n_grams_clean_title_3  n_grams_stemmed_title_3  \
0   2       100001       3.00               0.142857                 0.083333   
1   3       100001       2.50               0.000000                 0.083333   
2   9       100002       3.00               0.238095                 0.250000   
3  16       100005       2.33               0.238095                 0.208333   
4  17       100005       2.67               0.380952                 0.375000   

   n_grams_clean_title_4  n_grams_stemmed_title_4  n_grams_clean_title_5  \
0               0.095238                 0.041667               0.047619   
1               0.000000                 0.000000               0.000000   
2               0.142857                 0.041667               0.047619   
3               0.142857                 0.166667               0.095238   
4               0.333333                 0.333333               0.285714   

   n_grams_stemmed_title_5  len_of_query  len_of_title  

In [876]:
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance'].values
#creates an array of only values (drops the headers and columns values provided)
X_train = df_train.drop(['id','relevance'],axis=1).values
X_test = df_test.drop(['id','relevance'],axis=1).values

In [877]:
#using random regressor
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0, criterion='mse')
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print (clf.score(X_train, y_train)**0.5)
#RMSE = mean_squared_error(X_test,y_pred)**5
#print (RMSE)

0.459128846888


In [878]:
pd.DataFrame({"id": id_test, "relevance":y_pred}).to_csv('submission_v15.csv', index=False)

In [879]:
'''
#using the classifier from the rf_mean_square_error / code online
rfr = RandomForestRegressor(n_jobs = -1)
clf = pipeline.Pipeline([('rfr', rfr)])
param_grid = {'rfr__n_estimators':[400], 'rfr__max_depth':[13]}
#param_grid = {}
#param_grid = {'rfr__n_estimators':list(range(34,50,1)), 'rfr__max_depth':list(range(13,15,1))}
model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, n_jobs = -1, cv = 10, verbose = 0, scoring=RMSE)
model.fit(X_train, y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

y_pred = model.predict(X_test)
print(len(y_pred))
'''

'\n#using the classifier from the rf_mean_square_error / code online\nrfr = RandomForestRegressor(n_jobs = -1)\nclf = pipeline.Pipeline([(\'rfr\', rfr)])\nparam_grid = {\'rfr__n_estimators\':[400], \'rfr__max_depth\':[13]}\n#param_grid = {}\n#param_grid = {\'rfr__n_estimators\':list(range(34,50,1)), \'rfr__max_depth\':list(range(13,15,1))}\nmodel = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, n_jobs = -1, cv = 10, verbose = 0, scoring=RMSE)\nmodel.fit(X_train, y_train)\n\nprint("Best parameters found by grid search:")\nprint(model.best_params_)\nprint("Best CV score:")\nprint(model.best_score_)\n\ny_pred = model.predict(X_test)\nprint(len(y_pred))\n'