In [46]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.linear_model import LinearRegression
from nltk.stem.snowball import SnowballStemmer

In [47]:
# Single shared English Snowball stemmer; str_stemmer reads it as a global.
stemmer = SnowballStemmer(language='english')

In [48]:
# Raw input tables. train/test are read as latin-1; the other two use the
# pandas default encoding.
df_train = pd.read_csv('train.csv', encoding="latin-1")
df_test = pd.read_csv('test.csv', encoding="latin-1")
df_attr = pd.read_csv('attributes.csv')
df_pro_desc = pd.read_csv('product_descriptions.csv')

# Keep only the "MFG Brand Name" attribute rows and expose the value as `brand`.
is_brand_row = df_attr["name"] == "MFG Brand Name"
df_brand = df_attr.loc[is_brand_row, ["product_uid", "value"]].rename(columns={"value": "brand"})

# Row count of the training set, used later to split the combined frame back.
num_train = len(df_train)

In [49]:

def str_stemmer(s):
    """Lower-case ``s``, stem each whitespace-separated token, re-join with spaces.

    Anything that is not a ``str`` (e.g. a NaN from a left merge) is mapped to
    the literal placeholder ``"null"``. Uses the module-level ``stemmer``.
    """
    if not isinstance(s, str):
        return "null"
    stemmed_tokens = (stemmer.stem(token) for token in s.lower().split())
    return " ".join(stemmed_tokens)

def str_common_word(str1, str2):
    """Count the whitespace-separated tokens of ``str1`` that occur in ``str2``.

    Matching is by substring: a token counts if it appears anywhere inside
    ``str2``. A token repeated in ``str1`` is counted once per repetition.
    """
    hits = 0
    for token in str1.split():
        if token in str2:
            hits += 1
    return hits

def str_whole_word(str1, str2, i_):
    """Count non-overlapping occurrences of ``str1`` in ``str2`` from index ``i_``.

    Despite the name, matching is plain substring matching; no word
    boundaries are checked.

    Parameters
    ----------
    str1 : str
        The text to look for (here: the whole stemmed query).
    str2 : str
        The text to search in.
    i_ : int
        Start offset into ``str2`` (same semantics as the ``start`` argument
        of ``str.find`` / ``str.count``).

    Returns
    -------
    int
        Number of non-overlapping matches; 0 when ``str1`` is empty.
    """
    # An empty needle matches at every position without advancing the cursor,
    # which made the original scan loop spin forever. Treat it as "no match".
    if not str1:
        return 0
    # str.count scans left to right and counts non-overlapping matches,
    # which is what the manual find/advance loop computed.
    return str2.count(str1, i_)


In [50]:
# Stack train and test so every feature is computed once for both sets.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
# Left joins keep every query row; rows without a matching description or
# brand get NaN in the joined column.
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all = pd.merge(df_all, df_brand, how='left', on='product_uid')

# A duplicated product_uid on the right side of either merge would silently
# add rows, and the later positional split by num_train would then mix
# train and test rows. Fail fast instead.
assert len(df_all) == len(df_train) + len(df_test), \
    "merge changed the row count: duplicate product_uid in descriptions or brands"


In [51]:
# In the output below, the relevance column is NaN for the rows that come from df_test.
df_all.iloc[200000:]

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,brand
200000,188423,Hampton Bay 36x34.5x24 in. Cambria Sink Base C...,180429,,36x94,The furniture-quality finish on the Hampton Ba...,Hampton Bay
200001,188424,Hampton Bay 36x34.5x24 in. Cambria Sink Base C...,180429,,cambria java refridgerator,The furniture-quality finish on the Hampton Ba...,Hampton Bay
200002,188425,Glacier Bay 24 in. x 1-1/2 in. Concealed Peene...,180430,,glacier bar sin,The Glacier Bay 24 in. x 1-1/2 in. Concealed P...,Glacier Bay
200003,188426,Arizona&#39;s Best 5 lb. Tree and Shrub Food D...,180431,,eb stone tree food,Tree and Shrub Food is packaged in a convenien...,Arizona's Best
200004,188430,Veradek 22 in. x 17.5 in. Stone Grey Venetian ...,180434,,small flower pot stones,Featuring a traditional Mediterranean Urn desi...,Veradek
200005,188432,Arlington Industries 1 in. 90-Degree Non-Metal...,180436,,liquid itght non metalic,The ARLINGTON INDUSTRIES INC 1 in. 90-Degree N...,Arlington Industries
200006,188433,Arlington Industries 1 in. 90-Degree Non-Metal...,180436,,liquid tight romex connector,The ARLINGTON INDUSTRIES INC 1 in. 90-Degree N...,Arlington Industries
200007,188435,Westinghouse 1-Light Black Exterior Wall Lante...,180438,,black exterior chandelier,This Westinghouse Exterior Wall Lantern featur...,Westinghouse
200008,188438,Filament Design Johnson 4-Light Rustic Iron In...,180441,,iron bath vanity light,The Johnson Collection is well known for quali...,Filament Design
200009,188439,Master Flow 13 in. x 18 in. Fixed Chimney Cap,180442,,flange mount chimney cap,Master Flow Chimney caps provide protection ag...,Master Flow


In [52]:
# Column names of the combined frame before feature engineering.
df_all.columns.tolist()

['id',
 'product_title',
 'product_uid',
 'relevance',
 'search_term',
 'product_description',
 'brand']

## Features

In [53]:
# Overwrite the raw query with its stemmed, lower-cased form.
df_all['search_term'] = df_all['search_term'].map(str_stemmer)

In [54]:
# Overwrite the raw title with its stemmed, lower-cased form.
df_all['product_title'] = df_all['product_title'].map(str_stemmer)

In [55]:
# Overwrite the raw description with its stemmed, lower-cased form.
df_all['product_description'] = df_all['product_description'].map(str_stemmer)

In [56]:
# Stem the brand; non-string values (e.g. NaN) become the placeholder "null".
df_all['brand'] = df_all['brand'].map(str_stemmer)

In [57]:
# Number of whitespace-separated tokens in the stemmed query.
df_all['len_of_query'] = df_all['search_term'].str.split().str.len().astype(np.int64)

In [58]:
# Number of whitespace-separated tokens in the stemmed title.
df_all['len_of_title'] = df_all['product_title'].str.split().str.len().astype(np.int64)

In [59]:
# Number of whitespace-separated tokens in the stemmed description.
df_all['len_of_description'] = df_all['product_description'].str.split().str.len().astype(np.int64)

In [60]:
# Number of whitespace-separated tokens in the stemmed brand.
df_all['len_of_brand'] = df_all['brand'].str.split().str.len().astype(np.int64)

In [61]:
# Helper column: query, title and description joined by tabs so later cells
# can derive features from a single value per row.
df_all['product_info'] = df_all['search_term'].str.cat(
    [df_all['product_title'], df_all['product_description']], sep="\t"
)

In [62]:
# How many query tokens occur (as substrings) in the title.
# product_info fields: [0] query, [1] title, [2] description.
df_all['word_in_title'] = [
    str_common_word(fields[0], fields[1])
    for fields in df_all['product_info'].str.split('\t')
]

In [63]:
# How many query tokens occur (as substrings) in the description.
# product_info fields: [0] query, [1] title, [2] description.
df_all['word_in_description'] = [
    str_common_word(fields[0], fields[2])
    for fields in df_all['product_info'].str.split('\t')
]

In [64]:
# How many times the full query string occurs in the title.
df_all['query_in_title'] = df_all['product_info'].str.split('\t').map(
    lambda fields: str_whole_word(fields[0], fields[1], 0)
)

In [65]:
# How many times the full query string occurs in the description.
df_all['query_in_description'] = df_all['product_info'].str.split('\t').map(
    lambda fields: str_whole_word(fields[0], fields[2], 0)
)

In [66]:
# 1 if the last space-separated word of the query occurs in the title, else 0.
df_all['query_last_word_in_title'] = [
    str_common_word(fields[0].rsplit(" ", 1)[-1], fields[1])
    for fields in df_all['product_info'].str.split('\t')
]

In [67]:
# 1 if the last space-separated word of the query occurs in the description, else 0.
df_all['query_last_word_in_description'] = [
    str_common_word(fields[0].rsplit(" ", 1)[-1], fields[2])
    for fields in df_all['product_info'].str.split('\t')
]

In [68]:
# Share of query tokens found in the title.
df_all['ratio_title'] = df_all['word_in_title'].div(df_all['len_of_query'])

In [69]:
# Share of query tokens found in the description.
df_all['ratio_description'] = df_all['word_in_description'].div(df_all['len_of_query'])

In [70]:
# Helper column: query and brand joined by a tab.
df_all['attr'] = df_all['search_term'].str.cat(df_all['brand'], sep="\t")

In [71]:
# How many query tokens occur (as substrings) in the brand.
# attr fields: [0] query, [1] brand.
df_all['word_in_brand'] = df_all['attr'].str.split('\t').map(
    lambda fields: str_common_word(fields[0], fields[1])
)

In [72]:
# Matched query tokens relative to the number of brand tokens.
df_all['ratio_brand'] = df_all['word_in_brand'].div(df_all['len_of_brand'])

In [73]:
# Distinct stemmed brand values, in order of first appearance.
# Series.unique() replaces pd.unique(Series.ravel()): Series.ravel is
# deprecated in recent pandas and the result is the same array.
# NOTE(review): this rebinds df_brand from the brand DataFrame to an array,
# and nothing in the visible notebook reads it afterwards.
df_brand = df_all['brand'].unique()

In [74]:
# Inspect the engineered features for rows from position 200000 onward.
df_all.iloc[200000:]

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,brand,len_of_query,len_of_title,len_of_description,...,word_in_description,query_in_title,query_in_description,query_last_word_in_title,query_last_word_in_description,ratio_title,ratio_description,attr,word_in_brand,ratio_brand
200000,188423,hampton bay 36x34.5x24 in. cambria sink base c...,180429,,36x94,the furniture-qu finish on the hampton bay bas...,hampton bay,1,10,100,...,0,0,0,0,0,0.000000,0.000000,36x94\thampton bay,0,0.000000
200001,188424,hampton bay 36x34.5x24 in. cambria sink base c...,180429,,cambria java refridger,the furniture-qu finish on the hampton bay bas...,hampton bay,3,10,100,...,0,0,0,0,0,0.666667,0.000000,cambria java refridger\thampton bay,0,0.000000
200002,188425,glacier bay 24 in. x 1-1/2 in. conceal peen gr...,180430,,glacier bar sin,the glacier bay 24 in. x 1-1/2 in. conceal pee...,glacier bay,3,15,109,...,2,0,0,0,0,0.666667,0.666667,glacier bar sin\tglacier bay,1,0.500000
200003,188426,arizona&#39; best 5 lb. tree and shrub food dr...,180431,,eb stone tree food,tree and shrub food is packag in a conveni 5 l...,arizona best,4,10,97,...,2,0,0,1,1,0.500000,0.500000,eb stone tree food\tarizona best,0,0.000000
200004,188430,veradek 22 in. x 17.5 in. stone grey venetian ...,180434,,small flower pot stone,"featur a tradit mediterranean urn design, the ...",veradek,4,12,218,...,1,0,0,1,1,0.250000,0.250000,small flower pot stone\tveradek,0,0.000000
200005,188432,arlington industri 1 in. 90-degre non-metal li...,180436,,liquid itght non metal,the arlington industri inc 1 in. 90-degre non-...,arlington industri,4,9,48,...,3,0,0,1,1,0.750000,0.750000,liquid itght non metal\tarlington industri,0,0.000000
200006,188433,arlington industri 1 in. 90-degre non-metal li...,180436,,liquid tight romex connector,the arlington industri inc 1 in. 90-degre non-...,arlington industri,4,9,48,...,3,0,0,1,1,0.750000,0.750000,liquid tight romex connector\tarlington industri,0,0.000000
200007,188435,westinghous 1-light black exterior wall lanter...,180438,,black exterior chandeli,this westinghous exterior wall lantern featur ...,westinghous,3,15,210,...,2,0,0,0,0,0.666667,0.666667,black exterior chandeli\twestinghous,0,0.000000
200008,188438,filament design johnson 4-light rustic iron in...,180441,,iron bath vaniti light,"the johnson collect is well known for quality,...",filament design,4,10,82,...,3,0,0,1,1,1.000000,0.750000,iron bath vaniti light\tfilament design,0,0.000000
200009,188439,master flow 13 in. x 18 in. fix chimney cap,180442,,flang mount chimney cap,master flow chimney cap provid protect against...,master flow,4,10,97,...,2,0,0,1,1,0.500000,0.500000,flang mount chimney cap\tmaster flow,0,0.000000


In [75]:
# Column names after feature engineering.
df_all.columns.tolist()

['id',
 'product_title',
 'product_uid',
 'relevance',
 'search_term',
 'product_description',
 'brand',
 'len_of_query',
 'len_of_title',
 'len_of_description',
 'len_of_brand',
 'product_info',
 'word_in_title',
 'word_in_description',
 'query_in_title',
 'query_in_description',
 'query_last_word_in_title',
 'query_last_word_in_description',
 'ratio_title',
 'ratio_description',
 'attr',
 'word_in_brand',
 'ratio_brand']

In [76]:
# Text and helper columns are not fed to the regressor; keep id, relevance
# and the remaining columns.
text_columns = ['search_term', 'product_title', 'product_description', 'product_info', 'attr', 'brand']
df_all = df_all.drop(columns=text_columns)

In [77]:
# Undo the earlier concat positionally: the first num_train rows are the
# training set, the rest the test set.
df_train, df_test = df_all.iloc[:num_train], df_all.iloc[num_train:]
# Test ids are kept for the submission file.
id_test = df_test['id']

In [78]:
# Neither the row id nor the target may be used as a feature.
non_feature_columns = ['id', 'relevance']

y_train = df_train['relevance'].to_numpy()
X_train = df_train.drop(columns=non_feature_columns).to_numpy()
X_test = df_test.drop(columns=non_feature_columns).to_numpy()

In [79]:
# Ordinary least-squares regressor with default settings.
# NOTE(review): the variable is called `rf` although it holds a LinearRegression.
rf = LinearRegression()

In [80]:
# Fit on the training features; fit() is the cell's last expression, so its return value is displayed.
rf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [81]:
# Predicted relevance for every test row, in the same order as X_test / id_test.
y_pred = rf.predict(X_test)

In [82]:
# Two-column submission file (id, relevance), written without the index.
submission = pd.DataFrame({"id": id_test, "relevance": y_pred})
submission.to_csv('linearRegressionSubmission.csv', index=False)
                                                          