# Using TensorFlow to Find the Relevance of Search Queries

## Initial procedures

In [1]:
import pandas as pd
import matplotlib as mpl
import numpy as np
import math
import os

In [2]:
folder = '/home/pbnjeff/Dropbox/KaggleHomeDepotData/'
cleaned_data_path = '/home/pbnjeff/Dropbox/KaggleHomeDepotData/combined_cleaned.csv'

train_path = folder + 'train.csv'
test_path = folder + 'test.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

if os.path.exists(cleaned_data_path):
    combined = pd.read_csv(cleaned_data_path)
else:
    combined = train.append(test)
    combined = combined.reset_index(drop=True)

In [3]:
attributes = pd.read_csv('/home/pbnjeff/Dropbox/KaggleHomeDepotData/attributes.csv')

## Feature engineering

In [5]:
def breakDownQueryNames(df):
    
    # Lowercase for everything to normalize
    df['query_terms'] = df['search_term'].str.lower()
    df['query_terms'] = df['search_term'].str.replace('-',' ')
    df['productname_terms'] = df['product_title'].str.lower()
    df['productname_terms'] = df['productname_terms'].replace('-',' ')
    df['query_terms'] = df['query_terms'].str.split(' ')
    df['productname_terms'] = df['productname_terms'].str.split(' ')
    
    return df

def removeLists(df):
    """
    Helper function to remove unnecessary columns for training models
    """
    
    return df.drop(['query_terms','productname_terms','material_terms'],axis=1)

def percentQueryInProductName(df):
    
    
    df['percentQueryInName'] = pd.Series()
    
    for i in range(len(df['query_terms'])):

        numQueryTerms = len(df['query_terms'][i])
        numNameTerms = len(df['productname_terms'][i])
        queryTermsInName = 0

        for j in range(numQueryTerms):

            if df['query_terms'][i][j] in df['productname_terms'][i]:

                queryTermsInName += 1

        df.loc[i,'percentQueryInName'] = float(queryTermsInName) / numNameTerms
        
        printCompleted(i, len(df['query_terms']))
    
    return df

def printCompleted(i, total):
    
    if i % 10000 == 0:
        
        print('{0}'.format(str(i) + '/' + str(total) + ' completed!'))

def hasOneMatch(df):
    
    df['hasMatch'] = pd.Series()
    
    for i in range(len(df['query_terms'])):
        
        numQueryTerms = len(df['query_terms'][i])
        queryTermInName = False

        for j in range(numQueryTerms):

            if df['query_terms'][i][j] in df['productname_terms'][i]:

                queryTermInName = True

        df.loc[i,'hasMatch'] = queryTermInName
        
        printCompleted(i, len(df['query_terms']))
    
    return df

def isNumber(string):
    
    try:
        float(string)
        return True
    except ValueError:
        return False

def queryProductHaveNumeric(df):
    """
    Humans might input the wrong number, but the intention
    was to specify a number. This is a measure of whether
    a human had the intention.
    """
    
    df['BothHaveNumbers'] = pd.Series()
    
    for i in range(len(df['query_terms'])):
        
        queryNumber = False
        productNumber = False

        for word in df['query_terms'][i]:

            if isNumber(str(word)):

                queryNumber = True

        for word in df['productname_terms'][i]:

            if isNumber(str(word)):

                productNumber = True

        if (queryNumber and productNumber):

            df['BothHaveNumbers'] = True

        else:

            df['BothHaveNumbers'] = False
            
        printCompleted(i, len(df['query_terms']))

    return df

def materialHasMatch(df):
    
    df['MaterialMatch'] = pd.Series()
    
    for i in range(len(df['query_terms'])):
        
        hasMatch = False
        
        for term in df['query_terms'][i]:
            
            if term in df['material_terms'][i]:

                hasMatch = True
        
        df['MaterialMatch'][i] = hasMatch
        
        printCompleted(i, len(df['query_terms']))
    
    return df

def percentMaterialMatched(df):
    
    df['percentMaterialMatched'] = pd.Series()

    for i in range(len(df['query_terms'])):
        
        numMatches = 0
        numMaterialTerms = len(df['material_terms'][i])

        for term in df['query_terms'][i]:

            if term in df['material_terms'][i]:

                numMatches += 1

        if numMatches > numMaterialTerms:
            numMatches = numMaterialTerms

        df['percentMaterialMatched'][i] = float(numMatches) / numMaterialTerms

        printCompleted(i, len(df['query_terms']))
    
    return df

def getMaterials(attributes, df):
    
    uid_materials = attributes.loc[attributes['name'] == 'Material'].drop('name', axis=1).reset_index(drop=True)
    
    df['material_terms'] = pd.Series()
    uid_materials['material_terms'] = uid_materials['value'].str.lower()
    uid_materials['materials'] = uid_materials['material_terms'].str.split(' ')
    uid_materials = uid_materials.drop('material_terms', axis=1)
    
    for i in range(len(df['product_uid'])):
        
        uid = df['product_uid'][i]
        mat_df = uid_materials[uid_materials['product_uid']==uid]['materials'].to_frame()
        
        try:
            material_terms = mat_df.iloc[0]['materials']
        except IndexError:
            material_terms = ['']
            
        df['material_terms'][i] = material_terms
        
        try:
            if math.isnan(df['material_terms'][i]):
                df['material_terms'][i] = ['']
        except:
            pass
        
        # TODO: Eliminate the parenthesis surrounding things like '(mdf)'
        
    return df

In [72]:
combined = breakDownQueryNames(combined)

In [73]:
combined = percentQueryInProductName(combined)

0/240760 completed!
10000/240760 completed!
20000/240760 completed!
30000/240760 completed!
40000/240760 completed!
50000/240760 completed!
60000/240760 completed!
70000/240760 completed!
80000/240760 completed!
90000/240760 completed!
100000/240760 completed!
110000/240760 completed!
120000/240760 completed!
130000/240760 completed!
140000/240760 completed!
150000/240760 completed!
160000/240760 completed!
170000/240760 completed!
180000/240760 completed!
190000/240760 completed!
200000/240760 completed!
210000/240760 completed!
220000/240760 completed!
230000/240760 completed!
240000/240760 completed!


In [74]:
combined = hasOneMatch(combined)

0/240760 completed!
10000/240760 completed!
20000/240760 completed!
30000/240760 completed!
40000/240760 completed!
50000/240760 completed!
60000/240760 completed!
70000/240760 completed!
80000/240760 completed!
90000/240760 completed!
100000/240760 completed!
110000/240760 completed!
120000/240760 completed!
130000/240760 completed!
140000/240760 completed!
150000/240760 completed!
160000/240760 completed!
170000/240760 completed!
180000/240760 completed!
190000/240760 completed!
200000/240760 completed!
210000/240760 completed!
220000/240760 completed!
230000/240760 completed!
240000/240760 completed!


In [75]:
combined = queryProductHaveNumeric(combined)

0/240760 completed!
10000/240760 completed!
20000/240760 completed!
30000/240760 completed!
40000/240760 completed!
50000/240760 completed!
60000/240760 completed!
70000/240760 completed!
80000/240760 completed!
90000/240760 completed!
100000/240760 completed!
110000/240760 completed!
120000/240760 completed!
130000/240760 completed!
140000/240760 completed!
150000/240760 completed!
160000/240760 completed!
170000/240760 completed!
180000/240760 completed!
190000/240760 completed!
200000/240760 completed!
210000/240760 completed!
220000/240760 completed!
230000/240760 completed!
240000/240760 completed!


In [76]:
combined = getMaterials(attributes, combined)

In [77]:
combined = materialHasMatch(combined)

0/240760 completed!
10000/240760 completed!
20000/240760 completed!
30000/240760 completed!
40000/240760 completed!
50000/240760 completed!
60000/240760 completed!
70000/240760 completed!
80000/240760 completed!
90000/240760 completed!
100000/240760 completed!
110000/240760 completed!
120000/240760 completed!
130000/240760 completed!
140000/240760 completed!
150000/240760 completed!
160000/240760 completed!
170000/240760 completed!
180000/240760 completed!
190000/240760 completed!
200000/240760 completed!
210000/240760 completed!
220000/240760 completed!
230000/240760 completed!
240000/240760 completed!


In [78]:
combined = percentMaterialMatched(combined)

0/240760 completed!
10000/240760 completed!
20000/240760 completed!
30000/240760 completed!
40000/240760 completed!
50000/240760 completed!
60000/240760 completed!
70000/240760 completed!
80000/240760 completed!
90000/240760 completed!
100000/240760 completed!
110000/240760 completed!
120000/240760 completed!
130000/240760 completed!
140000/240760 completed!
150000/240760 completed!
160000/240760 completed!
170000/240760 completed!
180000/240760 completed!
190000/240760 completed!
200000/240760 completed!
210000/240760 completed!
220000/240760 completed!
230000/240760 completed!
240000/240760 completed!


In [3]:
combined.head(1)

Unnamed: 0,id,product_title,product_uid,relevance,search_term,query_terms,productname_terms,percentQueryInName,hasMatch,BothHaveNumbers,MaterialMatch,percentMaterialMatched,material_terms
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3,angle bracket,"['angle', 'bracket']","['simpson', 'strong-tie', '12-gauge', 'angle']",0.25,True,True,0,0,"['galvanized', 'steel']"


In [79]:
combined.to_csv(cleaned_data_path, index=False)

In [7]:
combined = removeLists(combined)

In [8]:
combined

Unnamed: 0,id,product_title,product_uid,relevance,search_term,percentQueryInName,hasMatch,BothHaveNumbers,MaterialMatch,percentMaterialMatched
0.0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.00,angle bracket,0.250000,True,True,0,0
1.0,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.50,l bracket,0.000000,False,True,0,0
2.0,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.00,deck over,0.000000,False,True,0,0
3.0,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,0.076923,True,True,0,0
4.0,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,0.230769,True,True,0,0
5.0,18,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.00,convection otr,0.066667,True,True,0,0
6.0,20,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,2.67,microwave over stove,0.133333,True,True,0,0
7.0,21,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.00,microwaves,0.000000,False,True,0,0
8.0,23,Lithonia Lighting Quantum 2-Light Black LED Em...,100007,2.67,emergency light,0.111111,True,True,0,0
9.0,27,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,100009,3.00,mdf 3/4,0.142857,True,True,0,0
