## Initial procedures

In [1]:
import pandas as pd
import matplotlib as mpl
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

In [17]:
folder = 'c:/users/jeff/dropbox/kaggle_home_depot/'
train_path = folder + 'train.csv'
test_path = folder + 'test.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

combined = train.append(test)
combined = combined.reset_index().drop('index', axis=1)

## Data cleaning & engineering

In [3]:
# I have some rows that are just NaNs

def cleanNans(attributes):

    return attributes[~attributes['product_uid'].isnull() |
                      ~attributes['name'].isnull() |
                      ~attributes['value'].isnull()]

In [4]:
def standardizeBullets(attributes):
    
    attributes.loc[attributes['name'].str.contains('Bullet'),
                  'name']['name'] = 'Bullet'
    return attributes

In [5]:
def get_uids(df, name_value):
    
    product_uids = df[df['name'] == name_value]['product_uid'].unique()
    
    return product_uids

In [6]:
def get_bulletpointwords(attributes, product_uid):
    
    bullets = attributes[(attributes['name'] == 'Bullet') &
                        (attributes['product_uid'] == product_uid)]
    
    bullet_words = []
    
    for index,row in bullets.iterrows():
        bullet_point_words = row['value'].split(' ')
        for word in bullet_point_words:
            if word not in bullet_words:
                bullet_words.append(word)
    
    return bullet_words

In [8]:
def getBrands(df):

    product_uids = df['product_uid'].unique()
    
    return brand_list

In [31]:
def breakDownQueryNames(df):
    
    # Lowercase for everything to normalize
    df['query_terms'] = df['search_term'].str.lower()
    df['productname_terms'] = df['product_title'].str.lower()
    df['query_terms'] = df['query_terms'].str.split(' ')
    df['productname_terms'] = df['productname_terms'].str.split(' ')
    
    return df

In [21]:
def removeLists(df):
    
    return df.drop(['query_terms','productname_terms'],axis=1)

In [22]:
def percentQueryInProductName(df):
    
    df = breakDownQueryNames(df)
    
    df['percentQueryInName'] = pd.Series()
    
    for i in range(len(df['query_terms'])):

        numQueryTerms = len(df['query_terms'][i])
        numNameTerms = len(df['productname_terms'][i])
        queryTermsInName = 0

        for j in range(numQueryTerms):

            if df['query_terms'][i][j] in df['productname_terms'][i]:

                queryTermsInName += 1

        df.loc[i,'percentQueryInName'] = float(queryTermsInName) / numNameTerms
        
    df = removeLists(df)
    
    return df

In [23]:
def hasOneMatch(df):
    
    df = breakDownQueryNames(df)
    
    df['hasMatch'] = pd.Series()
    
    for i in range(len(df['query_terms'])):
        
        numQueryTerms = len(df['query_terms'][i])
        queryTermInName = False

        for j in range(numQueryTerms):

            if df['query_terms'][i][j] in df['productname_terms'][i]:

                queryTermInName = True

        df.loc[i,'hasMatch'] = queryTermInName
    
    df = removeLists(df)
    
    return df

In [24]:
def isNumber(string):
    
    try:
        float(string)
        return True
    except ValueError:
        return False

def queryProductHaveNumeric(df):
    
    df['BothHaveNumbers'] = pd.Series()
    df = breakDownQueryNames(df)
    
    for i in range(len(df['query_terms'])):
    
        queryNumber = False
        productNumber = False

        for word in df['query_terms'][i]:

            if isNumber(str(word)):

                queryNumber = True

        for word in df['productname_terms'][i]:

            if isNumber(str(word)):

                productNumber = True

        if (queryNumber and productNumber):

            df['BothHaveNumbers'] = True

        else:

            df['BothHaveNumbers'] = False
            
        df = removeLists(df)
    
    return df

In [15]:
def dropUnnecessaryData(df):
    
    df = df.drop(['product_uid','product_title','search_term'], axis=1)
    
    return df

## First try: simple features, Random Forests with regression

I don't want to start with anything too complex, so I'll keep the scripts very simple. For this first try, I will employ a simple regression model. The reason I will use that sort of model is because the training set contains a "Relevance" parameter that is continuous in [1,3]. The goal here is to predict what a human would write down, and that set is

$$G={1,2,3}$$

After I make my set of predictions with Random Forests, I will round to the nearest integer in the set **G** to obtain the prediction as per the specifications of the problem.

In [25]:
combined = percentQueryInProductName(combined)

In [28]:
combined = hasOneMatch(combined)

In [29]:
combined

Unnamed: 0,id,product_title,product_uid,relevance,search_term,percentQueryInName,hasMatch
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.00,angle bracket,0.000000,False
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.50,l bracket,0.000000,False
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.00,deck over,0.000000,False
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,0.000000,False
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,0.000000,False
5,18,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.00,convection otr,0.000000,False
6,20,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,2.67,microwave over stove,0.000000,False
7,21,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.00,microwaves,0.000000,False
8,23,Lithonia Lighting Quantum 2-Light Black LED Em...,100007,2.67,emergency light,0.000000,False
9,27,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,100009,3.00,mdf 3/4,0.071429,True
