## Initial procedures

In [1]:
import pandas as pd
import matplotlib as mpl
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation

ImportError: No module named sklearn.ensemble

In [None]:
folder = 'c:/users/jeff/dropbox/kagglehomedepotdata/'
train_path = folder + 'train.csv'
test_path = folder + 'test.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

combined = train.append(test)
combined = combined.reset_index().drop('index', axis=1)

## Data cleaning & engineering

Here's a bunch of functions that I use to clean up the data and engineer new features

In [None]:
# I have some rows that are just NaNs

def cleanNans(attributes):

    return attributes[~attributes['product_uid'].isnull() |
                      ~attributes['name'].isnull() |
                      ~attributes['value'].isnull()]

def standardizeBullets(attributes):
    
    attributes.loc[attributes['name'].str.contains('Bullet'),
                  'name']['name'] = 'Bullet'
    return attributes

def get_uids(df, name_value):
    
    product_uids = df[df['name'] == name_value]['product_uid'].unique()
    
    return product_uids

def get_bulletpointwords(attributes, product_uid):
    
    bullets = attributes[(attributes['name'] == 'Bullet') &
                        (attributes['product_uid'] == product_uid)]
    
    bullet_words = []
    
    for index,row in bullets.iterrows():
        bullet_point_words = row['value'].split(' ')
        for word in bullet_point_words:
            if word not in bullet_words:
                bullet_words.append(word)
    
    return bullet_words

def getBrands(df):

    product_uids = df['product_uid'].unique()
    
    return brand_list

def breakDownQueryNames(df):
    
    # Lowercase for everything to normalize
    df['query_terms'] = df['search_term'].str.lower()
    df['productname_terms'] = df['product_title'].str.lower()
    df['query_terms'] = df['query_terms'].str.split(' ')
    df['productname_terms'] = df['productname_terms'].str.split(' ')
    
    return df

def removeLists(df):
    
    return df.drop(['query_terms','productname_terms'],axis=1)

def percentQueryInProductName(df):
    
    df = breakDownQueryNames(df)
    
    df['percentQueryInName'] = pd.Series()
    
    for i in range(len(df['query_terms'])):

        numQueryTerms = len(df['query_terms'][i])
        numNameTerms = len(df['productname_terms'][i])
        queryTermsInName = 0

        for j in range(numQueryTerms):

            if df['query_terms'][i][j] in df['productname_terms'][i]:

                queryTermsInName += 1

        df.loc[i,'percentQueryInName'] = float(queryTermsInName) / numNameTerms
        
        printCompleted(i, len(df['query_terms']))
        
    df = removeLists(df)
    
    return df

def printCompleted(i, total):
    
    if i % 10000 == 0:
        
        print('{0}'.format(str(i) + '/' + str(total) + ' completed!'))

def hasOneMatch(df):
    
    df = breakDownQueryNames(df)
    
    df['hasMatch'] = pd.Series()
    
    for i in range(len(df['query_terms'])):
        
        numQueryTerms = len(df['query_terms'][i])
        queryTermInName = False

        for j in range(numQueryTerms):

            if df['query_terms'][i][j] in df['productname_terms'][i]:

                queryTermInName = True

        df.loc[i,'hasMatch'] = queryTermInName
        
        printCompleted(i, len(df['query_terms']))
    
    df = removeLists(df)
    
    return df

def isNumber(string):
    
    try:
        float(string)
        return True
    except ValueError:
        return False

def queryProductHaveNumeric(df):
    
    df['BothHaveNumbers'] = pd.Series()
    
    df = breakDownQueryNames(df)
    
    for i in range(len(df['query_terms'])):
        
        queryNumber = False
        productNumber = False

        for word in df['query_terms'][i]:

            if isNumber(str(word)):

                queryNumber = True

        for word in df['productname_terms'][i]:

            if isNumber(str(word)):

                productNumber = True

        if (queryNumber and productNumber):

            df['BothHaveNumbers'] = True

        else:

            df['BothHaveNumbers'] = False
            
        printCompleted(i, len(df['query_terms']))

    df = removeLists(df)
    
    return df

def dropUnnecessaryData(df):
    
    df = df.drop(['product_uid','product_title','search_term'], axis=1)
    
    return df

def materialMatches(df):
    
    return df

## First try: simple features, Random Forests with regression

I don't want to start with anything too complex, so I'll keep the scripts very simple. For this first try, I will employ a simple regression model. The reason I will use that sort of model is because the training set contains a "Relevance" parameter that is continuous in [1,3]. The goal here is to predict what a human would write down, and that set is

$$G={1,2,3}$$

After I make my set of predictions with Random Forests, I will round to the nearest integer in the set **G** to obtain the prediction as per the specifications of the problem.

In [11]:
combined = percentQueryInProductName(combined)

0/240760 completed!
10000/240760 completed!
20000/240760 completed!
30000/240760 completed!
40000/240760 completed!
50000/240760 completed!
60000/240760 completed!
70000/240760 completed!
80000/240760 completed!
90000/240760 completed!
100000/240760 completed!
110000/240760 completed!
120000/240760 completed!
130000/240760 completed!
140000/240760 completed!
150000/240760 completed!
160000/240760 completed!
170000/240760 completed!
180000/240760 completed!
190000/240760 completed!
200000/240760 completed!
210000/240760 completed!
220000/240760 completed!
230000/240760 completed!
240000/240760 completed!


In [12]:
combined = hasOneMatch(combined)

0/240760 completed!
10000/240760 completed!
20000/240760 completed!
30000/240760 completed!
40000/240760 completed!
50000/240760 completed!
60000/240760 completed!
70000/240760 completed!
80000/240760 completed!
90000/240760 completed!
100000/240760 completed!
110000/240760 completed!
120000/240760 completed!
130000/240760 completed!
140000/240760 completed!
150000/240760 completed!
160000/240760 completed!
170000/240760 completed!
180000/240760 completed!
190000/240760 completed!
200000/240760 completed!
210000/240760 completed!
220000/240760 completed!
230000/240760 completed!
240000/240760 completed!


In [13]:
combined = queryProductHaveNumeric(combined)

0/240760 completed!
10000/240760 completed!
20000/240760 completed!
30000/240760 completed!
40000/240760 completed!
50000/240760 completed!
60000/240760 completed!
70000/240760 completed!
80000/240760 completed!
90000/240760 completed!
100000/240760 completed!
110000/240760 completed!
120000/240760 completed!
130000/240760 completed!
140000/240760 completed!
150000/240760 completed!
160000/240760 completed!
170000/240760 completed!
180000/240760 completed!
190000/240760 completed!
200000/240760 completed!
210000/240760 completed!
220000/240760 completed!
230000/240760 completed!
240000/240760 completed!


In [46]:
combined = materialMatches(combined)

Unnamed: 0,id,product_title,product_uid,relevance,search_term,percentQueryInName,hasMatch,BothHaveNumbers
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.00,angle bracket,0.250000,True,True
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.50,l bracket,0.000000,False,True
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.00,deck over,0.000000,False,True
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,0.076923,True,True
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,0.230769,True,True
5,18,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.00,convection otr,0.066667,True,True
6,20,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,2.67,microwave over stove,0.133333,True,True
7,21,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.00,microwaves,0.000000,False,True
8,23,Lithonia Lighting Quantum 2-Light Black LED Em...,100007,2.67,emergency light,0.111111,True,True
9,27,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,100009,3.00,mdf 3/4,0.142857,True,True


In [32]:
train_cleaned = combined[~combined['relevance'].isnull()]
test_cleaned = combined[combined['relevance'].isnull()]

In [45]:
train_X = train_cleaned.drop(['id','product_title','product_uid','relevance','search_term'], axis=1)
train_y = train_cleaned.drop(['id','product_title','product_uid','search_term','percentQueryInName','hasMatch'], 
                            axis = 1)

In [48]:
rf = RandomForestRegressor(n_estimators = 10, min_samples_split = 1, random_state = 0)

In [50]:
rf.fit(train_X, train_y['relevance'].ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=1, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [57]:
test_X = test_cleaned.drop(['id','product_uid','product_title','relevance','search_term'], axis = 1)

In [59]:
predictions = rf.predict(test_X)

In [65]:
predictions = np.rint(predictions)

In [66]:
predictions

array([ 2.,  2.,  2., ...,  2.,  2.,  3.])

In [67]:
submission = pd.DataFrame()

submission['id'] = test_cleaned['id']

submission['relevance'] = predictions

out_path = 'c:/users/jeff/dropbox/kaggle_home_depot/submission.csv'
submission.to_csv(out_path, index = False)