## Initial procedures

In [63]:
import pandas as pd
import matplotlib as mpl
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation

In [2]:
folder = 'c:/users/jeff/dropbox/kaggle_home_depot_data/'
train_path = folder + 'train.csv'
test_path = folder + 'test.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

combined = train.append(test)
combined = combined.reset_index().drop('index', axis=1)

## Data cleaning & engineering

In [5]:
# I have some rows that are just NaNs

def cleanNans(attributes):

    return attributes[~attributes['product_uid'].isnull() |
                      ~attributes['name'].isnull() |
                      ~attributes['value'].isnull()]

In [6]:
def standardizeBullets(attributes):
    
    attributes.loc[attributes['name'].str.contains('Bullet'),
                  'name']['name'] = 'Bullet'
    return attributes

In [7]:
def get_uids(df, name_value):
    
    product_uids = df[df['name'] == name_value]['product_uid'].unique()
    
    return product_uids

In [8]:
def get_bulletpointwords(attributes, product_uid):
    
    bullets = attributes[(attributes['name'] == 'Bullet') &
                        (attributes['product_uid'] == product_uid)]
    
    bullet_words = []
    
    for index,row in bullets.iterrows():
        bullet_point_words = row['value'].split(' ')
        for word in bullet_point_words:
            if word not in bullet_words:
                bullet_words.append(word)
    
    return bullet_words

In [9]:
def getBrands(df):

    product_uids = df['product_uid'].unique()
    
    return brand_list

In [10]:
def breakDownQueryNames(df):
    
    # Lowercase for everything to normalize
    df['query_terms'] = df['search_term'].str.lower()
    df['productname_terms'] = df['product_title'].str.lower()
    df['query_terms'] = df['query_terms'].str.split(' ')
    df['productname_terms'] = df['productname_terms'].str.split(' ')
    
    return df

In [11]:
def removeLists(df):
    
    return df.drop(['query_terms','productname_terms'],axis=1)

In [12]:
def percentQueryInProductName(df):
    
    df = breakDownQueryNames(df)
    
    df['percentQueryInName'] = pd.Series()
    
    for i in range(len(df['query_terms'])):

        numQueryTerms = len(df['query_terms'][i])
        numNameTerms = len(df['productname_terms'][i])
        queryTermsInName = 0

        for j in range(numQueryTerms):

            if df['query_terms'][i][j] in df['productname_terms'][i]:

                queryTermsInName += 1

        df.loc[i,'percentQueryInName'] = float(queryTermsInName) / numNameTerms
        
        printCompleted(i, len(df['query_terms']))
        
    df = removeLists(df)
    
    return df

In [27]:
def printCompleted(i, total):
    
    if i % 5000 == 0:
        
        print('{0}'.format(str(i) + '/' + str(total) + ' completed!'))

In [28]:
def hasOneMatch(df):
    
    df = breakDownQueryNames(df)
    
    df['hasMatch'] = pd.Series()
    
    for i in range(len(df['query_terms'])):
        
        numQueryTerms = len(df['query_terms'][i])
        queryTermInName = False

        for j in range(numQueryTerms):

            if df['query_terms'][i][j] in df['productname_terms'][i]:

                queryTermInName = True

        df.loc[i,'hasMatch'] = queryTermInName
        
        printCompleted(i, len(df['query_terms']))
    
    df = removeLists(df)
    
    return df

In [15]:
def isNumber(string):
    
    try:
        float(string)
        return True
    except ValueError:
        return False

def queryProductHaveNumeric(df):
    
    df['BothHaveNumbers'] = pd.Series()
    df = breakDownQueryNames(df)
    
    for i in range(len(df['query_terms'])):
    
        queryNumber = False
        productNumber = False

        for word in df['query_terms'][i]:

            if isNumber(str(word)):

                queryNumber = True

        for word in df['productname_terms'][i]:

            if isNumber(str(word)):

                productNumber = True

        if (queryNumber and productNumber):

            df['BothHaveNumbers'] = True

        else:

            df['BothHaveNumbers'] = False
            
        df = removeLists(df)
    
    return df

In [16]:
def dropUnnecessaryData(df):
    
    df = df.drop(['product_uid','product_title','search_term'], axis=1)
    
    return df

## First try: simple features, Random Forests with regression

I don't want to start with anything too complex, so I'll keep the scripts very simple. For this first try, I will employ a simple regression model. The reason I will use that sort of model is because the training set contains a "Relevance" parameter that is continuous in [1,3]. The goal here is to predict what a human would write down, and that set is

$$G={1,2,3}$$

After I make my set of predictions with Random Forests, I will round to the nearest integer in the set **G** to obtain the prediction as per the specifications of the problem.

In [21]:
combined = percentQueryInProductName(combined)

0/240760 completed!
1000/240760 completed!
2000/240760 completed!
3000/240760 completed!
4000/240760 completed!
5000/240760 completed!
6000/240760 completed!
7000/240760 completed!
8000/240760 completed!
9000/240760 completed!
10000/240760 completed!
11000/240760 completed!
12000/240760 completed!
13000/240760 completed!
14000/240760 completed!
15000/240760 completed!
16000/240760 completed!
17000/240760 completed!
18000/240760 completed!
19000/240760 completed!
20000/240760 completed!
21000/240760 completed!
22000/240760 completed!
23000/240760 completed!
24000/240760 completed!
25000/240760 completed!
26000/240760 completed!
27000/240760 completed!
28000/240760 completed!
29000/240760 completed!
30000/240760 completed!
31000/240760 completed!
32000/240760 completed!
33000/240760 completed!
34000/240760 completed!
35000/240760 completed!
36000/240760 completed!
37000/240760 completed!
38000/240760 completed!
39000/240760 completed!
40000/240760 completed!
41000/240760 completed!
42000

In [29]:
combined = hasOneMatch(combined)

0/240760 completed!
5000/240760 completed!
10000/240760 completed!
15000/240760 completed!
20000/240760 completed!
25000/240760 completed!
30000/240760 completed!
35000/240760 completed!
40000/240760 completed!
45000/240760 completed!
50000/240760 completed!
55000/240760 completed!
60000/240760 completed!
65000/240760 completed!
70000/240760 completed!
75000/240760 completed!
80000/240760 completed!
85000/240760 completed!
90000/240760 completed!
95000/240760 completed!
100000/240760 completed!
105000/240760 completed!
110000/240760 completed!
115000/240760 completed!
120000/240760 completed!
125000/240760 completed!
130000/240760 completed!
135000/240760 completed!
140000/240760 completed!
145000/240760 completed!
150000/240760 completed!
155000/240760 completed!
160000/240760 completed!
165000/240760 completed!
170000/240760 completed!
175000/240760 completed!
180000/240760 completed!
185000/240760 completed!
190000/240760 completed!
195000/240760 completed!
200000/240760 completed!


In [32]:
train_cleaned = combined[~combined['relevance'].isnull()]
test_cleaned = combined[combined['relevance'].isnull()]

In [45]:
train_X = train_cleaned.drop(['id','product_title','product_uid','relevance','search_term'], axis=1)
train_y = train_cleaned.drop(['id','product_title','product_uid','search_term','percentQueryInName','hasMatch'], 
                            axis = 1)

In [48]:
rf = RandomForestRegressor(n_estimators = 10, min_samples_split = 1, random_state = 0)

In [50]:
rf.fit(train_X, train_y['relevance'].ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=1, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [57]:
test_X = test_cleaned.drop(['id','product_uid','product_title','relevance','search_term'], axis = 1)

In [59]:
predictions = rf.predict(test_X)

In [65]:
predictions = np.rint(predictions)

In [66]:
predictions

array([ 2.,  2.,  2., ...,  2.,  2.,  3.])

In [67]:
submission = pd.DataFrame()

In [68]:
submission['id'] = test_cleaned['id']

In [69]:
submission['relevance'] = predictions