In [44]:
import numpy as np
import pandas as pd
import matplotlib as plt
import sys
import time

from sklearn.ensemble import RandomForestRegressor
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV


In [45]:

#test files
start_time = time.time()
preprocessed_features = 'vectorised_features.csv'
y_test_file = 'y_test.csv'
y_test_file = 'solutions.csv'

In [46]:
# function definitions - stemmer, common words, time, test, 
def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
    return sum(int(str2.find(word)>=0) for word in str1.split())

def print_time():
    print("--- %s seconds ---" % (time.time() - start_time))
    

In [48]:
#Load data
df_train = pd.read_csv('train.csv', encoding="ISO-8859-1")[:500]
df_test = pd.read_csv('test.csv', encoding='ISO-8859-1')[:500]
df_pro_desc = pd.read_csv('product_descriptions.csv')
df_attr = pd.read_csv('attributes.csv')
print("reading data...")

#Data cleaning
stemmer = SnowballStemmer('english')
n_train = df_train.shape[0]
x_train, x_valid, x_test, y_train, y_valid, y_test = (0,0,0,0,0,0)

df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all = pd.merge(df_all, df_brand, how ='left', on='product_uid')
print("merging data...")

df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)
df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_description']
df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
print("Data has been pre-processed...")

df_all = df_all.drop(['search_term','product_title','product_description','product_info'],axis=1)
df_train = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:]
id_test = df_test['id']

y_train = df_train['relevance'].values
x_train = df_train.drop(['id','relevance'],axis=1).values
x_test = df_test.drop(['id','relevance'],axis=1).values
print("dropping stuff")

np.savetxt('y_train.csv', y_train, delimiter=",")
np.savetxt("x_train.csv", x_train, fmt= '%s',delimiter=",")
np.savetxt("x_test.csv", x_test, fmt='%s', delimiter=",")

reading data...
merging data...
Data has been pre-processed...
dropping stuff


In [59]:
x_train = np.genfromtxt('x_train.csv', delimiter=",")
X_test = np.genfromtxt("X_test.csv", delimiter=',')
y_train = np.genfromtxt("y_train.csv", delimiter=',')
n_train = int(0.75*x_train.shape[0])
n_valid = int(0.25*x_train.shape[0])
x_train = x_train[0:n_train, :]

X_valid = x_train[n_train+1:n_train+n_valid, :]

y_train = y_train[0:n_train]

y_valid = y_train[n_train+1:]

In [71]:
y_test = pd.read_csv('test.csv', encoding='ISO-8859-1')[:500]

In [72]:
y_test.to_csv('y_test.csv', columns=['id','product_uid','product_title'])

In [80]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

y_test = pd.read_csv('test.csv', encoding='ISO-8859-1')


In [87]:
y_test['product_title'] = y_test['product_title'].astype(object)
y_test['search_term'] = y_test['search_term'].astype(object)

In [90]:
y_test = y_test.astype(object)

In [95]:
y_test['product_title'] = y_test['product_title'].apply(word_tokenize)


In [96]:
y_test['search_term'] = y_test['search_term'].apply(word_tokenize)


In [99]:
stop = stopwords.words('english')
y_test['product_title'] = y_test['product_title'].apply(lambda x: [item for item in x if item not in stop])
y_test['search_term'] = y_test['search_term'].apply(lambda x: [item for item in x if item not in stop])


In [100]:
y_test

Unnamed: 0,id,product_uid,product_title,search_term
0,1,100001,"[Simpson, Strong-Tie, 12-Gauge, Angle]","[90, degree, bracket]"
1,4,100001,"[Simpson, Strong-Tie, 12-Gauge, Angle]","[metal, l, brackets]"
2,5,100001,"[Simpson, Strong-Tie, 12-Gauge, Angle]","[simpson, sku, able]"
3,6,100001,"[Simpson, Strong-Tie, 12-Gauge, Angle]","[simpson, strong, ties]"
4,7,100001,"[Simpson, Strong-Tie, 12-Gauge, Angle]","[simpson, strong, tie, hcc668]"
5,8,100001,"[Simpson, Strong-Tie, 12-Gauge, Angle]","[wood, connectors]"
6,10,100003,"[STERLING, Ensemble, 33-1/4, ., x, 60, ., x, 7...","[bath, shower, kit]"
7,11,100003,"[STERLING, Ensemble, 33-1/4, ., x, 60, ., x, 7...","[bath, drain, kit]"
8,12,100003,"[STERLING, Ensemble, 33-1/4, ., x, 60, ., x, 7...","[one, piece, tub, shower]"
9,13,100004,"[Grape, Solar, 265-Watt, Polycrystalline, Sola...","[solar, panel]"


In [101]:
y_test.to_csv("y_test.csv")

In [108]:
x_train = pd.read_csv('x_train.csv')
y_train = pd.read_csv('y_train.csv')
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


n_iter_search = 3

param_dist = {"max_depth": [5,10,20,50],

              "max_features": [2,3,4],

              "min_samples_split": [5,10,15],

              "min_samples_leaf": [5,10,15],

              "bootstrap": [True, False]}
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,

                n_iter=n_iter_search)

random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)


NameError: name 'X_train' is not defined