In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from difflib import SequenceMatcher, ndiff
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
import spacy

# Load the stemmer, the spaCy model, and the data

In [2]:
# Text-processing resources used by the feature-engineering cells:
# a spaCy pipeline and an English Snowball stemmer.
nlp = spacy.load("en_core_web_md")
stemmer = SnowballStemmer('english')

# Input tables. The two auxiliary tables are read with pandas' default
# encoding; train and test are read as ISO-8859-1.
df_pro_desc = pd.read_csv('Contents/product_descriptions.csv')
df_attr = pd.read_csv('Contents/attributes.csv')
df_test = pd.read_csv('Contents/test.csv', encoding="ISO-8859-1")
df_train = pd.read_csv('Contents/train.csv', encoding="ISO-8859-1")

# Select random subset of training data

In [4]:
# Seeded generator so the same rows are selected on every fresh run.
rng = np.random.RandomState(12)

# Row positions are drawn from the actual frame length instead of a
# hard-coded row count: with a data file of a different size, a fixed count
# would either index past the end in .iloc or never sample the trailing rows.
# The sample size is clamped so a frame smaller than the requested sample
# does not make rng.choice raise.
# NOTE: this cell overwrites df_train/df_test, so re-running it without
# reloading the data samples from the already reduced frames.
train_indices = rng.choice(len(df_train), replace=False, size=min(12000, len(df_train)))
df_train = df_train.iloc[train_indices]

test_indices = rng.choice(len(df_test), replace=False, size=min(9000, len(df_test)))
df_test = df_test.iloc[test_indices]

# Functions

In [5]:
def str_stemmer(s):
    """Return ``s`` lower-cased with every whitespace-separated token stemmed.

    Tokens are stemmed with the module-level ``stemmer`` and re-joined with
    single spaces, so runs of whitespace in the input collapse.
    """
    stems = []
    for token in s.lower().split():
        stems.append(stemmer.stem(token))
    return " ".join(stems)

def str_common_word(str1, str2):
    """Count the words of ``str1`` that occur somewhere inside ``str2``.

    ``str1`` is split on whitespace; each resulting word (repeats included)
    counts once if it is found as a substring of ``str2``.
    """
    hits = 0
    for token in str1.split():
        # Substring search, not whole-word matching.
        if str2.find(token) != -1:
            hits += 1
    return hits

def compute_similarity(input_string, reference_string):
    """Score how much of ``input_string`` survives a diff against ``reference_string``.

    The two strings are compared element by element with ``difflib.ndiff``
    (a string is iterated character by character). Every diff entry that
    starts with "-" marks an element of ``input_string`` that is absent from
    the alignment with ``reference_string``.

    Parameters
    ----------
    input_string : str
        The string being scored.
    reference_string : str
        The string it is compared against. The literal text ``'nan'`` is
        treated as a missing reference.

    Returns
    -------
    float
        ``1 - removed / len(input_string)``; ``np.nan`` when the reference
        is ``'nan'``; ``0.0`` when ``input_string`` is empty.
    """
    if reference_string == 'nan':
        return np.nan
    if not input_string:
        # Nothing to match; returning here also avoids dividing by zero.
        return 0.0
    removed = sum(
        1 for line in ndiff(input_string, reference_string) if line.startswith("-")
    )
    return 1 - (removed / len(input_string))

# Preprocessing

In [7]:
# Number of training rows, so they can be sliced back out after train and
# test have been processed together.
num_train = df_train.shape[0]

df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')

# 'Color Family' attribute per product. Duplicate product rows are removed so
# the left merge cannot add rows to df_all, which would invalidate the
# positional iloc[:num_train] slice at the end of this cell.
color_attribute = (
    df_attr.loc[df_attr['name'] == 'Color Family']
    .drop(['name'], axis=1)
    .rename(columns={'value': 'color_family'})
    .drop_duplicates(subset='product_uid')
)
df_all = pd.merge(df_all, color_attribute, how="left", on="product_uid")

# Stem every text column. color_family goes through str() first because
# products without a color are NaN after the left merge.
# NOTE(review): those rows become the literal text 'nan', so their
# color_similarity below is the similarity to that token rather than a
# missing value -- confirm this is intended.
df_all['color_family'] = df_all['color_family'].map(lambda x: str_stemmer(str(x)))
df_all['search_term'] = df_all['search_term'].map(str_stemmer)
df_all['product_title'] = df_all['product_title'].map(str_stemmer)
df_all['product_description'] = df_all['product_description'].map(str_stemmer)

# Numeric features derived from the stemmed text.
df_all['len_of_query'] = df_all['search_term'].map(lambda x: len(x.split())).astype(np.int64)
df_all['title_similarity'] = df_all.apply(
    lambda row: compute_similarity(row['search_term'], row['product_title']), axis=1)
df_all['description_similarity'] = df_all.apply(
    lambda row: compute_similarity(row['search_term'], row['product_description']), axis=1)

# Parse queries and colors with spaCy's batch API, then compare them row by row.
search_term_nlp = list(nlp.pipe(df_all['search_term']))
colors_nlp = list(nlp.pipe(df_all['color_family']))
df_all['color_similarity'] = [
    query_doc.similarity(color_doc)
    for query_doc, color_doc in zip(search_term_nlp, colors_nlp)
]

# Remove the text columns so only numeric columns remain for modelling.
# 'product_info' is never created in this notebook, so it is not listed
# (dropping a missing label raises KeyError); color_family is listed because
# it holds strings.
df_all = df_all.drop(
    ['search_term', 'product_title', 'product_description', 'color_family'], axis=1)

df_train = df_all.iloc[:num_train]

KeyboardInterrupt: 

# Train/validation split, model training, and prediction

In [None]:
# Hold out 20% of the labelled rows. The row ids travel through the split as
# their own array: slicing them out of the feature matrix by position is not
# possible once 'id' has been dropped (column 0 would then be product_uid,
# and the CSV below would carry product ids in its "id" column).
# product_uid is an identifier and is left out of the features, so the matrix
# handed to the model is the same as X[:, 1:] of the id-less frame.
features = df_train.drop(['id', 'product_uid', 'relevance'], axis=1)
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(
    features.values,
    df_train['relevance'].values,
    df_train['id'].values,
    test_size=0.20,
    random_state=42,
)

# Bagging ensemble of small random forests; each bag is fit on 10% of the
# training rows (max_samples=0.1).
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Persist the held-out predictions keyed by row id.
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('week7_1.csv',index=False)

# Evaluation

In [None]:
def root_mean_squared_error(y_test, y_pred):
    """Return the square root of ``mean_squared_error(y_test, y_pred)``."""
    return np.sqrt(mean_squared_error(y_test, y_pred))


# RMSE of the predictions on the held-out split.
rmse = root_mean_squared_error(y_test, y_pred)
print(rmse)