In [377]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from difflib import SequenceMatcher, ndiff
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from nltk.stem.snowball import SnowballStemmer
from jellyfish import levenshtein_distance, damerau_levenshtein_distance, hamming_distance, jaro_similarity, jaro_winkler_similarity



import spacy
# from time import time

# Loading

In [390]:
# English Snowball stemmer; str_stemmer() reads this module-level object.
stemmer = SnowballStemmer('english')
# nlp = spacy.load("en_core_web_md")

# Paths are relative to the working directory. The train/test files are decoded
# as ISO-8859-1; the attribute and description files use pandas' default encoding.
df_train = pd.read_csv('Contents/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('Contents/test.csv', encoding="ISO-8859-1")
df_attr = pd.read_csv('Contents/attributes.csv')
df_pro_desc = pd.read_csv('Contents/product_descriptions.csv')

# Select random subset of training data

In [313]:
# Seeded generator so a fresh run always draws the same rows.
rng = np.random.RandomState(12)

# Sample row positions without replacement from however many rows were actually
# loaded, instead of assuming fixed row counts for the CSV files. The requested
# size is capped at the frame length so a smaller input does not make
# rng.choice raise.
# NOTE: this cell overwrites df_train / df_test, so it is not idempotent;
# reload the data before running it a second time.
train_indices = rng.choice(len(df_train), replace=False, size=min(12000, len(df_train)))
df_train = df_train.iloc[train_indices]

test_indices = rng.choice(len(df_test), replace=False, size=min(9000, len(df_test)))
df_test = df_test.iloc[test_indices]

# Functions

In [406]:
def str_stemmer(s):
    """Lowercase ``s``, stem each whitespace-separated token, and re-join with single spaces.

    Uses the module-level ``stemmer`` object.
    """
    stemmed_tokens = map(stemmer.stem, s.lower().split())
    return " ".join(stemmed_tokens)

def str_common_word(str1, str2):
    """Count the whitespace-separated tokens of ``str1`` that occur anywhere in ``str2``.

    Matching is by substring, not by whole word, and a token that appears
    several times in ``str1`` is counted each time.
    """
    hits = 0
    for token in str1.split():
        if str2.find(token) != -1:
            hits += 1
    return hits

def levenshtein(input_string, reference_string):
    """Return ``1 - levenshtein_distance(input_string, reference_string)``.

    The raw distance is used as-is (it is not divided by either string's
    length), so the result is not confined to the [0, 1] range.
    """
    distance = levenshtein_distance(input_string, reference_string)
    return 1 - distance

def check_for_numeral(input_string):
    """Return 1 if ``input_string`` contains at least one digit character, else 0."""
    for char in input_string:
        if char.isdigit():
            return 1
    return 0

def check_for_unit(input_string):
    """Count how many of the unit abbreviations occur in ``input_string``.

    Each abbreviation contributes at most 1. Matching is by substring, so an
    abbreviation embedded in a longer word also counts (for example 'in'
    inside 'min' or 'paint').
    """
    units = ('lb', 'in', 'cu', 'ft', 'min', 'oz')
    return sum(1 for unit in units if unit in input_string)

def average_word_length(input_string):
    """Return the mean length of the ``\\w+`` tokens in ``input_string``.

    Parameters
    ----------
    input_string : str
        Text to measure.

    Returns
    -------
    float
        Total characters across all ``\\w+`` tokens divided by the number of
        tokens, or 0.0 when the string contains no such token (empty or
        punctuation-only input), which previously raised ZeroDivisionError.
    """
    words = re.findall(r'\w+', input_string)

    # Guard the division: a query with no word characters has no average.
    if not words:
        return 0.0

    total_chars = sum(len(word) for word in words)
    return total_chars / float(len(words))

def extract_numeric_before_string(input_string, specified_string):
    """Return the first number that directly precedes ``specified_string``.

    Optional whitespace may sit between the number and ``specified_string``.
    Decimal values are supported ("2.5 in" -> 2.5, ".75 in" -> 0.75); the
    previous integer-only pattern captured just the digits after the decimal
    point.

    Parameters
    ----------
    input_string : str
        Text to search.
    specified_string : str
        Literal text (for example a unit abbreviation) that must follow the number.

    Returns
    -------
    float or str
        The number as a float, or the string 'nan' when nothing matches. The
        string sentinel is kept for backward compatibility; ``float('nan')``
        accepts it.
    """
    # Integer or decimal, with an optional leading-dot form such as ".5".
    pattern = rf"(\d+(?:\.\d+)?|\.\d+)\s*{re.escape(specified_string)}"

    match = re.search(pattern, input_string)
    if match is None:
        return 'nan'
    return float(match.group(1))

def matched_numeric(input_string, measurements):
    """Count the measurements that the number found in ``input_string`` matches within 10%.

    ``measurements`` is indexed at positions 0-3; positions 0-2 are paired
    with the unit 'in' and position 3 with 'lb'. For each position whose unit
    occurs in ``input_string``, the number returned by
    ``extract_numeric_before_string`` for that unit is compared against the
    measurement scaled by 0.9 and 1.1. Because the three 'in' positions use
    the same extracted number, a single value can score up to three times.

    Entries equal to the string 'nan' are skipped, as are entries for which
    float conversion raises ValueError.
    """
    units = ('in', 'in', 'in', 'lb')
    score = 0
    for position in range(len(units)):
        unit = units[position]
        reference = measurements[position]
        if reference == 'nan' or unit not in input_string:
            continue
        try:
            value = float(extract_numeric_before_string(input_string, unit))
            expected = float(reference)
        except ValueError:
            # Non-numeric text on either side: nothing to compare.
            continue
        # NaN on either side makes this comparison False, so it never scores.
        if expected * 0.9 <= value <= expected * 1.1:
            score += 1
    return score

# Preprocessing

In [392]:
def _attribute_column(attributes, attribute_name, column_name):
    """Return the rows of ``attributes`` whose ``name`` equals ``attribute_name``.

    The ``name`` column is dropped, ``value`` is renamed to ``column_name``,
    and only the first row per ``product_uid`` is kept so that a left merge
    on ``product_uid`` can never multiply rows of the left-hand frame.
    """
    selected = attributes.loc[attributes['name'] == attribute_name]
    return (
        selected
        .drop(['name'], axis=1)
        .rename(columns={'value': column_name})
        .drop_duplicates(subset='product_uid')
    )


num_train = df_train.shape[0]

df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')

# attributes
color_attribute = _attribute_column(df_attr, 'Color Family', 'color_family')
brand_name = _attribute_column(df_attr, 'MFG Brand Name', 'brand_name')

width = _attribute_column(df_attr, "Product Width (in.)", 'width')
height = _attribute_column(df_attr, "Product Height (in.)", 'height')
depth = _attribute_column(df_attr, "Product Depth (in.)", 'depth')
weight = _attribute_column(df_attr, "Product Weight (lb.)", 'weight')

for attribute_frame in (color_attribute, brand_name, width, height, depth, weight):
    df_all = pd.merge(df_all, attribute_frame, how="left", on="product_uid")

# The train rows are later recovered positionally with iloc[:num_train], which
# is only valid if none of the merges above added rows.
assert len(df_all) == num_train + len(df_test), "a merge duplicated rows; positional train/test split would be wrong"

# stemming
# Missing attributes are NaN after the left merges; str() turns them into the
# literal text 'nan' before stemming.
df_all['color_family'] = df_all['color_family'].map(lambda x: str_stemmer(str(x)))
df_all['brand_name'] = df_all['brand_name'].map(lambda x: str_stemmer(str(x)))

df_all['search_term'] = df_all['search_term'].map(str_stemmer)
df_all['product_title'] = df_all['product_title'].map(str_stemmer)
df_all['product_description'] = df_all['product_description'].map(str_stemmer)

# quantitative data
df_all['len_of_query'] = df_all['search_term'].map(lambda x: len(x.split())).astype(np.int64)
# df_all['word_length'] = df_all.apply(lambda x: average_word_length(x['search_term']), axis=1)

# product info
df_all['product_info'] = df_all['search_term'] + "\t" + df_all['product_title'] + "\t" + df_all['product_description']
df_all['measurements'] = df_all.apply(lambda row: [row['width'], row['height'], row['depth'], row['weight']], axis=1)

# similarity data
df_all['title_similarity'] = df_all.apply(lambda x: str_common_word(x['search_term'], x['product_title']), axis=1)
df_all['description_similarity'] = df_all.apply(lambda x: str_common_word(x['search_term'], x['product_description']), axis=1)
df_all['color_similarity'] = df_all.apply(lambda x: levenshtein(x['search_term'], x['color_family']), axis=1)
# Despite the column name, this compares the search term with the brand name.
df_all['brand_in_title'] = df_all.apply(lambda x: str_common_word(x['search_term'], x['brand_name']), axis=1)

# numerals + units
df_all['numeral_in_search'] = df_all.apply(lambda x: check_for_numeral(x['search_term']), axis=1)
df_all['unit_in_search'] = df_all.apply(lambda x: check_for_unit(x['search_term']), axis=1)

In [407]:
# df_all['matched_measurement'] = df_all.apply(lambda x: matched_numeric(x['search_term'], x['measurements']), axis=1)

In [0]:
# Keep a handle on the full frame (text and attribute columns included).
# This binds a second name to the same object; it does not copy the data.
df_save = df_all

In [408]:
# Text, raw attribute and helper columns that are not fed to the model.
columns_to_discard = [
    'search_term', 'product_title', 'product_description', 'product_info',
    'color_family', 'brand_name',
    'height', 'width', 'depth', 'weight',
    'measurements',
]
df_all = df_all.drop(columns=columns_to_discard)

# Training rows were concatenated first, so they occupy the first num_train positions.
df_train = df_all.iloc[:num_train]

# Training split

In [414]:
# df_train = pd.read_csv('df_all_stemmed.csv')

# Hold out 20% of the labelled rows for evaluation; 'id' and the target are
# removed from the feature matrix before splitting.
feature_frame = df_train.drop(['id', 'relevance'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(feature_frame, df_train['relevance'], test_size=0.20, random_state=42)

# Row identifiers of the held-out split. The split keeps the original index,
# so the real 'id' values are looked up through it (previously this variable
# was filled with product_uid, which does not identify a row).
id_test = df_train.loc[X_test.index, 'id']

# product_uid is an identifier, not a feature.
X_train = X_train.drop(['product_uid'], axis=1)
X_test = X_test.drop(['product_uid'], axis=1)

# id_test = X_test[:, 0]
# X_train = X_train[:,1:]
# X_test = X_test[:,1:]

#X_train = np.where(np.isnan(X_train), 0, X_train)
#X_test = np.where(np.isnan(X_test), 0, X_test)

# Base Model

In [350]:
# rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
# clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
#
# pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('week7_1.csv',index=False)

# Chosen Model (Decision Tree)

In [415]:
# Define the model
model = DecisionTreeRegressor(random_state=0)

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 3, 5, 10],
    'min_samples_leaf': [1, 2, 4, 5]
}
# Set up the grid search
grid_cv = GridSearchCV(model, hyperparameter_grid, cv=5, scoring='neg_root_mean_squared_error')

# Fit it to the data and find the best hyperparameters
grid_fit = grid_cv.fit(X_train, y_train)

y_pred = grid_fit.predict(X_test)

coefs = grid_fit.best_estimator_.feature_importances_
names = grid_fit.best_estimator_.feature_names_in_

In [416]:
# One-row table of feature importances, columns ordered from most to least important.
importance_by_feature = pd.DataFrame([coefs], columns=names)
ranked_columns = importance_by_feature.iloc[0].sort_values(ascending=False).index
feature_weight = importance_by_feature[ranked_columns]
display(feature_weight)

Unnamed: 0,title_similarity,len_of_query,color_similarity,description_similarity,brand_in_title,matched_measurement
0,0.527103,0.276926,0.099491,0.078097,0.014961,0.003423


# Evaluation

In [417]:
def root_mean_squared_error(y_test, y_pred):
    """Return the square root of ``mean_squared_error(y_test, y_pred)``."""
    return np.sqrt(mean_squared_error(y_test, y_pred))

# RMSE of the grid-searched tree's predictions against the held-out labels.
rmse = root_mean_squared_error(y_test, y_pred)

print(rmse)

0.4810364294165028


# Scores:

Decision Tree
- subset, with stem, color, Jaro sim: 0.5408981607863628
- subset, with stem, color, Jaro-Winkler sim: 0.5411629191097017
- subset, with stem, color, Damerau-Levenshtein sim: 0.5379426945018496
- subset, with stem, color, hamming dist: 0.5380565876357295
- subset, with stem, color, levenshtein dist: 0.5372710196551093
- subset, with stem, color, brand, levenshtein dist (only for color): 0.5297623475074125
- subset, with stem, color, brand, levenshtein dist (color and brand): 0.5227914391340877
- subset, with stem, color, brand, levenshtein dist (all features): 0.5320546275888145
- subset, with stem, color, brand, levenshtein dist (except title): 0.5126117989966398
- subset, with stem, color, brand, check_for_numeral levenshtein dist (except title): 0.5088087146629251
- subset, with stem, brand, check_for_numeral levenshtein dist (except title): 0.5106889448309334
- subset, with stem, color, brand, check_for_numeral, no levenshtein: 0.5016388501818894
- subset, with stem, color, brand, check_for_numeral levenshtein dist (except title), no NaN filtering: 0.5030290160948324



- full set, with stem, color, levenshtein dist: 0.5272314796949449
- full set, with stem, color, levenshtein dist (only for color): 0.5240342623526805
- full set, with stem, color, brand, check_for_numeral levenshtein dist (except title): 0.5101046665678989
- full set, with stem, color, brand, check_for_numeral levenshtein dist (except title): 0.49271495137454396
- full set, with stem, color, brand, check_for_numeral levenshtein dist (except title + description), no NaN filtering: 0.4817504691987292
- full set, with stem, brand, check_for_numeral levenshtein dist (except title + description), no NaN filtering: 0.4828603848093095
- fully stemmed set without additions: 0.4860708012587651
- full set, with stem, color, brand, check_for_numeral, check for unit, levenshtein dist (except title + description), no NaN filtering: 0.4810337659750626
- full set, with stem, color, brand, check_for_numeral, check for unit, average_word, levenshtein dist (except title + description), no NaN filtering: 0.4812782783434426
- full set, with stem, color, brand, check_for_numeral, check for unit, average_word, levenshtein dist (except title + description, brand), no NaN filtering: 0.48017502966180736


- original model with an 80/20 split: 0.4848881549747362


