In [1]:
import pandas as pd
import numpy as np
from os.path import join
import Levenshtein as lev
import math
from nltk.corpus import stopwords
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import gensim
from gensim.models.doc2vec import Doc2Vec
from gensim.models.keyedvectors import KeyedVectors
import gensim.downloader as api
import os

In [2]:
ltable = pd.read_csv(join('data', "ltable.csv"))
rtable = pd.read_csv(join('data', "rtable.csv"))
train = pd.read_csv(join('data', "train.csv"))

In [3]:
ltable['brand'] = ltable['brand'].astype(str)
rtable['brand'] = rtable['brand'].astype(str)

# get all brands
brands_l = set(ltable["brand"].values)
brands_r = set(rtable["brand"].values)
brands = brands_l.union(brands_r)

In [4]:
brand2ids_l = {b.lower(): [[],[]] for b in brands}
brand2ids_r = {b.lower(): [[],[]] for b in brands}

# group the records by brand, recording their id and model numbers
for i, x in ltable.iterrows():
    brand2ids_l[x["brand"].lower()][0].append(x["id"])
    brand2ids_l[x["brand"].lower()][1].append(x["modelno"])
for i, x in rtable.iterrows():
    brand2ids_r[x["brand"].lower()][0].append(x["id"])
    brand2ids_r[x["brand"].lower()][1].append(x["modelno"])
            
candset = []
for brd in brands:
    l_ids = brand2ids_l[brd][0]
    l_modelnos = brand2ids_l[brd][1]
    
    if brd.lower() != "nan":
        # if the record has a brand, it can match with all matching brands
        r_ids = brand2ids_r[brd][0]
        r_modelnos = brand2ids_r[brd][1]
        for i in range(len(l_ids)):
            for j in range(len(r_ids)):
                if ((type(l_modelnos[i]) == float) and (math.isnan(l_modelnos[i]))) or ((type(r_modelnos[j]) == float) and (math.isnan(r_modelnos[j]))):
                    # it is a candidate pair if either model number is not input
                    candset.append([l_ids[i], r_ids[j]])
                elif lev.distance(l_modelnos[i],r_modelnos[j]) < 5:
                    # it is a candidate pair if the model numbers are similar
                    candset.append([l_ids[i], r_ids[j]])
                    
        # this record may also be a match with all records that have no brand
        nan_r_ids = brand2ids_r["nan"][0]
        nan_r_modelnos = brand2ids_r["nan"][1]
        for i in range(len(l_ids)):
            for j in range(len(nan_r_ids)):
                if ((type(l_modelnos[i]) == float) and (math.isnan(l_modelnos[i]))) or ((type(nan_r_modelnos[j]) == float) and (math.isnan(nan_r_modelnos[j]))):
                    # it is a candidate pair if either model number is not input
                    candset.append([l_ids[i], nan_r_ids[j]])
                elif lev.distance(l_modelnos[i],nan_r_modelnos[j]) < 5:
                    # it is a candidate pair if the model numbers are similar
                    candset.append([l_ids[i], nan_r_ids[j]])
                
    else:
        # if the record does not have a brand, it might be a match with any other record
        for sub_brd in brands:
            r_ids = brand2ids_r[sub_brd][0]
            r_modelnos = brand2ids_r[sub_brd][1]
            for i in range(len(l_ids)):
                for j in range(len(r_ids)):
                    if ((type(l_modelnos[i]) == float) and (math.isnan(l_modelnos[i]))) or ((type(r_modelnos[j]) == float) and (math.isnan(r_modelnos[j]))):
                        # it is a candidate pair if either model number is not input
                        candset.append([l_ids[i], r_ids[j]])
                    elif lev.distance(l_modelnos[i],r_modelnos[j]) < 5:
                        # it is a candidate pair if the model numbers are similar
                        candset.append([l_ids[i], r_ids[j]])
        
len(candset)

1696202

In [5]:
def pairs2LR(ltable, rtable, candset):
    ltable.index = ltable.id
    rtable.index = rtable.id
    pairs = np.array(candset)
    tpls_l = ltable.loc[pairs[:, 0], :]
    tpls_r = rtable.loc[pairs[:, 1], :]
    tpls_l.columns = [col + "_l" for col in tpls_l.columns]
    tpls_r.columns = [col + "_r" for col in tpls_r.columns]
    tpls_l.reset_index(inplace=True, drop=True)
    tpls_r.reset_index(inplace=True, drop=True)
    LR = pd.concat([tpls_l, tpls_r], axis=1)
    return LR

In [6]:
candset_df = pairs2LR(ltable, rtable, candset)
candset_df

Unnamed: 0,id_l,title_l,category_l,brand_l,modelno_l,price_l,id_r,title_r,category_r,brand_r,modelno_r,price_r
0,120,jvc kd-r320 cd receiver,car stereos,jvc,kdr320,70.00,131,jvc kd-r720 cd receiver with hd radio and dual...,satellite radio tuners adapters,jvc,kd-r720,106.86
1,120,jvc kd-r320 cd receiver,car stereos,jvc,kdr320,70.00,922,jvc kdr820 kd-r820bt kd-r820bt usb mp3 cd rece...,,jvc,kd-r820bt,
2,120,jvc kd-r320 cd receiver,car stereos,jvc,kdr320,70.00,923,jvc kd-r210 4 x 50 watts cd receiver,cd players,jvc,kdr210,89.99
3,120,jvc kd-r320 cd receiver,car stereos,jvc,kdr320,70.00,933,jvc kd-r310 in-dash cd receiver w front aux in...,cd-mp3 players,jvc,kdr310,
4,120,jvc kd-r320 cd receiver,car stereos,jvc,kdr320,70.00,1879,jvc everio gz-mg20 20 gb hard disk drive camco...,camcorders,jvc,gzmg20,
...,...,...,...,...,...,...,...,...,...,...,...,...
1696197,2379,axis gk-013 107-key ps 2 keyboard black,computers,axis,gk-013,9.72,5287,axis gm-344 optical web mouse with usb connector,computers accessories,axis,,13.96
1696198,2379,axis gk-013 107-key ps 2 keyboard black,computers,axis,gk-013,9.72,17467,new-axis gk-310 multimedia keyboard with usb c...,connectors adapters,axis,,18.00
1696199,2379,axis gk-013 107-key ps 2 keyboard black,computers,axis,gk-013,9.72,20613,axis 58016 thumb pro keypad,printer accessories,axis,,23.70
1696200,2379,axis gk-013 107-key ps 2 keyboard black,computers,axis,gk-013,9.72,21248,new-axis gm-344 optical web mouse with usb con...,mice,axis,,21.76


In [7]:
left_titles = np.unique(np.array(candset_df["title_l"]).reshape(len(candset_df["title_l"]),1))
right_titles = np.unique(np.array(candset_df["title_r"]).reshape(len(candset_df["title_r"]),1))
all_titles = np.concatenate([left_titles,right_titles],axis=0)
print(all_titles.shape)

(24155,)


In [8]:
remove_words = stopwords.words('english')
remove_punctuation = "&.,-/"

def read_corpus(documents, tokens_only=False):
    for i, line in enumerate(documents):
        tokens = [word for word in line.lower().split() if not word in remove_words]
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

title_corpus = list(read_corpus(all_titles))

In [9]:
d2v = Doc2Vec(vector_size=50, min_count=2, epochs=40)
d2v.build_vocab(title_corpus)
d2v.train(title_corpus, total_examples=d2v.corpus_count, epochs=d2v.epochs)

In [10]:
w2v = api.load('word2vec-google-news-300')

In [11]:
def oh_cos_similarity(row, attr):
    if (type(row[attr + "_l"]) == float) and math.isnan(row[attr + "_l"]):
        return 0.5
    if (type(row[attr + "_r"]) == float) and math.isnan(row[attr + "_r"]):
        return 0.5
    left_set = set(word for word in row[attr + "_l"].lower().split() if not word in remove_words)
    right_set = set(word for word in row[attr + "_r"].lower().split() if not word in remove_words)
    
    if min(len(left_set),len(right_set)) == 0:
        return 0.5
    
    left_phrase = []
    right_phrase = []
    vocabulary = left_set.union(right_set)
    for term in vocabulary:
        if term in left_set:
            left_phrase.append(1)
        else:
            left_phrase.append(0)
        if term in right_set:
            right_phrase.append(1)
        else:
            right_phrase.append(0)
    left_phrase = np.array(left_phrase)
    right_phrase = np.array(right_phrase)
    return (np.dot(left_phrase,right_phrase)/(np.linalg.norm(left_phrase)*np.linalg.norm(right_phrase)))

def cos_similarity(row, attr):
    if attr == "title":
        left_sentence = [word for word in row[attr + "_l"].lower().split() if not word in remove_words]
        right_sentence = [word for word in row[attr + "_r"].lower().split() if not word in remove_words]

        left_phrase = d2v.infer_vector(left_sentence)
        right_phrase = d2v.infer_vector(right_sentence)

        return (np.dot(left_phrase,right_phrase)/(np.linalg.norm(left_phrase)*np.linalg.norm(right_phrase)))
    
    else:
        if (type(row[attr + "_l"]) == float) and math.isnan(row[attr + "_l"]):
            return 0.5
        if (type(row[attr + "_r"]) == float) and math.isnan(row[attr + "_r"]):
            return 0.5
        
        left_sentence = []
        right_sentence = []
        for word in row[attr + "_l"].lower().split():
            if (word not in remove_words) and (word not in remove_punctuation):
                if (word in w2v.key_to_index.keys()):
                    left_sentence.append(word)
        for word in row[attr + "_r"].lower().split():
            if (word not in remove_words) and (word not in remove_punctuation):
                if (word in w2v.key_to_index.keys()):
                    right_sentence.append(word)
        
        if min(len(left_sentence),len(right_sentence)) == 0:
            return 0.5
        
        left_phrase = np.zeros(300)
        left_phrase += w2v[left_sentence[0]]
        right_phrase = np.zeros(300)
        right_phrase += w2v[right_sentence[0]]
        
        if len(left_sentence) > 1:
            for i in range(1,len(left_sentence)):
                left_phrase += w2v[left_sentence[i]]
        if len(right_sentence) > 1:
            for i in range(1,len(right_sentence)):
                right_phrase += w2v[right_sentence[i]]
                
        return (np.dot(left_phrase,right_phrase)/(np.linalg.norm(left_phrase)*np.linalg.norm(right_phrase)))

def jaccard_similarity(row, attr):
    if (type(row[attr + "_l"]) == float) and math.isnan(row[attr + "_l"]):
        return 0.5
    if (type(row[attr + "_r"]) == float) and math.isnan(row[attr + "_r"]):
        return 0.5
    x = set(row[attr + "_l"].lower().split())
    y = set(row[attr + "_r"].lower().split())
    return len(x.intersection(y)) / max(len(x), len(y))


def levenshtein_distance(row, attr):
    if (type(row[attr + "_l"]) == float) and math.isnan(row[attr + "_l"]):
        return 0
    if (type(row[attr + "_r"]) == float) and math.isnan(row[attr + "_r"]):
        return 0
    x = row[attr + "_l"].lower()
    y = row[attr + "_r"].lower()
    return lev.distance(x, y)

def price_difference(row):
    if math.isnan(row["price_l"]) or math.isnan(row["price_r"]):
        return 0
    price_diff = abs(row["price_l"] - row["price_r"])
    return price_diff

def feature_engineering(LR):
    #LR = LR.astype(str)
    attrs = ["title","category", "brand", "modelno"]
    features = []
    for attr in attrs:
        if attr in ["title","category"]:
            cos_sim = LR.apply(cos_similarity, attr=attr, axis=1)
            features.append(cos_sim)
        if attr in ["title","modelno"]:
            oh_cos = LR.apply(oh_cos_similarity, attr=attr, axis=1)
            features.append(oh_cos)
        if attr in ["title","modelno"]:
            j_sim = LR.apply(jaccard_similarity, attr=attr, axis=1)
            features.append(j_sim)
        if attr in ["title","category","modelno"]:
            l_dist = LR.apply(levenshtein_distance, attr=attr, axis=1)
            features.append(l_dist)
    price_diff = LR.apply(price_difference, axis=1)
    features.append(price_diff)
    features = np.array(features).T
    return features

candset_features = feature_engineering(candset_df)

In [12]:
training_pairs = list(map(tuple, train[["ltable_id", "rtable_id"]].values))
training_df = pairs2LR(ltable, rtable, training_pairs)
training_features = feature_engineering(training_df)
training_label = train.label.values

In [13]:
mi = mutual_info_classif(training_features,training_label)
feature_labs = ["title doc2vec", "title oh cos", "title Jaccard", "title Lev",
                "category word2vec", "category Lev",
                "modelno oh cos", "modelno Jaccard", "modelno Lev",
                "price diff"]
for i in range(len(mi)):
    print(str(mi[i])+"\t"+feature_labs[i])

0.008757324092393937	title doc2vec
0.07747051554856488	title oh cos
0.043557772626719116	title Jaccard
0.020583285066396506	title Lev
0.025972272943867614	category word2vec
0.006598494482846773	category Lev
0.12218550662095162	modelno oh cos
0.11981673014437577	modelno Jaccard
0.038033166599952484	modelno Lev
0.015366989121341712	price diff


In [14]:
rf = RandomForestClassifier(class_weight="balanced_subsample",n_jobs=-1)
rf_gscv = GridSearchCV(rf,scoring="f1",param_grid={
    "n_estimators":[50,100,200,300,400,500], "max_depth":[5,10,15,20,25]
},cv=5,n_jobs=-1)
rf_gscv.fit(training_features, training_label)
best_clf = rf_gscv.best_estimator_
print(rf_gscv.best_score_)
print(rf_gscv.best_params_)

0.7258721740488137
{'max_depth': 10, 'n_estimators': 300}


In [15]:
kf = StratifiedKFold(n_splits=5)

f1s = []
recalls = []
precisions = []
for train_index, test_index in kf.split(training_features, training_label):
    reduced_data_train = training_features[train_index]
    reduced_labels_train = training_label[train_index]
    
    reduced_data_test = training_features[test_index]
    reduced_labels_test = training_label[test_index]
    
    clf = RandomForestClassifier(n_estimators=300,max_depth=10,class_weight="balanced_subsample")
    clf.fit(reduced_data_train, reduced_labels_train)
    
    y_pred = clf.predict(reduced_data_test)
    f1s.append(f1_score(reduced_labels_test,y_pred))
    recalls.append(recall_score(reduced_labels_test,y_pred))
    precisions.append(precision_score(reduced_labels_test,y_pred))
f1s = np.array(f1s)
recalls = np.array(recalls)
precisions = np.array(precisions)
print("F1:\t\t{}\tVar:\t{}".format(f1s.mean(),f1s.std()))
print("Recall:\t\t{}\tVar:\t{}".format(recalls.mean(),recalls.std()))
print("Precision:\t{}\tVar:\t{}".format(precisions.mean(),precisions.std()))

F1:		0.7150069282715534	Var:	0.017193085978694195
Recall:		0.6127659574468085	Var:	0.03598198835167607
Precision:	0.862558752774125	Var:	0.03318642093594102


In [16]:
cand_y_pred = best_clf.predict(candset_features)

In [18]:
matching_pairs = candset_df.loc[cand_y_pred == 1, ["id_l", "id_r"]]
matching_pairs = list(map(tuple, matching_pairs.values))

matching_pairs_in_training = training_df.loc[training_label == 1, ["id_l", "id_r"]]
matching_pairs_in_training = set(list(map(tuple, matching_pairs_in_training.values)))

pred_pairs = [pair for pair in matching_pairs if
              pair not in matching_pairs_in_training]  # remove the matching pairs already in training
pred_pairs = np.array(pred_pairs)
pred_df = pd.DataFrame(pred_pairs, columns=["ltable_id", "rtable_id"])
pred_df.to_csv("improved_output.csv", index=False)