In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import nltk
from nltk.corpus import words
from os.path import join
import string
import re
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
# Fetch the NLTK resources this notebook relies on. 'stopwords' is read via
# nltk.corpus.stopwords in the preprocessing step; 'punkt' and 'wordnet' are
# presumably the data behind nltk.tokenize.word_tokenize and
# nltk.WordNetLemmatizer, which preprocessing also calls.
# NOTE(review): 'words' is downloaded (and imported above) but no visible code
# uses it - confirm whether it is still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Load the pretrained 'word2vec-google-news-300' embeddings through gensim's
# downloader API. `model` is the module-level object that WordMovers_distance
# calls `wmdistance` on.
# NOTE(review): presumably a large download on first use - confirm before
# running on a fresh machine.
import gensim.downloader as api
model = api.load('word2vec-google-news-300')



In [None]:

# 1. read data

# The CSVs live at absolute paths. The previous os.path.join('data', '/content/...')
# call was a no-op wrapper: join() discards every component that precedes an
# absolute one, so 'data' never became part of the path.
dfl = pd.read_csv("/content/ltable.csv")
dfr = pd.read_csv("/content/rtable.csv")
train = pd.read_csv("/content/train.csv")

# data preprocessing

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

# Precomputed helpers so the per-row work stays cheap: a translation table that
# deletes punctuation characters and a set for O(1) stop-word lookups.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_STOPWORD_SET = set(stopwords)
_CONTAINS_DIGIT = re.compile(r'\d+')


def _normalize_text(text):
    """Return a cleaned, space-joined version of ``text``.

    Steps, in order: delete punctuation characters, lowercase, tokenize with
    NLTK, drop English stop words, lemmatize each remaining token with WordNet,
    and finally drop any token that contains a digit.

    Parameters
    ----------
    text : str
        Raw title or category string.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    stripped = text.translate(_PUNCTUATION_TABLE).lower()
    tokens = nltk.tokenize.word_tokenize(stripped)
    lemmas = [wn.lemmatize(tok) for tok in tokens if tok not in _STOPWORD_SET]
    return " ".join(tok for tok in lemmas if not _CONTAINS_DIGIT.search(tok))


# Both tables get exactly the same treatment, so apply it in one loop instead
# of maintaining two copies of the cleaning code.
for table in (dfr, dfl):
    # Missing category -> most frequent category; missing price -> mean price.
    most_common_category = table["category"].value_counts().index[0]
    table["category"] = table["category"].fillna(most_common_category)
    table["price"] = table["price"].fillna(table["price"].mean())

    table["title"] = table["title"].map(_normalize_text)
    table["category"] = table["category"].map(_normalize_text)



# 2. blocking
def pairs2LR(ltable, rtable, candset):
    """Materialise candidate id pairs as one wide left/right DataFrame.

    Parameters
    ----------
    ltable, rtable : pandas.DataFrame
        Tables that each contain an ``id`` column. They are NOT modified
        (the previous implementation overwrote their index in place).
    candset : sequence of (left_id, right_id)
        Pairs of ids to look up; may be empty.

    Returns
    -------
    pandas.DataFrame
        One row per pair, in ``candset`` order. Columns are every ``ltable``
        column suffixed with ``_l`` followed by every ``rtable`` column
        suffixed with ``_r``; the index is a fresh 0..n-1 range.

    Raises
    ------
    KeyError
        If an id in ``candset`` is not present in the corresponding table.
    """
    pairs = np.asarray(candset)
    if pairs.size == 0:
        # np.asarray([]) is 1-D; give it the (0, 2) shape the slicing expects.
        pairs = pairs.reshape(0, 2)

    def _lookup(table, ids, suffix):
        # set_index returns a new frame, so the caller's table keeps its index;
        # drop=False keeps ``id`` as a regular column in the output.
        picked = table.set_index("id", drop=False).loc[ids]
        return picked.add_suffix(suffix).reset_index(drop=True)

    left = _lookup(ltable, pairs[:, 0], "_l")
    right = _lookup(rtable, pairs[:, 1], "_r")
    return pd.concat([left, right], axis=1)


def block_by_brand(ltable, rtable):
    """Return candidate id pairs whose brands match case-insensitively.

    Brands are compared as lower-cased strings (non-string values are passed
    through ``str()`` first, so a missing brand becomes the literal text of its
    string form and only matches other rows with that same text).

    The previous implementation keyed its lookup tables by the lower-cased
    brand but then indexed them with the raw brand, which raised ``KeyError``
    for any brand containing an uppercase letter. It also overwrote the
    callers' ``brand`` column; this version leaves both tables untouched.

    Parameters
    ----------
    ltable, rtable : pandas.DataFrame
        Tables with ``id`` and ``brand`` columns.

    Returns
    -------
    list of [left_id, right_id]
        Every left/right combination that shares a brand.
    """
    def _ids_by_brand(table):
        # Map lower-cased brand -> list of ids having that brand.
        groups = {}
        for brand, row_id in zip(table["brand"].tolist(), table["id"].tolist()):
            groups.setdefault(str(brand).lower(), []).append(row_id)
        return groups

    left_groups = _ids_by_brand(ltable)
    right_groups = _ids_by_brand(rtable)

    # put id pairs that share the same brand in candidate set
    candset = []
    for brand, l_ids in left_groups.items():
        r_ids = right_groups.get(brand, [])
        for l_id in l_ids:
            for r_id in r_ids:
                candset.append([l_id, r_id])
    return candset

# Blocking: only left/right records that share a brand are kept as candidates,
# which shrinks the full cross product before any features are computed.
candset = block_by_brand(dfl, dfr)
n_all_pairs = len(dfl.index) * len(dfr.index)
print("number of pairs originally", n_all_pairs)
print("number of pairs after blocking",len(candset))
# Expand the surviving id pairs into one wide row per pair.
candset_df = pairs2LR(dfl, dfr, candset)



# 3. Feature engineering

def jaccard_similarity(row, attr):
    """Token-overlap similarity between the left and right value of ``attr``.

    Both values are lower-cased and split on whitespace into token sets. The
    score is ``|intersection| / max(|left|, |right|)``. Note this divides by
    the larger set size rather than the union, so it is not the textbook
    Jaccard index; the name is kept for compatibility.

    Parameters
    ----------
    row : mapping
        Must provide string values under ``attr + "_l"`` and ``attr + "_r"``.
    attr : str
        Attribute name without the ``_l`` / ``_r`` suffix.

    Returns
    -------
    float
        A value in [0, 1]. Returns 0.0 when both sides have no tokens
        (previously this raised ``ZeroDivisionError``).
    """
    left_tokens = set(row[attr + "_l"].lower().split())
    right_tokens = set(row[attr + "_r"].lower().split())
    larger = max(len(left_tokens), len(right_tokens))
    if larger == 0:
        # Two empty values carry no evidence of a match.
        return 0.0
    return len(left_tokens & right_tokens) / larger


def WordMovers_distance(row, attr):
    """Word Mover's Distance between the left and right value of ``attr``.

    Both values are lower-cased and split on whitespace, and the resulting
    token lists are passed to the module-level word-vector ``model``.

    The previous implementation passed the raw strings. ``wmdistance`` expects
    each document as a list of tokens, so a plain string was iterated
    character by character and the distance was computed over single letters
    instead of words.

    Parameters
    ----------
    row : mapping
        Must provide string values under ``attr + "_l"`` and ``attr + "_r"``.
    attr : str
        Attribute name without the ``_l`` / ``_r`` suffix.

    Returns
    -------
    float
        Whatever ``model.wmdistance`` returns. Downstream code in this
        notebook replaces ``inf`` results with -1, so an infinite distance is
        an expected outcome (presumably when a side has no in-vocabulary
        tokens - confirm against the gensim documentation).
    """
    left_tokens = row[attr + "_l"].lower().split()
    right_tokens = row[attr + "_r"].lower().split()
    return model.wmdistance(left_tokens, right_tokens)

def feature_engineering(LR):
    """Build the pairwise feature matrix for a left/right pair table.

    Every column of ``LR`` is first cast to ``str``. Then, for each of the
    attributes title, category, brand, modelno and price, two row-wise scores
    are computed in this order: ``jaccard_similarity`` followed by
    ``WordMovers_distance``.

    Parameters
    ----------
    LR : pandas.DataFrame
        Pair table holding ``<attr>_l`` and ``<attr>_r`` columns.

    Returns
    -------
    numpy.ndarray
        Array with one row per pair and one column per (attribute, score)
        combination, i.e. 10 columns.
    """
    as_text = LR.astype(str)
    scorers = (jaccard_similarity, WordMovers_distance)
    columns = []
    for attr in ("title", "category", "brand", "modelno", "price"):
        for scorer in scorers:
            columns.append(as_text.apply(scorer, attr=attr, axis=1))
    # Stacked as (n_features, n_pairs); transpose to (n_pairs, n_features).
    return np.array(columns).T
# Features for every blocked candidate pair.
candset_features = feature_engineering(candset_df)

# The labelled pairs go through the same lookup and feature pipeline so the
# classifier is trained on the representation it will later predict on.
training_pairs = [tuple(pair) for pair in train[["ltable_id", "rtable_id"]].values]
training_df = pairs2LR(dfl, dfr, training_pairs)
training_features = feature_engineering(training_df)
training_label = train["label"].values

# 4. Model training and prediction
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
rf = RandomForestClassifier(class_weight="balanced", random_state=0)

# The distance features can come back as +inf, which the classifier cannot
# consume; map those to the sentinel -1. A boolean mask does this in one
# vectorised step instead of a Python loop over every cell.
training_features[training_features == np.inf] = -1

# Hold out 20% of the labelled pairs to estimate quality.
X_train, X_test, y_train, y_test = train_test_split(training_features, training_label, test_size=0.2, random_state=0)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

# metrics
# All three scores are reported for the positive ("match") class. Previously
# F1 used the binary default while precision/recall were macro-averaged over
# both classes, so the printed numbers were not comparable with each other.
print(f"f1 score : {f1_score(y_test, y_pred, zero_division=1)}")
print(f"recall score : {recall_score(y_test, y_pred)}")
print(f"precision score : {precision_score(y_test, y_pred)}")

# 5. output
candset_features[candset_features == np.inf] = -1

# Refit on every labelled pair before scoring the candidate set.
rf.fit(training_features,training_label)
y_pred = rf.predict(candset_features)
matching_pairs = candset_df.loc[y_pred == 1, ["id_l", "id_r"]]
matching_pairs = list(map(tuple, matching_pairs.values))

matching_pairs_in_training = training_df.loc[training_label == 1, ["id_l", "id_r"]]
matching_pairs_in_training = set(map(tuple, matching_pairs_in_training.values))

# remove the matching pairs already in training
pred_pairs = [pair for pair in matching_pairs if pair not in matching_pairs_in_training]
# Build the frame straight from the list of tuples: an empty list still yields
# a valid two-column frame, whereas np.array([]) is 1-D and made the
# DataFrame constructor fail when no new pairs were predicted.
pred_df = pd.DataFrame(pred_pairs, columns=["ltable_id", "rtable_id"])
pred_df.to_csv("output.csv", index=False)

f1 score : 0.7210884353741497
recall score : 0.7950085718866784
precision score : 0.9377882714693608
