In [2]:
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F
from sentence_transformers import util, losses
import numpy as np
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Inputs for the fine-tuning data cell. Despite the `_dir` suffix, both values
# are file paths: the gzipped JSON-lines file with the "large" training pairs,
# and the CSV whose `pair_id` column lists the pairs excluded from training.
fine_tuning_file_dir = './data/computers_train/computers_train_large.json.gz'
ft_valid_file_dir = './data/computers_valid/computers_valid_large.csv'
# NOTE(review): n_epochs is not read by any cell visible here — presumably the
# epoch count for a fine-tuning step defined elsewhere; confirm or remove.
n_epochs = 1

In [11]:
# Fine-tuning data: load the large pair file, strip the id / cluster /
# identifier bookkeeping columns, then keep only the pairs whose pair_id is
# NOT listed in the validation CSV.
large_df = pd.read_json(
    fine_tuning_file_dir,
    compression='gzip',
    lines=True,
).drop(
    columns=[
        'id_left', 'id_right',
        'cluster_id_left', 'cluster_id_right',
        'identifiers_left', 'identifiers_right',
    ]
)
valid_ids = pd.read_csv(ft_valid_file_dir)
ft_train_df = large_df[~large_df['pair_id'].isin(valid_ids['pair_id'])]

In [13]:
# Medium-sized pair file, with the id / cluster / identifier bookkeeping
# columns removed right after loading.
data_df = pd.read_json(
    './data/computers_train/computers_train_medium.json.gz',
    compression='gzip',
    lines=True,
).drop(
    columns=[
        'id_left', 'id_right',
        'cluster_id_left', 'cluster_id_right',
        'identifiers_left', 'identifiers_right',
    ]
)

# pair_ids that belong to the validation split.
valid_ids = pd.read_csv('./data/computers_valid/computers_valid_medium.csv')

# Partition the pairs: listed pair_ids go to validation, the rest to training.
train_df = data_df[~data_df['pair_id'].isin(valid_ids['pair_id'])]
valid_df = data_df[data_df['pair_id'].isin(valid_ids['pair_id'])]

In [15]:


def get_similarity_vectorizer(value_left, value_right, vectorizer):
    """Cosine similarity between two texts in a fitted vectorizer's space.

    Args:
        value_left: Text of the left record, or a missing value (None / NaN).
        value_right: Text of the right record, or a missing value (None / NaN).
        vectorizer: Fitted object whose ``transform`` accepts a list of texts
            (callers in this notebook pass a fitted ``TfidfVectorizer``).

    Returns:
        None when either side is missing, 1.0 when both texts are identical,
        otherwise the cosine similarity of the two transformed vectors.
    """
    # pd.isna covers None as well as float NaN / pd.NA. A plain `== None`
    # comparison lets NaN through, and NaN is not a valid document to transform.
    if pd.isna(value_left) or pd.isna(value_right):
        return None

    # Identical texts short-circuit to 1.0 without touching the vectorizer.
    if value_left == value_right:
        return 1.0

    tf_idf_left = vectorizer.transform([value_left])
    tf_idf_right = vectorizer.transform([value_right])
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    return cosine_similarity(tf_idf_left, tf_idf_right)[0][0]


def _tfidf_pair_similarity(df, field):
    """Row-wise TF-IDF cosine similarity of ``<field>_left`` vs ``<field>_right``."""
    left_col, right_col = f"{field}_left", f"{field}_right"
    # Fit one vectorizer on both sides so they share a vocabulary. Missing
    # entries are dropped first because they are not valid documents to fit on.
    corpus = pd.concat([df[left_col], df[right_col]]).dropna()
    vectorizer = TfidfVectorizer().fit(corpus.values)
    return df.apply(
        lambda row: get_similarity_vectorizer(row[left_col], row[right_col], vectorizer),
        axis=1)


def compute_features(df):
    """Build the numeric similarity features for a frame of record pairs.

    Args:
        df: DataFrame with the columns ``title_left``/``title_right``,
            ``description_left``/``description_right``,
            ``brand_left``/``brand_right`` and ``label``.

    Returns:
        DataFrame with the columns ``title_sim``, ``description_sim``,
        ``brand_sim`` (TF-IDF cosine similarity per pair, missing when either
        side is missing) and ``label`` (copied from ``df``).
    """
    # TODO: title, brand and keyValue using Tfidf / encode all sentences at once to sbert
    # NOTE(review): each call fits its vectorizers on the `df` it receives, so
    # separate calls (e.g. train vs. validation) use different vocabularies and
    # IDF weights — confirm this is intended.
    num_df = pd.DataFrame()

    num_df['title_sim'] = _tfidf_pair_similarity(df, 'title')
    print("title done")

    num_df['description_sim'] = _tfidf_pair_similarity(df, 'description')
    print("descrition done")

    num_df['brand_sim'] = _tfidf_pair_similarity(df, 'brand')
    print("brand done")

    num_df['label'] = df['label']
    return num_df


In [16]:
# Similarity features + label for the training pairs (prints progress per field).
num_train_df = compute_features(train_df)

title done
descrition done
brand done


In [17]:
# Similarity features + label for the validation pairs (prints progress per field).
num_valid_df = compute_features(valid_df)

title done
descrition done
brand done


In [18]:
# Inspect the training features; missing similarities mean one side of the pair had no value.
num_train_df

Unnamed: 0,title_sim,description_sim,brand_sim,label
3,0.574667,,,1
4,0.586431,0.039588,1.0,1
5,0.641244,0.140730,,1
6,0.662503,,,1
7,0.212899,,,1
...,...,...,...,...
8089,0.144609,0.109454,0.0,0
8090,0.000000,0.000000,0.0,0
8091,0.295581,0.361471,1.0,0
8092,0.343734,0.207120,1.0,0


In [20]:
# Separate each feature frame into model inputs (every column except "label")
# and the target vector. The validation features serve as the "test" set here.
X_train, y_train = num_train_df.drop("label", axis=1), np.array(num_train_df["label"])

X_test, y_test = num_valid_df.drop("label", axis=1), np.array(num_valid_df["label"])


In [22]:
# Sanity check: total number of pairs across the train and validation feature frames.
len(num_train_df) + len(num_valid_df)

8094

In [21]:

# Wrap the feature frames and label vectors in XGBoost DMatrix containers.
Train = xgb.DMatrix(X_train, label=y_train)
Test = xgb.DMatrix(X_test, label=y_test)

# Booster configuration: shallow trees (depth 2), no row/column subsampling,
# logistic objective, AUC as the reported evaluation metric.
parameters = dict(
    learning_rate=0.3,
    max_depth=2,
    colsample_bytree=1,
    subsample=1,
    min_child_weight=1,
    gamma=0,
    random_state=1500,
    eval_metric="auc",
    objective="binary:logistic",
)

# 200 boosting rounds; the metric on Test (labelled "Yes") is logged every 50 rounds.
model = xgb.train(
    params=parameters,
    dtrain=Train,
    num_boost_round=200,
    evals=[(Test, "Yes")],
    verbose_eval=50,
)

# Threshold the model scores at 0.5 to obtain hard 0/1 labels.
pred = np.where(model.predict(Test) > 0.5, 1, 0)

# Per-class precision / recall / F1 on the held-out pairs.
# (If a confusion matrix is added later, do not bind it to the name
# `confusion_matrix` — that would shadow the imported sklearn function.)
report = classification_report(y_test, pred)
print(report)


[0]	Yes-auc:0.87963
[50]	Yes-auc:0.91091
[100]	Yes-auc:0.91172
[150]	Yes-auc:0.90917
[199]	Yes-auc:0.90705
              precision    recall  f1-score   support

           0       0.91      0.94      0.93      1267
           1       0.77      0.67      0.72       352

    accuracy                           0.88      1619
   macro avg       0.84      0.81      0.82      1619
weighted avg       0.88      0.88      0.88      1619

