In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/list-product-1/list_products.json
/kaggle/input/product-pairs-final/negative_examples_40000.npz
/kaggle/input/product-pairs-final/positive_examples_40000.npz
/kaggle/input/embedding-vectors-40k/product_name_embedding_vector_40000.npz
/kaggle/input/embedding-vectors-40k/image_embedding_vector_40000.npz
/kaggle/input/product-pair-examples/negative_examples.npz
/kaggle/input/product-pair-examples/positive_examples.npz


In [42]:
import random
import math
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from statistics import mean, stdev
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import RandomizedSearchCV

In [31]:
# Product table, read from a file that stores one JSON record per line.
# Later cells read its "price" and "group_id" columns.
PRODUCTS_JSON_PATH = "/kaggle/input/list-product-1/list_products.json"
df = pd.read_json(PRODUCTS_JSON_PATH, lines=True)

In [32]:
# Open the two .npz archives that hold the precomputed embeddings.
data_product_names = np.load('/kaggle/input/embedding-vectors-40k/product_name_embedding_vector_40000.npz')
data_images = np.load('/kaggle/input/embedding-vectors-40k/image_embedding_vector_40000.npz')

# Pull the embedding arrays out of the archives by key.
# presumably one row per product, in the same order as `df` -- TODO confirm
vectorized_product_names = data_product_names["vectorized_product_names"]
vectorized_product_images = data_images["vectorized_product_images"]

# All-pairs cosine similarity: entry [i][j] compares row i with row j of the
# corresponding embedding array.
cosine_similarity_product_images = cosine_similarity(X=vectorized_product_images)
cosine_similarity_product_names = cosine_similarity(X=vectorized_product_names)

In [36]:
# Product-pair archives: one file of negative pairs, one of positive pairs.
# NOTE(review): /kaggle/input also lists product-pairs-final/*_examples_40000.npz;
# confirm that the archives loaded here are the ones matching the 40k embeddings.
data_positive_examples = np.load('/kaggle/input/product-pair-examples/positive_examples.npz')
data_negative_examples = np.load('/kaggle/input/product-pair-examples/negative_examples.npz')

positive_examples = data_positive_examples["positive_examples"]
negative_examples = data_negative_examples["negative_examples"]

# Stack along the first axis: all negative pairs first, then all positive pairs.
# presumably each row holds two product indices -- later cells read row[0] and row[1]
examples = np.concatenate([negative_examples, positive_examples])

In [None]:
def _build_pair_features(pairs, name_similarity, image_similarity, products):
    """Build the feature matrix and label vector for a collection of product pairs.

    For every pair ``(a, b)`` one feature row ``[name_sim, image_sim, price_gap]``
    is produced, where ``name_sim`` and ``image_sim`` are looked up at ``[a][b]``
    in the two similarity matrices and ``price_gap`` is
    ``abs(price[a] - price[b])``. The label is 1 when both products carry the
    same ``group_id`` and 0 otherwise.

    Parameters
    ----------
    pairs : iterable
        Items whose elements 0 and 1 identify the two products of a pair.
    name_similarity, image_similarity : 2-D indexable
        Pairwise similarity scores, indexed as ``matrix[a][b]``.
    products : pandas.DataFrame
        Table addressed with ``.at[label, column]``; must provide the
        ``"price"`` and ``"group_id"`` columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` with ``X`` of dtype float and shape ``(n_pairs, 3)`` and
        ``y`` of dtype int and shape ``(n_pairs,)``.
    """
    rows = []
    labels = []
    for pair in pairs:
        a, b = pair[0], pair[1]
        price_a = products.at[a, "price"]
        price_b = products.at[b, "price"]
        price_gap = abs(price_a - price_b)
        if math.isnan(price_gap):
            # Diagnostic only: the row is still kept, so a NaN feature reaches X.
            print(price_a, price_b, price_a - price_b)
        rows.append((name_similarity[a][b], image_similarity[a][b], price_gap))
        same_group = products.at[a, "group_id"] == products.at[b, "group_id"]
        labels.append(1 if same_group else 0)

    # Convert once at the end instead of np.append per iteration (np.append
    # copies the whole array on every call). The reshape keeps the (0, 3)
    # shape when there are no pairs at all.
    features = np.array(rows, dtype=float).reshape(-1, 3)
    return features, np.array(labels, dtype=int)


X, y = _build_pair_features(examples,
                            cosine_similarity_product_names,
                            cosine_similarity_product_images,
                            df)

In [81]:
# Candidate hyper-parameter values for the random-forest search.
# Each key is a RandomForestClassifier argument; each list holds the values to try.
param_grid = {
    "n_estimators": [100, 150, 200],
    "max_features": ["sqrt", "log2", None],
    "max_depth": [10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"],
}

# Stratified 5-fold splitter; shuffling is seeded so the folds are repeatable.
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=1)

In [82]:
# Randomized hyper-parameter search for the product-pair classifier.
# - `n_iter=10` candidates are sampled from `param_grid`, and each one is
#   cross-validated with the `skf` splitter.
# - Four metrics are recorded per candidate; `refit="f1"` makes F1 the metric
#   that selects `best_params_` / `best_index_` and refits the best estimator.
# - Both the candidate sampling and the forests are seeded, so re-running the
#   cell should give the same candidates and the same scores.
SEARCH_SEED = 1
rf_RandomGrid = RandomizedSearchCV(estimator = RandomForestClassifier(random_state=SEARCH_SEED),
                                   param_distributions = param_grid,
                                   cv = skf,
                                   scoring= ["accuracy", "precision", "recall", "f1"],
                                   refit="f1",
                                   verbose=2,
                                   n_jobs = -1,  # use every available core
                                   n_iter= 10,
                                   random_state=SEARCH_SEED)
rf_RandomGrid.fit(X, y)
print(rf_RandomGrid.best_params_)
print(rf_RandomGrid.cv_results_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 30, 'criterion': 'gini'}
{'mean_fit_time': array([18.06082635, 26.16451588, 38.81522131, 26.64969435, 72.52552962,
       34.08457518, 33.91552696, 22.01368704, 44.26650496, 42.04866571]), 'std_fit_time': array([0.3186715 , 0.36993262, 0.53590314, 0.12468152, 0.56717219,
       0.26159607, 1.48779046, 0.80970776, 1.15487557, 5.76806563]), 'mean_score_time': array([0.31282496, 0.49241657, 0.2911211 , 0.50690141, 0.50307002,
       0.62212362, 0.60839033, 0.412216  , 0.73314395, 0.29150872]), 'std_score_time': array([0.01078446, 0.02654753, 0.01098864, 0.07500756, 0.06265688,
       0.02012435, 0.05224268, 0.06285697, 0.05813717, 0.04165815]), 'param_n_estimators': masked_array(data=[100, 150, 100, 150, 200, 200, 200, 100, 200, 100],
             mask=[False, False, False, False, False, False, False, False,
                

In [83]:
# Report the winning configuration and its mean cross-validated scores.
best_params = rf_RandomGrid.best_params_
best_index = rf_RandomGrid.best_index_
print("Best params =", best_params)

# `cv_results_` holds one entry per sampled candidate under the key
# "mean_test_<metric>"; `best_index` selects the winning candidate's entry.
for label, metric in (("Accuracy", "accuracy"),
                      ("Precision", "precision"),
                      ("Recall", "recall"),
                      ("F1", "f1")):
    print(label, "=", rf_RandomGrid.cv_results_[f"mean_test_{metric}"][best_index])

Best params = {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 30, 'criterion': 'gini'}
Accurary = 0.9833340302287013
Precision = 0.9851911794383961
Recall = 0.9813874935251106
F1 = 0.983284075826553
[CV] END criterion=entropy, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=  18.7s
[CV] END criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=150; total time=  26.6s
[CV] END criterion=entropy, max_depth=30, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=  38.3s
[CV] END criterion=entropy, max_depth=30, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=  39.1s
[CV] END criterion=entropy, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=150; total time=  27.0s
[CV] END criterion=entropy, max_depth=10, max_feat

In [None]:

# Final classifier built from the tuned hyper-parameters.
# Unpacking `best_params` forwards every tuned setting (n_estimators,
# max_depth, max_features, min_samples_split, min_samples_leaf, criterion)
# without re-listing the keys by hand, so it cannot drift from the search
# space. The fixed seed makes the trained forest repeatable across runs.
rf = RandomForestClassifier(**best_params, random_state=1)

In [20]:
import pickle

# Train the final forest on the full feature matrix and label vector.
rf.fit(X, y)

# Persist the fitted model. The object written must be `rf`, the estimator
# trained just above.
with open('product_matching_model.pkl', 'wb') as f:
    pickle.dump(rf, f)