# Lemon のモジュールを利用して、Magellan モデルを作成する

Magellan モデルは RandomForest

In [1]:
# Root directory handed to the lemon dataset loader functions as ``root=``.
dataset_out_root_dir = "../../data/lemon/datasets"
# Root directory for outputs: one sub-directory per dataset is created here to
# hold the pickled matcher and its pickled evaluation result.
model_out_root_dir = "../../data/lemon/model/magellan"
# Each entry is looked up by name as an attribute of
# ``lemon.utils.datasets.deepmatcher`` and called to load that dataset; one
# matcher is trained per entry, in this order.
dataset_names = [
    "structured_amazon_google",
    "structured_beer",
    "structured_dblp_acm",
    "structured_dblp_google_scholar",
    "structured_fodors_zagat",
    "structured_walmart_amazon",
    "structured_itunes_amazon",
    "dirty_dblp_acm",
    "dirty_dblp_google_scholar",
    "dirty_walmart_amazon",
    "dirty_itunes_amazon",
    "textual_abt_buy",
    "textual_company",
]

In [2]:
# A model created by lemon's MagellanMatcher cannot be saved and loaded as-is,
# so the patches below are applied to make saving and loading the model work.


# Magellan imports Tkinter, but we don't need it for our case
class DummyTkinterFrame:
    """Empty placeholder exposed as ``Frame`` on the dummy Tkinter stub."""


class DummyTkinter:
    """Stand-in registered under the ``Tkinter`` module name; exposes only ``Frame``."""
    Frame = DummyTkinterFrame

import sys

# Register the stub under the "Tkinter" module name only for the duration of
# the py_entitymatching import (the GUI is not needed here).
sys.modules["Tkinter"] = DummyTkinter
try:
    import py_entitymatching.feature.autofeaturegen
finally:
    # Always remove the stub again, even when the import above fails, so a
    # fake "Tkinter" module never lingers in sys.modules for later imports.
    sys.modules.pop("Tkinter", None)


def _get_features_for_type_mod(column_type):
    """
    Return the feature functions registered for ``column_type``.

    The type name is compared by equality against the known column type
    names, and the matching entry of py_entitymatching's feature lookup table
    is returned.

    Raises:
        TypeError: if ``column_type`` is not one of the known type names.
    """
    # The table is fetched up front, before the type name is validated.
    feature_table = py_entitymatching.feature.autofeaturegen._get_feat_lkp_tbl()

    # (column type name, lookup-table key) pairs, checked in this order.
    type_to_table_key = (
        ("str_eq_1w", "STR_EQ_1W"),
        ("str_bt_1w_5w", "STR_BT_1W_5W"),
        ("str_bt_5w_10w", "STR_BT_5W_10W"),
        ("str_gt_10w", "STR_GT_10W"),
        ("numeric", "NUM"),
        ("boolean", "BOOL"),
        ("un_determined", "UN_DETERMINED"),
    )
    for type_name, table_key in type_to_table_key:
        if column_type == type_name:
            return feature_table[table_key]

    raise TypeError("Unknown type")


# Monkey patch: rebind the private ``_get_features_for_type`` attribute of
# py_entitymatching's ``autofeaturegen`` module to the local
# ``_get_features_for_type_mod`` implementation.
py_entitymatching.feature.autofeaturegen._get_features_for_type = (
    _get_features_for_type_mod
)

In [3]:
import pickle
import pathlib

import lemon.utils.datasets.deepmatcher
import lemon

def _evaluate_on_test(matcher, dataset):
    """Return ``matcher.evaluate(...)`` computed on the test split of ``dataset``."""
    return matcher.evaluate(
        dataset.test.records.a,
        dataset.test.records.b,
        dataset.test.record_id_pairs,
        dataset.test.labels,
    )


# For every dataset: train a matcher, evaluate it, pickle the matcher and its
# evaluation result, then reload the pickle and check it evaluates the same.
for dataset_name in dataset_names:
    print(dataset_name)
    load_dataset_func = getattr(lemon.utils.datasets.deepmatcher, dataset_name)
    dataset = load_dataset_func(root=dataset_out_root_dir)
    matcher = lemon.utils.matchers.MagellanMatcher()
    print("training...")
    matcher.fit(
        dataset.train.records.a,
        dataset.train.records.b,
        dataset.train.record_id_pairs,
        dataset.train.labels,
    )
    print("training...done")
    eval_result = _evaluate_on_test(matcher, dataset)
    print(eval_result)

    print("saving...")
    # Build the per-dataset output directory once and reuse it below.
    model_dir = pathlib.Path(model_out_root_dir) / dataset_name
    model_dir.mkdir(parents=True, exist_ok=True)
    model_path = model_dir / "model.pickle"
    with model_path.open("wb") as f:
        pickle.dump(matcher, f)
    with (model_dir / "eval_result.pickle").open("wb") as f:
        pickle.dump(eval_result, f)

    print("reload test...")
    # NOTE: this only loads the file written just above. Never unpickle files
    # from an untrusted source -- pickle.load can execute arbitrary code.
    with model_path.open("rb") as f:
        matcher_reload = pickle.load(f)
    eval_result_reload = _evaluate_on_test(matcher_reload, dataset)
    print(eval_result_reload)
    # Make the reload test an actual check instead of relying on eyeballing
    # the two printed results.
    assert eval_result_reload == eval_result, (
        f"{dataset_name}: reloaded matcher evaluates differently "
        f"({eval_result_reload} != {eval_result})"
    )

structured_amazon_google
training...
training...done
{'precision': 0.6903225806451613, 'recall': 0.45726495726495725, 'f1': 0.5501285347043702}
saving...
reload test...
{'precision': 0.6903225806451613, 'recall': 0.45726495726495725, 'f1': 0.5501285347043702}
structured_beer
training...
training...done
{'precision': 0.7368421052631579, 'recall': 1.0, 'f1': 0.8484848484848484}
saving...
reload test...
{'precision': 0.7368421052631579, 'recall': 1.0, 'f1': 0.8484848484848484}
structured_dblp_acm
training...
training...done
{'precision': 0.9778761061946902, 'recall': 0.9954954954954955, 'f1': 0.9866071428571428}
saving...
reload test...
{'precision': 0.9778761061946902, 'recall': 0.9954954954954955, 'f1': 0.9866071428571428}
structured_dblp_google_scholar
training...
training...done
{'precision': 0.9343955014058107, 'recall': 0.9317757009345794, 'f1': 0.933083762283575}
saving...
reload test...
{'precision': 0.9343955014058107, 'recall': 0.9317757009345794, 'f1': 0.933083762283575}
struct