In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import sys
import warnings
import json
from tqdm import tqdm

warnings.filterwarnings("ignore")
%matplotlib inline


class TripletMiner:
    """Build (anchor, positive, negative) description triplets from a DataFrame.

    Rows that share a ``parent`` value provide the anchor/positive pairs, and
    every row whose ``parent`` differs provides a negative. The DataFrame must
    have ``parent`` and ``description`` columns.
    """

    def __init__(self, dataframe):
        """Store the source DataFrame and start with an empty triplet list.

        Args:
            dataframe: pandas DataFrame with ``parent`` and ``description``
                columns.
        """
        self.df = dataframe
        self.triplets = []

    def make_triplets(self):
        """Populate ``self.triplets`` with every valid triplet.

        For each ``parent`` group, every unordered pair of rows (earlier row
        as anchor, later row as positive) whose descriptions differ is
        combined with the description of every row outside the group.

        The list is rebuilt from scratch on each call, so calling this method
        repeatedly does not accumulate duplicate triplets.

        Raises:
            KeyError: if the ``parent`` or ``description`` column is missing.
        """
        triplets = []
        for name, group in self.df.groupby("parent"):
            positives = group["description"].tolist()
            if len(positives) < 2:
                # A single-row group cannot form an anchor/positive pair.
                continue
            # Only the description is used, so avoid materialising full rows.
            negatives = self.df.loc[
                self.df["parent"] != name, "description"
            ].tolist()
            for i, anchor in enumerate(positives):
                for positive in positives[i + 1:]:
                    # Checked once per pair rather than once per negative.
                    if positive == anchor:
                        continue
                    triplets.extend(
                        (anchor, positive, negative) for negative in negatives
                    )
        self.triplets = triplets

    def save_triplets(self, filepath, seed=None):
        """Write the triplets to ``filepath`` in shuffled order as JSON Lines.

        Each triplet is written as one JSON array per line. A copy is
        shuffled, so ``self.triplets`` keeps its generation order.

        Args:
            filepath: destination path; an existing file is overwritten.
            seed: optional seed for a private random generator, giving a
                reproducible order. When ``None`` the module-level ``random``
                state is used.
        """
        shuffled = list(self.triplets)
        if seed is None:
            random.shuffle(shuffled)
        else:
            random.Random(seed).shuffle(shuffled)
        with open(filepath, "w", encoding="utf-8") as outfile:
            for entry in shuffled:
                json.dump(entry, outfile)
                outfile.write("\n")


In [2]:
# Tunable settings for this cell.
SEED = 42  # fixes the shuffle order of the saved triplets so reruns match
N_ROWS = 100  # leading level-6 rows to mine; every positive pair is combined
#               with every negative, so the triplet count grows quickly

df = pd.read_csv("data/harmonized-system.csv")

# Filter and reindex in one chain instead of calling
# reset_index(inplace=True) on a filtered slice of `df`.
df_lv6 = df[df["level"] == 6].reset_index(drop=True)

# Seed the module-level RNG because saving shuffles the triplets with it.
random.seed(SEED)

triplet_miner = TripletMiner(df_lv6.head(N_ROWS))
triplet_miner.make_triplets()
triplet_miner.save_triplets(f"data/triplets_{N_ROWS}.jsonl")