In [1]:
import pandas as pd
import numpy as np

full_outfits = pd.read_parquet("manual_outfits.parquet")
full_outfits.rename(columns = {'outfit_id':'set_id'}, inplace = True)
full_outfits.head()

Unnamed: 0,products,set_id
0,"[15360881, 15379678, 15781925, 16204075, 16260...",0
1,"[13893589, 13893721, 15426616, 16035469, 17173...",1
2,"[13508028, 14161732, 16160567, 17484491, 17503...",2
3,"[16127776, 16756133, 17040752, 18203427, 18205...",3
4,"[14480467, 15487690, 17257765]",4


In [2]:
def np_to_int(column):
    column_list = column.tolist()
    return pd.series([item.item() for item in column_list])

In [3]:
print(type(full_outfits.loc[0, "set_id"]))

<class 'numpy.int64'>


In [4]:
full_outfits["set_id"] = full_outfits.apply(lambda row : np_to_int(row["set_id"]), axis=1)

AttributeError: 'int' object has no attribute 'tolist'

In [5]:
type(full_outfits.loc[0, "set_id"])

numpy.int64

In [6]:
def index_dict(item_list, key_list):
    index_list = [i + 1 for i in range(len(item_list))]
    lists_together = [item for pair in zip(item_list, index_list) for item in pair]
    n = len(lists_together)
    result = [{key_list[0]: lists_together[idx], key_list[1]: lists_together[idx + 1]}
       for idx in range(0, n, 2)]
    return result

In [7]:
key_list = ["item_id", "index"]
full_outfits["items"] = full_outfits.apply(lambda row : index_dict(row["products"], key_list), axis=1)
full_outfits = full_outfits.drop("products", axis=1)
full_outfits.head()

Unnamed: 0,set_id,items
0,0,"[{'item_id': 15360881, 'index': 1}, {'item_id'..."
1,1,"[{'item_id': 13893589, 'index': 1}, {'item_id'..."
2,2,"[{'item_id': 13508028, 'index': 1}, {'item_id'..."
3,3,"[{'item_id': 16127776, 'index': 1}, {'item_id'..."
4,4,"[{'item_id': 14480467, 'index': 1}, {'item_id'..."


In [55]:
print(type(full_outfits.loc[0, "items"][0]['item_id']))

<class 'numpy.int64'>


In [8]:
import numpy as np

train_outfits, validate_outfits, test_outfits = np.split(full_outfits.sample(frac=1, random_state=42), 
                       [int(.8*len(full_outfits)), int(.86*len(full_outfits))])

In [9]:
train_list = train_outfits.to_dict("records")
validate_list = validate_outfits.to_dict("records")
test_list = test_outfits.to_dict("records")

In [10]:
print(train_list[0])
print(type(train_list[0]['items'][0]['item_id']))

{'set_id': 4941, 'items': [{'item_id': 14186436, 'index': 1}, {'item_id': 14380551, 'index': 2}, {'item_id': 16187377, 'index': 3}, {'item_id': 16790471, 'index': 4}]}
<class 'numpy.int64'>


In [19]:
import json

class NumpyEncoder(json.JSONEncoder):
    """ Custom encoder for numpy data types """
    def default(self, obj):
        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
                            np.int16, np.int32, np.int64, np.uint8,
                            np.uint16, np.uint32, np.uint64)):

            return int(obj)

        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)

        elif isinstance(obj, (np.complex_, np.complex64, np.complex128)):
            return {'real': obj.real, 'imag': obj.imag}

        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()

        elif isinstance(obj, (np.bool_)):
            return bool(obj)

        elif isinstance(obj, (np.void)): 
            return None

        return json.JSONEncoder.default(self, obj)

In [18]:
sets = ["train", "validate", "test"]

with open('train_no_dup.json', 'w') as f:
    json.dump(train_list, f, cls=NumpyEncoder)
# for s in sets:
#     with open(f"{s}_no_dup.json", 'w') as f:
#         json.dump(globals()[f"{s}_list"], f, cls='NumpyEncoder')

In [20]:
with open('valid_no_dup.json', 'w') as f:
    json.dump(validate_list, f, cls=NumpyEncoder)

with open('test_no_dup.json', 'w') as f:
    json.dump(test_list, f, cls=NumpyEncoder)