## Properties

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle

In [3]:
properties = pd.concat([pd.read_csv("data/item_properties_part1.csv"), pd.read_csv("data/item_properties_part2.csv")])
properties = properties.sort_values("timestamp")
properties[:10]

Unnamed: 0,timestamp,itemid,property,value
5903679,1431226800000,317951,790,n32880.000
5668945,1431226800000,422842,480,1133979
314220,1431226800000,310185,776,103591
4170323,1431226800000,110973,112,679677
4170324,1431226800000,179597,available,0
8025843,1431226800000,260136,available,1
314210,1431226800000,138592,764,1285872
5668941,1431226800000,216269,364,336749
5668940,1431226800000,299944,764,1285872
5668939,1431226800000,146103,112,679677


In [4]:
categories = properties[
    (properties["property"] == "categoryid")
]["value"].unique()

In [5]:
items_in_categories = {
    category: set() for category in categories
}
for index, row in properties[properties["property"] == "categoryid"].iterrows():
    items_in_categories[row["value"]].add(row["itemid"])

In [6]:
len(properties["property"].unique())

1104

There are 1104 properties in the dataset, so I will split dataset into categories in order to reduce number of properties of each item.

In [7]:
file_path = "category_properties"
try:
    open(file_path, "x")
except FileExistsError:
    with open(file_path, "rb") as file:
        category_properties = pickle.load(file)
else:
    category_properties = {}
    for category, items in items_in_categories.items():
        category_properties[category] = properties[
            properties["itemid"].isin(items)
        ]["property"].unique()

    with open(file_path, "wb") as file:
        pickle.dump(category_properties, file)

category_properties["181"]

array(['categoryid', '790', '112', '364', '6', '159', '283', '448', '678',
       'available', '888', '764', '761', '172', '1036', '97', '202',
       '1054', '120', '698', '561', '595', '227', '963', '839', '892'],
      dtype=object)

`category_properties` contains array of properties for each category.

In [8]:
properties_sets = [set(prop_list) for _, prop_list in category_properties.items() if len(prop_list) > 0]
common_properties = set.intersection(*properties_sets)
common_properties

{'112',
 '159',
 '283',
 '364',
 '678',
 '764',
 '790',
 '888',
 'available',
 'categoryid'}

The properties above have values in every category.

In [9]:
# file_path = "items"
# try:
#     open(file_path, "x")
# except FileExistsError:
#     with open(file_path, "rb") as file:
#         items = pickle.load(file)
# else:
#     items = {
#         item: {} for item in properties["itemid"].unique()
#     }
#     for index, row in properties.iterrows():
#         items[row["itemid"]][row["property"]] = row["value"]
#     with open(file_path, "wb") as file:
#         pickle.dump(items, file)


`item` contains properties' names and their values for every item in the dataset.

In [10]:
try:
    os.mkdir("categories")
except FileExistsError:
    pass

try:
    open("categories/0.csv", "x")
except FileExistsError:
    pass
else:
    for category, cat_props in category_properties.items():
        data = {
            property: [] for property in cat_props
        }
        for item in items_in_categories[category]:
            for property in cat_props:
                data[property].append(items[item].get(property, None))
        pd.DataFrame(data).to_csv(f"categories/{category}.csv")


The code snippet above generates `.csv` files describing items of each category. Only properties relevant for such a category are used. The dataframe is displayed below.

In [29]:
pd.read_csv("categories/0.csv")[:5]

Unnamed: 0.1,Unnamed: 0,categoryid,159,888,776,764,917,available,678,364,...,325,202,476,550,713,810,928,206,1097,674
0,0,0,519769,34084 906385 n126.000 1175087 n318.000 1175087...,98587.0,1285872,n863064.000,0,479758 1021001,691063,...,,34084 906385,,769062.0,,n126.000 1175087 n318.000 1175087 n270.000 424566,769062,,,
1,1,0,519769,342856 357845 784581 1297729 n12.000 908104,570014.0,1285872,1198079,0,522723,1035579,...,,342856 357845,,769062.0,784581.0,769062,769062,,,
2,2,0,519769,981066 784581 1297729 n72.000 309206,554384.0,1285872,n48127524.000,1,175375,359636,...,,981066,,769062.0,784581.0,769062,769062,,n48.000 655992,
3,3,0,519769,551560 237874 971646 n228.000 1175087 n144.000...,1135782.0,1285872,n26748.000,0,219437,857861,...,,551560 237874 971646,,769062.0,370498.0,n228.000 1175087 n144.000 1175087 n96.000 424566,769062,,,
4,4,0,519769,581542 370498 1297729 n12.000 908104,906315.0,1285872,n350760.000,0,449783 753764,377300,...,,581542,,769062.0,370498.0,769062,769062,,,


### Property data types

- numerical (fields containing prefix `n`)
- textual (stemmed and hashed)
- optional (some items have no value)
- categorical (or just one-word textual?)

In [12]:
item_properties = properties["property"].unique()
numerical = {
    property: True for property in item_properties
}
optional = {
    property: False for property in item_properties
}

for category, _ in category_properties.items():
    df = pd.read_csv(f"categories/{category}.csv")
    for column in df.columns[1:]:
        if column == "itemid":
            continue
        numerical[column] &= (df[column].dtype == object) and df[column].str.startswith("n").all()
        optional[column] |= pd.isna(df[column]).any()


  df = pd.read_csv(f"categories/{category}.csv")
  df = pd.read_csv(f"categories/{category}.csv")


`numerical` and `optional` are dictionaries containing (`property`, `value`) pairs, wehere `value` is `True` if corresponding `property` is a numerical, resp. optional. property.

In [13]:
sum(numerical.values())

71

There are 71 numerical properties in the dataset.

In [15]:
sum(optional.values())

1033

There are 1033 optional properties in the dataset.

In [14]:
props = set(item_properties)
numerical_props = set([category for category, result in numerical.items() if result])
optional_props = set([category for category, result in optional.items() if result])

In [16]:
print(numerical_props)

{'282', '674', '203', '472', '1075', '762', '650', '703', '870', '246', '1100', '279', '288', '482', '1005', '700', '221', '1018', '1063', '389', '874', '900', '211', '261', '626', '532', '4', '239', '923', '881', '790', '367', '778', '75', '943', '622', '1091', '218', '1039', '424', '598', '677', '383', '641', '967', '1072', '244', '568', '699', '615', '18', '633', '255', '877', '782', '861', '37', '443', '495', '241', '191', '162', '791', '299', '150', '1082', '331', '685', '878', '289', '381'}


In [17]:
print(props - optional_props)

{'70', '665', '744', 'categoryid', '179', '283', '517', '902', '1009', '81', '47', '1030', '977', '318', '634', '1046', '91', '394', '740', '159', '599', '144', '522', '261', '1078', '417', '242', '764', '1019', '57', '112', '587', '198', '923', '60', '17', '364', '1014', '790', '881', '644', '424', '1091', '557', '396', '407', 'available', '514', '1072', '244', '890', '488', '358', '1083', '1093', '888', '498', '27', '754', '196', '439', '1062', '774', '490', '368', '69', '828', '834', '442', '624', '685'}


In [18]:
print(set.intersection(numerical_props, props - optional_props))

{'424', '1091', '261', '1072', '244', '923', '685', '881', '790'}


In [19]:
print(set.intersection(numerical_props, optional_props))

{'282', '203', '674', '1075', '762', '650', '703', '870', '1100', '246', '288', '279', '1005', '482', '700', '221', '1018', '1063', '389', '874', '900', '211', '626', '532', '4', '239', '367', '778', '75', '943', '622', '1039', '218', '598', '677', '878', '383', '641', '967', '568', '699', '615', '18', '633', '255', '877', '782', '861', '37', '495', '241', '191', '162', '791', '299', '150', '1082', '331', '472', '443', '289', '381'}


In [20]:
cat_num_props = {}
for category, cat_props in category_properties.items():
    cat_num_props[category] = set.intersection(set(cat_props), numerical_props)

In [21]:
for cat, num_props in list(sorted(cat_num_props.items(), key=lambda x: -len(x[1])))[:5]:
    print(f"Category {cat:>4} contains {len(num_props)} numerical properties: {num_props}")

Category 1305 contains 9 numerical properties: {'778', '282', '877', '1063', '443', '241', '641', '699', '790'}
Category 1403 contains 7 numerical properties: {'782', '598', '383', '331', '699', '790', '367'}
Category  985 contains 7 numerical properties: {'381', '203', '861', '677', '1100', '790', '482'}
Category 1554 contains 7 numerical properties: {'18', '255', '37', '211', '472', '289', '790'}
Category  522 contains 7 numerical properties: {'700', '633', '1039', '261', '4', '1072', '790'}


The categories above contain the most numerical properties among all categories.

In [30]:
print(set.intersection(props - optional_props, props - numerical_props))

{'70', '665', '744', 'categoryid', '179', '283', '517', '902', '1009', '81', '47', '1030', '318', '634', '1046', '91', '394', '740', '159', '599', '144', '522', '1078', '417', '242', '764', '1019', '57', '112', '587', '198', '624', '60', '17', '364', '1014', '644', '396', '557', '407', 'available', '514', '488', '890', '358', '1083', '1093', '888', '498', '27', '754', '196', '439', '1062', '774', '490', '368', '69', '828', '834', '442', '977'}


In [31]:
print(set.intersection(props - optional_props, props - numerical_props, common_properties))

{'categoryid', '159', 'available', '283', '764', '112', '364', '888'}


The properties above are non-optional, non-numerical properties, that have values in all categories.

In [32]:
common_textual_properties = set.intersection(props - optional_props, props - numerical_props, common_properties)
lengths = {}
for prop in common_textual_properties:
    if prop in {"categoryid", "available"}:
        continue
    lengths[prop] = properties[
        (properties["property"] == prop)
    ]["value"].apply(len).mean()

In [36]:
dict(sorted(lengths.items(), key=lambda x: -x[1]))

{'283': 131.89874945390088,
 '888': 34.985753223405695,
 '764': 7.0,
 '364': 6.170133015450611,
 '159': 6.0,
 '112': 6.0,
 'categoryid': 3.4388655872643725,
 'available': 1.0}

The textual property with the longest average value is property `283`, which could represent the item's description.

The second longest property is property `888`. 35 characters is approximately 5 words (a word is typically encoded 6- or 7-digit hash number), this could be the item's name, because the following property contains only one word on average.