In [None]:
import pandas as pd
import numpy as np
import os
import glob
import json
import demjson
import re
import uuid
import mongoengine as mdb
from datetime import datetime

In [None]:
# Windows locations of the scraped Stadium Goods data.
# Raw strings keep every backslash literal; the plain-string form relied on
# unrecognised escape sequences, which newer Python versions warn about.
source_path = r"C:\source\data\scraped\stadium_goods\original-details.csv"
path_to_save = r"C:\source\data\scraped\stadium_goods"
addt_source_path = r"C:\source\data\scraped\stadium_goods\categories.csv"

In [None]:
def to_dict(data):
    """Convert an iterable of documents into a list of plain dictionaries.

    Every item must expose ``to_mongo().to_dict()``.
    """
    return [r.to_mongo().to_dict() for r in data]


def to_dataframe(data):
    """Build a DataFrame from an iterable of documents.

    The columns are the keys of the first document, minus its first key
    (presumably MongoDB's ``_id`` -- verify against the collection).

    The input is converted exactly once, so querysets are not fetched twice
    and one-shot iterables (generators) are handled correctly. An empty input
    yields an empty DataFrame instead of raising ``IndexError``.
    """
    dicts = to_dict(data)
    if not dicts:
        return pd.DataFrame()
    fields = list(dicts[0].keys())
    return pd.DataFrame(dicts, columns=fields[1:])

In [None]:
# Drop any connection left over from a previous run of this cell, then
# reconnect to the sneakerResaleDB database.
mdb.disconnect()
# Credentials are read from the environment so that no secret is stored in the
# notebook: MONGO_PASSWORD is required (a missing variable raises KeyError),
# MONGO_USERNAME is optional and defaults to "root".
# NOTE(review): the password that used to be hardcoded here should be rotated,
# since it remains in the file history.
mdb.connect(
    db="sneakerResaleDB",
    host="mongodb://db.kicksware.com:443",
    username=os.environ.get("MONGO_USERNAME", "root"),
    password=os.environ["MONGO_PASSWORD"],
    authentication_source="admin",
    ssl=True,
    ssl_ca_certs="/source/certs/mongo/ca.pem"
)


class SneakerBrand(mdb.Document):
    """Brand document mapped to the ``brands`` collection."""
    # Stored under the key "uniqueid".
    unique_id = mdb.StringField(db_field="uniqueid")
    name = mdb.StringField()
    # NOTE(review): `logo` and `hero` are presumably image locations -- confirm.
    logo = mdb.StringField()
    hero = mdb.StringField()
    description = mdb.StringField()
    meta = {"collection": "brands"}


class SneakerModel(mdb.Document):
    """Model document mapped to the ``models`` collection."""
    # Stored under the key "uniqueid".
    unique_id = mdb.StringField(db_field="uniqueid")
    name = mdb.StringField()
    # NOTE(review): `hero` is presumably an image location -- confirm.
    hero = mdb.StringField()
    description = mdb.StringField()
    meta = {"collection": "models"}


class SneakerReference(mdb.Document):
    """Sneaker reference document mapped to the ``references`` collection.

    Python attribute names use snake_case; where a ``db_field`` is given, the
    value is stored under that key instead. ``strict`` is set to ``False`` in
    ``meta`` (presumably so stored keys that are not declared here do not
    raise on load -- confirm against the collection).

    The declaration order of the fields is kept as-is on purpose: it is not
    known whether downstream column ordering depends on it.
    """
    unique_id = mdb.StringField(db_field="uniqueid")
    manufacture_sku = mdb.StringField(db_field="manufacturesku")
    brand_name = mdb.StringField(db_field="brandname")
    model_name = mdb.StringField(db_field="modelname")
    base_model_name = mdb.StringField(db_field="basemodelname")
    brand = mdb.StringField()
    model = mdb.StringField()
    basemodel = mdb.StringField()
    description = mdb.StringField()
    # db_field is passed by keyword, consistent with every other field here.
    release_date = mdb.DateField(db_field="releasedate")
    # NOTE(review): no db_field here, so the stored key keeps its underscore,
    # unlike the other multi-word fields -- confirm this is intended.
    release_strdate = mdb.StringField()
    color = mdb.StringField()
    gender = mdb.StringField()
    nickname = mdb.StringField()
    price = mdb.DecimalField()
    materials = mdb.ListField()
    categories = mdb.ListField()
    image_link = mdb.StringField(db_field="imagelink")
    image_links = mdb.ListField(db_field="imagelinks")
    stadium_url = mdb.StringField(db_field="stadiumurl")
    meta = {"collection": "references", "strict": False}

In [None]:
# Load the documents returned by SneakerReference.objects() into a DataFrame
# and report how many rows were read.
df = to_dataframe(SneakerReference.objects())
print(f"Initial: {len(df)}")
# Use the fully qualified option name: the short pattern "max_rows" also
# matches "styler.render.max_rows" on newer pandas and is rejected as ambiguous.
pd.set_option("display.max_rows", 200)

In [None]:
# Collapse rows that share a `uniqueid` into one row per id and report the
# resulting row count.
# NOTE(review): GroupBy.first() takes the first non-null value per column, so
# a result row may combine values from several duplicate rows; it also moves
# `uniqueid` from a column to the index -- confirm both are intended.
gk = df.groupby("uniqueid")
df = gk.first()
print(f"Distinct: {len(df)}")

In [None]:
# Distinct values of the `brand` column, in order of first appearance.
df["brand"].unique()

Filter data:

In [None]:
# '|'-separated alternatives used as the pattern for
# `qdf["modelname"].str.contains(...)` in a later cell; rows whose model name
# matches any alternative are dropped there.
# Several alternatives are repeated (Pocket, Bodysuit, Crew, Strapback,
# Pullover) and some are prefixes of others (Pant/Pants, Sock/Socks, Hoo/Hooded).
# NOTE(review): short alternatives such as "Hoo", "Look", "Head", "Body" or
# "Lock" match as substrings and may also exclude real sneaker names -- confirm.
keywords_query = '''Pant|Pants|Coat|Shirt|Reverse|Jacket|Crew|Tee|Pullover|Strapback|Hat|Short|Bottle|Sock|Socks|Hoodie|Beanie|Bomber|Sleeveless|Sweatpants|Sleeve|Sweatshirt|Overshirt|Bandana|Bearbrick|Bag|Belt|Poncho|Parka|Case|Bodysuit|Sunglasses|Glasses|Dress|Stitch|Jeans|Leggings|Lock|Luggage|Gilet|Wipes|Essential Kit|Sneaker Box|Wallet|Chain|Stone Island|Swim|Pullove|Famous|Pablo|Romantic|Jersey Top|Tank Top|Track Top|Anorak|Joggi|Zimmermann|Cubs|Cushion|Flower|Football|T-Sh|Kimono|Astroworld|Pajama|Pocket|Knife|Organizer|Pocket|Purse|Look|Basebal|Bodysuit|Crew|Crewneck|Shoulder|Sweater|Camper|Camera|Holder|Card|Strapback|Denim|Body|Hands|Zippy|Script|Swea|Embroidered|Heavyweigh|Neck|Pillow|Hooded|Striped|Cotton|HAT|Button|Manifestation|Collectible|Real Men|Pullover|Balaclava|Skateboard|Popsockets|Polo|Head|Full Zip|Hoo|HOODIE|JACKET'''
# Brand names kept by the `isin` filter on the `brandname` column in a later
# cell; the comparison is an exact match on these strings.
brand_query = [
    "Ewing", "Jordan", "Nike", "Adidas", "Reebok", "Asics", "Puma",
    "New Balance", "Fila", "Converse", "Vans", "Diadora",
    "Saucony", "Under Armour", "Timberland", "Clarks",
    "Bape", "GREATS", "Diamond Supply", "KARHU", "Revenge X Storm",
    "Information Technology", "Ice Cream", "Footwear", "Yeezy", "Kryptonite",
    "Dr Martens", "Q4 Sports", "Off-White", "Kanye West",
    "Anti Social Social Club", "GOLDEN GOOSE", "Fear Of God"
]

In [None]:
# Row counts before and after keeping only the allow-listed brand names.
print(len(df))
qdf = df.loc[df["brandname"].isin(brand_query)]
print(len(qdf))

#gdf["brand"] = gdf["brand"].map({"Footwear": "Nike"}).fillna(gdf["brand"]) #todo
qdf["brand"].unique()
qdf

In [None]:
# Drop non-sneaker products: rows whose model name matches any keyword pattern.
# na=False makes a missing model name count as "no match" (the row is kept);
# without it str.contains yields NaN for such rows and the `~` negation fails.
qdf = qdf[~qdf["modelname"].str.contains(keywords_query, na=False)]
print(len(qdf))
qdf # filtered by keywords

In [None]:
# handle NaN values
# NaN cells of qdf are replaced by empty strings; the row count is shown
# afterwards.
# NOTE(review): regex=True looks unnecessary for a NaN -> '' replacement --
# confirm before simplifying.
qdf = qdf.replace(np.nan, '', regex=True)
len(qdf)

Data transformation:

By categories:

In [None]:
def camel_case_split(s):
    """Split ``s`` at its case boundaries.

    A cut is made before an uppercase character that follows a
    non-uppercase one, and before the last uppercase character of an
    uppercase run that is followed by a non-uppercase one, e.g.
    ``"AirMax" -> ["Air", "Max"]`` and ``"XMLParser" -> ["XML", "Parser"]``.

    Characters that are not uppercase letters (spaces, digits) are never
    removed, so ``"Air Max" -> ["Air ", "Max"]``. An empty string yields ``[]``.
    """
    upper_flags = [ch.isupper() for ch in s]
    cuts = [0]
    for i, (current, following) in enumerate(zip(upper_flags, upper_flags[1:])):
        if current and not following:
            cuts.append(i)
        elif not current and following:
            cuts.append(i + 1)
    cuts.append(len(s))
    # Adjacent identical cut positions would produce empty pieces; skip them.
    return [s[start:stop] for start, stop in zip(cuts, cuts[1:]) if start < stop]


def get_acronym(source):
    """Return the uppercase letters of ``source.title()`` joined together.

    ``"air force" -> "AF"``. Because ``title()`` lowercases everything but
    the first letter of each word, ``"AirMax" -> "A"``.
    """
    return "".join(filter(str.isupper, source.title()))


def determine_base_model(row):
    """Pick the category that best matches the row's model name.

    Reads ``row["modelname"]`` and ``row["categories"]``.

    Each category is scored by the number of its ``camel_case_split`` pieces
    that occur (case-insensitively) in the model name. A category scoring 0
    gets a score of 1 if its non-empty acronym occurs (case-sensitively) in
    the model name. The highest-scoring category is returned; ties go to the
    category that comes first in ``categories``.

    Returns ``None`` when the model name is not a string (e.g. NaN), when
    ``categories`` is missing, empty or not a list/tuple/set, or when no
    category matches.
    """
    model = row["modelname"]
    categories = row["categories"]
    # Missing values reach this function as None/NaN; nothing can match them.
    if not isinstance(model, str):
        return None
    if not isinstance(categories, (list, tuple, set)) or not categories:
        return None

    model_lower = model.lower()
    distances = {}
    for category in categories:
        if not isinstance(category, str):
            continue
        # NOTE(review): pieces keep surrounding whitespace ("Air "), which
        # affects substring matching -- confirm the category format.
        distance = sum(
            word.lower() in model_lower for word in camel_case_split(category)
        )
        if not distance:
            acronym = get_acronym(category)
            # An empty acronym (category without letters, e.g. "350") must not
            # count: "" is a substring of every model name.
            distance = 1 if acronym and acronym in model else 0
        if distance:
            distances[category] = distance

    if not distances:
        return None
    return max(distances, key=distances.get)

In [None]:
# Compute a base model per row with determine_base_model and store it in the
# `basemodelname` column of df (df is modified in place).
# NOTE(review): this runs on `df`, not on the filtered and NaN-cleaned `qdf`
# built in the cells above -- confirm that is intended.
df["basemodelname"] = df.apply(determine_base_model, axis=1)
df

By JSON base model list:

In [None]:
# Base model tags are read from column 0 of the JSON file.
kdf = pd.read_json("../meta/base-model-tags.json")
base_models = kdf[0].tolist()

# Validity check: print every tag that is contained in a tag listed after it.
for i, model in enumerate(base_models):
    if any(model in later for later in base_models[i + 1:]):
        print(model)

In [None]:
# Order the rows by model-name length, longest first, and turn them into a
# list of per-row dictionaries.
s = df.modelname.str.len().sort_values(ascending=False).index
sdf = df.reindex(s)
sequence = list(sdf.T.to_dict().values())

# Assign each row to the first base model tag whose uppercased text occurs in
# the row's uppercased model name. A claimed row is not offered to later tags;
# rows that match no tag stay in `sequence`.
groups = {}
for model in base_models:
    needle = model.upper()
    # Rows without a string model name (e.g. NaN) can never match a tag.
    child_models = [
        item for item in sequence
        if isinstance(item["modelname"], str) and needle in item["modelname"].upper()
    ]
    # Rebuild the remainder in one pass instead of calling list.remove per
    # claimed row, which rescans the list every time.
    claimed = {id(item) for item in child_models}
    sequence = [item for item in sequence if id(item) not in claimed]
    groups[model] = child_models

# Number of rows claimed by each tag.
group_analit = { key: len(val) for key, val in groups.items() }

In [None]:
# Flatten the groups back into one list of rows, writing the owning tag into
# each row's `basemodel` key, and build a DataFrame from it.
sequence = []
for key, items in groups.items():
    sequence += [{**item, "basemodel": key} for item in items]
gdf = pd.DataFrame(sequence)
gdf

Replace with better base model names:

In [None]:
# Load the tag -> display name mapping.
with open("../meta/base-model-map.json") as stream:
    model_map = json.load(stream)

# Tags present in the mapping are renamed; all other values are kept as they are.
gdf["basemodel"] = gdf["basemodel"].map(model_map).fillna(gdf["basemodel"])
gdf["basemodel"].unique()

Generate brands and models data frames:

In [None]:
# Union of the brands found in the data with the brands listed in column 0 of
# ../meta/brands.json.
# NOTE(review): `rdf` is not defined in any cell visible above, so this cell
# fails on a fresh kernel run top to bottom -- confirm which frame was meant
# (possibly `gdf`) or define `rdf` before this cell.
brands_set = set(pd.unique(rdf["brand"].values))
all_brands = set(pd.read_json("../meta/brands.json")[0].T.to_dict().values())
brands_list = list(brands_set.union(all_brands))
brands_list