In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import load_model

# PROJECT GOALS:
''' Create a neural network that runs a regression analysis on the prices of different clothing items from different brands.
    Given an item and its brand, the network will recommend a price based on other users' listings.
    Add-ons:
    - weigh 'sold' items, 'reserved' items, and 'unsold' items differently'''

# read the raw Vestiaire listings from Google Drive
data_path = '/content/drive/MyDrive/Vestiaire Model/vestiaire.csv'
vestiaire_df = pd.read_csv(data_path, low_memory=False)

# keep only the columns that the cleaning and modelling steps use
target_columns = [
    'product_type',
    'product_description',
    'product_material',
    'product_gender_target',
    'brand_name',
    'price_usd',
    'brand_id',
    'product_condition',
]
target_df = vestiaire_df[target_columns]

In [None]:
# Coarse clothing category -> keywords looked for inside a listing's product type.
# map_clothing returns the FIRST category with a matching keyword, so the key
# order below matters (e.g. 'swimwear' and 'jumpsuit' come before 'suit').
clothing_dict = {
    'shirt': [
        'shirt', 'tshirt', 't-shirt', 'tank top', 'top', 'tee', 'blouse',
        'polo', 'vest', 'tunic', 'twin-set', 'camisole',
    ],
    'sweater': ['sweater', 'jumper', 'cardigan', 'knitwear', 'pull'],
    'skirt': ['skirt', 'pareo'],
    'shorts': ['shorts', 'short', 'bermuda'],
    'pants': [
        'pants', 'jeans', 'jean', 'trousers', 'slacks', 'leggings', 'harem',
    ],
    'jacket': [
        'jacket', 'coat', 'blazer', 'windbreaker', 'parka', 'puffer', 'cape',
        'poncho', 'caban', 'trench',
    ],
    'dress': ['dress', 'gown', 'sundress'],
    'jumpsuit': ['jumpsuit', 'overall'],
    'undergarment': [
        'undergarment', 'bra', 'lingerie', 'slip', 'bustier', 'corset',
        'tight', 'string',
    ],
    'swimwear': ['swimwear', 'swimsuit'],
    'suit': ['suit'],
    # NOTE(review): ' leather' carries a leading space - confirm this is intentional
    'shoes': [
        'sneakers', 'boots', 'heels', 'heel', 'sandals', 'sandal', 'trainers',
        'flats', 'flat', 'mules', 'espadrilles', 'flip flops', 'lace ups',
        'escarpins a paillettes', ' leather',
    ],
    'accessories': [
        'accessories', 'accessory', 'jewelry', 'jewellery', 'hat', 'gloves',
        'watch', 'belt', 'hankerchief', 'cuff links', 'tie', 'sunglasses',
        'cufflinks', 'scarf', 'pocket square', 'wallet', 'ring', 'purse',
        'clutch', 'neckerchief', 'beanie', 'stole', 'iphone case', 'ipad',
        'glasses', 'handkerchief', 'mittens', 'cheche', 'cap', 'beret',
        'choker', 'panama', 'diary', 'lifestyle', 'page/nom_singulier/31',
    ],
}

def map_clothing(typ, dic):
  '''Map a product-type string onto a coarse clothing category.

  Walks `dic` in insertion order and returns the first key for which at least
  one of its keywords occurs as a substring of `typ`. When no keyword matches,
  `typ` is handed back unchanged.
  '''
  matching_categories = (
      category
      for category, keywords in dic.items()
      if any(keyword in typ for keyword in keywords)
  )
  # first hit wins; fall back to the raw product type
  return next(matching_categories, typ)

def standardize_data(df):
  '''Clean the raw listings and label-encode the categorical features.

  Steps, in order:
    1. copy the input so the caller's dataframe is left untouched
    2. lowercase every object (string) column
    3. derive `clothing_cat` from `product_type` via map_clothing/clothing_dict
    4. fill missing gender with 'unisex' and missing material with 'not specified'
    5. drop rows that have no `price_usd`
    6. fit one LabelEncoder per categorical column and add the `*_encoded` columns

  Args:
    df: dataframe containing at least `product_type`, `product_material`,
      `product_gender_target`, `brand_name`, `price_usd` and `product_condition`.

  Returns:
    A tuple `(df, cat_encoder, cond_encoder, gen_encoder, brand_encoder,
    material_encoder)`: the cleaned dataframe followed by the fitted encoders,
    returned so the same label -> integer mapping can be reused at prediction time.
  '''
  # work on copy of the original dataframe
  df = df.copy()

  # make every str column lowercase
  for col in df.columns:
    if df[col].dtype == 'object':
      df[col] = df[col].str.lower()

  # apply clothing dictionary
  df['clothing_cat'] = df['product_type'].apply(map_clothing, args=(clothing_dict,))

  # fix any nan in gender/material category.
  # The fill values must be lowercase like every other label at this point,
  # otherwise 'Unisex' becomes a separate class that lowercased lookups never hit.
  # Plain assignment is used instead of `df[col].fillna(..., inplace=True)`,
  # which pandas deprecates and which has no effect under copy-on-write.
  # from data there arent many/any nan, safe to possibly mislabel some as unisex
  # instead of trying to find gender from description
  df['product_gender_target'] = df['product_gender_target'].fillna('unisex')
  df['product_material'] = df['product_material'].fillna('not specified')

  # drop any rows with no price
  df = df.dropna(subset=['price_usd'])

  # encode categorical data
  cat_encoder = LabelEncoder()
  cond_encoder = LabelEncoder()
  gen_encoder = LabelEncoder()
  brand_encoder = LabelEncoder()
  material_encoder = LabelEncoder()

  # (encoder, source column, encoded column) - encoders are kept so they can be used later
  encodings = [
      (cat_encoder, 'clothing_cat', 'clothing_cat_encoded'),
      (cond_encoder, 'product_condition', 'condition_encoded'),
      (gen_encoder, 'product_gender_target', 'gender_encoded'),
      (brand_encoder, 'brand_name', 'brand_encoded'),
      (material_encoder, 'product_material', 'material_encoded'),
  ]
  for encoder, source_col, encoded_col in encodings:
    encoder.fit(df[source_col].unique())
    df[encoded_col] = encoder.transform(df[source_col])

  return df, cat_encoder, cond_encoder, gen_encoder, brand_encoder, material_encoder

# run the cleaning pipeline and keep the fitted encoders for prediction time
(standardized_df,
 cat_encoder,
 cond_encoder,
 gen_encoder,
 brand_encoder,
 material_encoder) = standardize_data(target_df)

# features are the five encoded categorical columns; the target is the listing price
feature_columns = [
    'clothing_cat_encoded',
    'brand_encoded',
    'condition_encoded',
    'gender_encoded',
    'material_encoded',
]
X = standardized_df[feature_columns]
y = standardized_df['price_usd']

# hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# making the model
num_features = X_train.shape[1]

# fully connected regression network ending in a single linear unit (the predicted price)
model = tf.keras.Sequential([
    tf.keras.layers.Dense(num_features, activation='relu'),
    tf.keras.layers.Dense(264, activation='relu'),
    tf.keras.layers.Dense(132, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5), # add dropout layers to avoid overfitting
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='linear')
])

# scale features to help with learning.
# The scaler is fit on the training split only and then reused, unchanged, on the
# test split (and on anything else fed to the model later).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# learning rate scheduler to improve performance:
# multiply the learning rate by 0.2 after 5 epochs without val_loss improvement
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# mean squared error is the training loss; mean absolute error is only reported
# as a metric, which is easier to read and less dominated by outliers
model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics=['mean_absolute_error'])

# train the model
model.fit(X_train_scaled, y_train, epochs=25, batch_size=16, verbose=1, validation_split=0.2, callbacks=[reduce_lr])

# test the model - it was trained on scaled features, so it must be evaluated on
# the scaled test set; passing the raw X_test gives meaningless loss values
model.evaluate(X_test_scaled, y_test, verbose=1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


[11500358656.0, 90690.34375]

In [None]:
# persist the trained network to Google Drive
# NOTE(review): Keras 3 only accepts paths ending in .keras or .h5 for model.save -
# confirm this extension-less path works with the installed version
model_path = '/content/drive/MyDrive/Vestiaire Model/reluModel'
model.save(model_path)

# to restore it later:
#price_predictor = load_model(model_path)

In [None]:
# use the model! still working on it

# 'clothing_cat_encoded', 'brand_encoded', 'condition_encoded', 'gender_encoded', 'material_encoded'
def predict_price(category: str, brand: str, condition: str, gender: str, material: str, model):
  ''' Recommend a price for one item using the trained model.

      Relies on the notebook-level encoders (cat_encoder, brand_encoder,
      cond_encoder, gen_encoder, material_encoder) and on `scaler`, all of which
      are fitted on the training data. Inputs are lowercased before encoding,
      matching how the training labels were cleaned.

      condition: 'Never worn', 'Very good condition', 'Never worn, with tag',
       'Good condition', 'Fair condition'
      gender: 'Men', 'Women', 'Unisex'
      brand : .........
      category: 'skirt', 'jacket', 'pants', 'dress', 'shirt', 'swimwear', 'shorts',
       'undergarment', 'sweater', 'jumpsuit', 'accessories', 'suit',
       'shoes'
      material: a material label seen during training (e.g. 'leather')
      model: object with a `predict` method, e.g. the trained Keras model

      Returns whatever `model.predict` returns for the single input row
      (for the Keras model above, a (1, 1) array holding the predicted price).

      Raises ValueError (from LabelEncoder.transform) when any input is a label
      the corresponding encoder has not seen.
    '''

  #  1) encode all inputs for model
  cat_encoded = cat_encoder.transform([category.lower()])[0]
  brand_encoded = brand_encoder.transform([brand.lower()])[0]
  cond_encoded = cond_encoder.transform([condition.lower()])[0]
  gen_encoded = gen_encoder.transform([gender.lower()])[0]
  mat_encoded = material_encoder.transform([material.lower()])[0]

  # 2) package encoded inputs together, in the same column order used for training
  features = pd.DataFrame(
      [[cat_encoded, brand_encoded, cond_encoded, gen_encoded, mat_encoded]],
      columns=['clothing_cat_encoded', 'brand_encoded', 'condition_encoded', 'gender_encoded', 'material_encoded'],
  )

  # 3) apply the training-time scaling - the model only ever saw scaled features,
  #    so raw label codes would produce meaningless prices
  input_array = scaler.transform(features)

  # 4) predict the price!
  price = model.predict(input_array)

  return price

# example: recommended price for a pair of women's Gucci leather shoes in good condition
predict_price(
    category='shoes',
    brand='gucci',
    condition='good condition',
    gender='women',
    material='leather',
    model=model,
)



array([[74066.32]], dtype=float32)