In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from scipy.spatial.distance import cosine
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings('ignore')

## Load Dataset
### Product

In [2]:
product = pd.read_excel('Behold+product+data+04262021.xlsx')
product.head(3)

Unnamed: 0,product_id,brand,brand_category,name,details,created_at,brand_canonical_url,description,brand_description,brand_name,product_active
0,01EX0PN4J9WRNZH5F93YEX6QAF,Two,Unknown,Khadi Stripe Shirt-our signature shirt,,2021-01-27 01:17:19.305 UTC,https://two-nyc.myshopify.com/products/white-k...,Our signature khadi shirt\navailable in black ...,Our signature khadi shirt\n\navailable in blac...,Khadi Stripe Shirt-our signature shirt,True
1,01F0C4SKZV6YXS3265JMC39NXW,Collina Strada,Unknown,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,,2021-03-09 18:43:10.457 UTC,https://collina-strada-2.myshopify.com/product...,Mid-length dress with ruffles and adjustable s...,Mid-length dress with ruffles and adjustable s...,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,True
2,01EY4Y1BW8VZW51BWG5VZY82XW,Cariuma,Unknown,IBI Slip On Raw Red Knit Sneaker Women,,2021-02-10 02:58:59.591 UTC,https://cariuma.myshopify.com/products/ibi-sli...,IBI Slip On Raw Red Knit Sneaker Women,IBI Slip On Raw Red Knit Sneaker Women,IBI Slip On Raw Red Knit Sneaker Women,False


### Brand

In [3]:
brand = pd.read_csv('behold_brands USC.csv')
brand.head(3)

Unnamed: 0,brand_id,brand,brand_value,bio,quote,quote_attribute,intro,lifestyle_copy,short_bio,listing_bio
0,01ESKR0CH2KYC7KBNTN0S38EQA,Mari Giudicelli,Handmade / Artisan Crafted,,,,,,,Behold Mari Giudicelli! This Brazilian shoe de...
1,01ESKR0CH2KYC7KBNTN0S38EQA,Mari Giudicelli,Sustainable,,,,,,,Behold Mari Giudicelli! This Brazilian shoe de...
2,01ESKR0CH2KYC7KBNTN0S38EQA,Mari Giudicelli,Women Owned,,,,,,,Behold Mari Giudicelli! This Brazilian shoe de...


### Outfit

In [4]:
outfit = pd.read_csv('outfit_combinations USC.csv')
outfit.head(5)

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory1,kate spade new york,medium margaux leather satchel
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
4,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt


### Tags

In [5]:
tags = pd.read_csv('usc_additional_tags USC.csv')
tags.head(3)

Unnamed: 0,product_id,product_color_id,attribute_name,attribute_value
0,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,materialclothing,linenblend
1,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,materialclothing,cottonblend
2,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,style,modern


### Since part 1 mainly focus on Product and Brand, I combine these two files into one named df. 

In [6]:
df = pd.merge(product, brand, how = 'left', on = ['brand'])
df.head()

Unnamed: 0,product_id,brand,brand_category,name,details,created_at,brand_canonical_url,description,brand_description,brand_name,product_active,brand_id,brand_value,bio,quote,quote_attribute,intro,lifestyle_copy,short_bio,listing_bio
0,01EX0PN4J9WRNZH5F93YEX6QAF,Two,Unknown,Khadi Stripe Shirt-our signature shirt,,2021-01-27 01:17:19.305 UTC,https://two-nyc.myshopify.com/products/white-k...,Our signature khadi shirt\navailable in black ...,Our signature khadi shirt\n\navailable in blac...,Khadi Stripe Shirt-our signature shirt,True,01ETTD5T0F7YTBQ59D9F8M1Y3K,Sustainable,"Two is a modern & minimalist take on caftans, ...",Two emphasizes the importance of sustainable c...,"Monica Patel Cohn, The Designer and Founder","Chic & modern pieces in gorgeous, handwoven fa...","When you think of Two, you feel a certain ener...",The Art of the Sari,Behold Two! Founder Monica Patel-Cohn launched...
1,01EX0PN4J9WRNZH5F93YEX6QAF,Two,Unknown,Khadi Stripe Shirt-our signature shirt,,2021-01-27 01:17:19.305 UTC,https://two-nyc.myshopify.com/products/white-k...,Our signature khadi shirt\navailable in black ...,Our signature khadi shirt\n\navailable in blac...,Khadi Stripe Shirt-our signature shirt,True,01ETTD5T0F7YTBQ59D9F8M1Y3K,Handmade / Artisan Crafted,"Two is a modern & minimalist take on caftans, ...",Two emphasizes the importance of sustainable c...,"Monica Patel Cohn, The Designer and Founder","Chic & modern pieces in gorgeous, handwoven fa...","When you think of Two, you feel a certain ener...",The Art of the Sari,Behold Two! Founder Monica Patel-Cohn launched...
2,01EX0PN4J9WRNZH5F93YEX6QAF,Two,Unknown,Khadi Stripe Shirt-our signature shirt,,2021-01-27 01:17:19.305 UTC,https://two-nyc.myshopify.com/products/white-k...,Our signature khadi shirt\navailable in black ...,Our signature khadi shirt\n\navailable in blac...,Khadi Stripe Shirt-our signature shirt,True,01ETTD5T0F7YTBQ59D9F8M1Y3K,Women Owned,"Two is a modern & minimalist take on caftans, ...",Two emphasizes the importance of sustainable c...,"Monica Patel Cohn, The Designer and Founder","Chic & modern pieces in gorgeous, handwoven fa...","When you think of Two, you feel a certain ener...",The Art of the Sari,Behold Two! Founder Monica Patel-Cohn launched...
3,01F0C4SKZV6YXS3265JMC39NXW,Collina Strada,Unknown,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,,2021-03-09 18:43:10.457 UTC,https://collina-strada-2.myshopify.com/product...,Mid-length dress with ruffles and adjustable s...,Mid-length dress with ruffles and adjustable s...,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,True,01EFJFYSHRJSC2YVEC8JZ8NSCZ,Women Owned,As well as a ready-to-wear collection; Collina...,"Season after season, our goal remains the same...",Hillary Taymour,Collina Strada is a platform for social issues...,Collina Strada embodies humor and youth. The b...,"Easy to wear clothes, imbued with a fearlessly...",Behold Hillary Taymor! A master of tie dye and...
4,01F0C4SKZV6YXS3265JMC39NXW,Collina Strada,Unknown,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,,2021-03-09 18:43:10.457 UTC,https://collina-strada-2.myshopify.com/product...,Mid-length dress with ruffles and adjustable s...,Mid-length dress with ruffles and adjustable s...,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,True,01EFJFYSHRJSC2YVEC8JZ8NSCZ,Emerging,As well as a ready-to-wear collection; Collina...,"Season after season, our goal remains the same...",Hillary Taymour,Collina Strada is a platform for social issues...,Collina Strada embodies humor and youth. The b...,"Easy to wear clothes, imbued with a fearlessly...",Behold Hillary Taymor! A master of tie dye and...


In [7]:
df.columns

Index(['product_id', 'brand', 'brand_category', 'name', 'details',
       'created_at', 'brand_canonical_url', 'description', 'brand_description',
       'brand_name', 'product_active', 'brand_id', 'brand_value', 'bio',
       'quote', 'quote_attribute', 'intro', 'lifestyle_copy', 'short_bio',
       'listing_bio'],
      dtype='object')

## Preprocessing

In [8]:
nlp = spacy.load('en_core_web_lg',disable=['ner','parser'])


In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

In [None]:
def clean_text(text):
    
    text = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '', text) # clean url
    text = re.sub(r'#(\w+)', '', text)   # clean hashes
    text = re.sub(r'@(\w+)', '', text)   # clean @
    text = re.sub(r'<[^>]+>', '', text)  # clean tags
    text = re.sub(r'\d+', '', text)      # clean digits
    text = re.sub(r'[,!@\'\"?\.$%_&#*+-:;]', '', text)   # clean punctuation 
    return text

nltk_stopwords = set(stopwords.words('English'))
def clean_stopwords(text, stopwords = nltk_stopwords):
    words = nltk.word_tokenize(text)
    newWords = [word for word in words if word.lower() not in stopwords]
    text = " ".join(newWords)
    return text

In [None]:
# lemmatization
# reference: https://gist.github.com/gaurav5430/9fce93759eb2f6b1697883c3782f30de#file-nltk-lemmatize-sentences-py
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return ' '.join(lemmatized_sentence)


In [None]:
for i in list(df.columns):
    df[i]=df[i].astype(str)

In [None]:
col_list = ['product_id', 'brand_category', 'name', 'details',
       'description']
for i in col_list:
    df[i] = df[i].apply(clean_text)
    df[i] = df[i].apply(clean_stopwords)
    df[i] = df[i].apply(lambda x: lemmatize_sentence(x))


### Choose top50 brands

In [None]:
top50_brand = df.groupby('brand')['brand'].count().sort_values(ascending = False).head(50)
top50_brand.head(5)

In [None]:
top50_list = top50_brand.index.values
top50_list

In [None]:
df.drop_duplicates(subset='product_id',keep='first',inplace=True)

In [None]:
df.shape

In [None]:
for i in range(len(df)):
    if df.iloc[i, 1] not in top50_list:
        df.iloc[i, 1] = 'Others'
    

In [None]:
df['brand'].unique()

In [None]:
# top 50 brands + Others
df.groupby('brand')['brand'].count().sort_values(ascending = False).head(51)

In [None]:
df['features'] = df['description'] + " " + df['details'] + " " + df['name'] 
df['features'].head(5)

### Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 1), 
                             stop_words="english", 
                             max_features=1000,token_pattern='(?u)\\b[a-zA-Z][a-zA-Z]+\\b')
X = vectorizer.fit_transform(df["features"])



In [None]:
y = df['brand'].astype(str)
encoder = LabelEncoder()
y = to_categorical(encoder.fit_transform(y))

def get_new_labels(y):
    y = LabelEncoder().fit_transform([''.join(str(l)) for l in y])
    return y
y = get_new_labels(y)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X, y)

In [None]:
y_pred = lr.predict(X)

# calculate accuracy
np.mean(y_pred == y)


In [None]:
from sklearn.model_selection import train_test_split

X_train,X_test, y_train,y_test = train_test_split(X, y, test_size=0.15, random_state = 2)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
lr.fit(X_train, y_train)

In [None]:
y_pred = lr.predict(X_test)

# accuracy 
np.mean(y_pred == y_test)

#### cross-validation

In [None]:
from sklearn.model_selection import cross_validate
cv_results = cross_validate(lr, X, y, cv=10,return_train_score=False)
cv_results['test_score']

### Deep leanring with Word Embeding 

In [None]:
features = []
for i in range(len(df)):
    features.append(df['features'].iloc[i])

In [None]:
target_brand = []
for i in range(len(df)):
    target_brand.append(df['brand'].iloc[i])
    

encoder = LabelEncoder()
brand = to_categorical(encoder.fit_transform(target_brand))

def get_new_labels(brand):
    brand = LabelEncoder().fit_transform([''.join(str(l)) for l in brand])
    return brand
brand = get_new_labels(brand)

In [None]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(features)
tokenizer.word_index

In [None]:
from typing import List
def integer_encode_documents(features: List[str], tokenizer: Tokenizer)-> List[List[int]]:
    feature_list = []
    for f in features:
        f_integers = []
        for i in text_to_word_sequence(f):
            f_integers.append(tokenizer.word_index[i])
        feature_list.append(f_integers)
    return feature_list

In [None]:
def integer_encode_documents(features, tokenizer):
    return tokenizer.texts_to_sequences(features)

In [None]:
# integer encode the documents
encoded_features = integer_encode_documents(features, tokenizer)
from pprint import pprint
pprint(encoded_features)

In [None]:
from typing import List
def get_max_token_length_per_doc(features: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), features)))

In [None]:
# get the max length in terms of token length
max_length = get_max_token_length_per_doc(features)
max_length

In [None]:
from keras.preprocessing.sequence import pad_sequences
padded_features = pad_sequences(encoded_features, maxlen=max_length, padding='post')
print("Padded features:", padded_features)


In [None]:
padded_features.shape

In [None]:
EMBEDDING_SIZE = 50
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

In [None]:
vocab_size = 50000
# define the model
# remember, vocab_size = 5000
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length))
model.add(Flatten()) 
model.add(Dense(1, activation='sigmoid')) 


In [None]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
model.summary()

Accuracy: 0.797001

In [None]:
model.fit(padded_features, brand, epochs=5, verbose=1)
# evaluate the model`
loss, accuracy = model.evaluate(padded_features, brand, verbose=1)
#print('Accuracy: %f' % (accuracy*100))


### RNN & LSMT

In [None]:
import en_core_web_sm
import spacy
from scipy.spatial.distance import cosine
nlp = en_core_web_sm.load()

### Tokenize Text

In [None]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(features)

In [None]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from typing import List

In [None]:
features = []
for i in range(len(df)):
    features.append(df['features'].iloc[i])

In [None]:
from keras.preprocessing.sequence import pad_sequences

def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(features)

# integer encode the documents
encoded_features = integer_encode_documents(features, tokenizer)
padded_docs = pad_sequences(encoded_features, maxlen=max_length, padding='post')


In [None]:
brand = []
for i in range(len(df)):
    brand.append(df['brand'].iloc[i])


In [None]:
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
brand= to_categorical(encoder.fit_transform(brand))

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, brand, test_size=0.2)

###  Import Keras Toolkit

In [None]:
from random import randint
from numpy import array, argmax, asarray, zeros
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding

In [None]:
VOCAB_SIZE = int(len(tokenizer.word_index) * 1.1)

### Load in GloVe Vectors

In [None]:
def load_glove_vectors():
    embeddings_index = {}
    with open('glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index


embeddings_index = load_glove_vectors()

### Load in Embeddings

In [None]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((VOCAB_SIZE, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # check that it is an actual word that we have embeddings for
        embedding_matrix[i] = embedding_vector

### Define in Model

In [None]:
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.layers import Flatten, Masking
# define model

def make_binary_classification_rnn_model(plot=False):
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=max_length, trainable=False))
    model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
    model.add(SimpleRNN(units=64, input_shape=(1, max_length)))
    model.add(Dense(64))
    model.add(Dense(51, activation='softmax'))
    
    # Compile the model
    model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summarize the model
    model.summary()
    
    if plot:
        plot_model(model, to_file='model.png', show_shapes=True)
    return model



In [None]:
def make_lstm_classification_model(plot=False):
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=max_length, trainable=False))
    model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
    model.add(LSTM(units=32, input_shape=(1, max_length)))
    model.add(Dense(64))
    model.add(Dense(51, activation='softmax'))
    
    # Compile the model
    model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summarize the model
    model.summary()
    
    if plot:
        plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [None]:
model = make_lstm_classification_model()

### Fit the Model

In [None]:
# fit the model
model= model.fit(X_train, y_train,validation_split = 0.1, epochs=5, verbose=1)

In [None]:
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

In [None]:
model = make_binary_classification_rnn_model()
model= model.fit(X_train, y_train,validation_split = 0.1, epochs=5, verbose=1)

In [None]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))