In [19]:
import pandas as pd
import numpy as np
from tensorflow import keras
from collections import defaultdict
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dropout, Dense
from tensorflow.keras.optimizers import Adam
import keras.backend as K

In [2]:
# Reading the data
dataset = pd.read_json('cse-258-project/renttherunway_final_data.json.gz', lines=True)
dataset = dataset.dropna()
dataset = dataset.drop(dataset[dataset['rented for']== "party: cocktail"].index)

# 0 = Small, 1 = Fit, 2 = Large
dataset.loc[dataset["fit"] == "small", "fit"] = 0

dataset.loc[dataset["fit"] == "fit", "fit"] = 1

dataset.loc[dataset["fit"] == "large", "fit"] = 2

In [3]:
# Converting the data into list of dictionaries
data = pd.DataFrame(dataset).to_dict('record')

for d in data:
    d['weight'] = int(d['weight'].split('lbs')[0])
    d['height'] = int(d['height'].split(' ')[0][:-1])*12 + int(d['height'].split(' ')[1][:-1])
#     if(int(d['height'].split(' ')[1].split('"')[0])<10):
#         height2 = int(d['height'].split(' ')[1].split('"')[0])*10
#     else:
#         height2 = int(d['height'].split(' ')[1].split('"')[0])
#     d['height'] = int(d['height'][0][0])*100+height2
    
####converting categroies type to one hot
catogeries = ['rented for','body type']
for cat in catogeries:
    categories_list = defaultdict(int)
    for d in data:
        categories_list[d[cat]] += 1
        
    categories_id = defaultdict(int)

    i = 0
    for cID in  categories_list:
        categories_id[cID] = i
        i+=1
    for d in data:
        f = [0]*len(categories_list)
        f[categories_id[d[cat]]] = 1
        d[cat] = f[:len(categories_list)-1]

  data = pd.DataFrame(dataset).to_dict('record')


In [4]:
embeddings_dict = {}
with open("glove.6B.50d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [5]:
# import io

# def load_vectors(fname):
#     fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
#     n, d = map(int, fin.readline().split())
#     data = {}
#     for line in fin:
#         tokens = line.rstrip().split(' ')
#         data[tokens[0]] = map(float, tokens[1:])
#     return data

# embeddings_dict = load_vectors("wiki-news-300d-1M.vec")

In [6]:
def get_word_embeddings(word):
    try:
        return embeddings_dict[word]
    except KeyError:
        return np.zeros(50)

In [7]:
type(embeddings_dict['the'])

numpy.ndarray

In [8]:
# Add features
df = []
for d in data:
    arr = []
#     arr.append(d['weight'])
    arr.append(d['rating'])
    arr += d['rented for']
    arr += list(sum([get_word_embeddings(word) for word in d['review_text'].split()])/len([get_word_embeddings(word) for word in d['review_text'].split()]))
    try:
        arr += list(sum([get_word_embeddings(word) for word in d['review_summary'].split()])/(len([get_word_embeddings(word) for word in d['review_summary'].split()])+1))
    except:
        arr += [0]*50
    
#     arr.append(len(d['review_text']))
    arr += d['body type']
#     arr.append(d['height'])
    arr.append(d['size'])
#     arr.append(d['age'])
    df.append(arr)

In [9]:
y = dataset["fit"]
y_cat = to_categorical(y)
split_ratio = .85
# Split data
test_y = y_cat[int(len(y_cat) * split_ratio):]
train_y = y_cat[:int(len(y_cat) * split_ratio)]
test_f = df[int(len(y_cat) * split_ratio):]
train_f = df[:int(len(y_cat) * split_ratio)]

# Convert to numpy array.
test_f = np.array(test_f)
train_f = np.array(train_f)

In [10]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [11]:
model = Sequential()
model.add(Dense(60, input_shape = (len(train_f[0]), ), activation = "relu"))
model.add(Dense(40, activation = "relu"))
model.add(Dropout(0.1))
model.add(Dense(3, activation = "softmax"))
model.compile(Adam(lr = 0.001), "categorical_crossentropy", metrics = ['acc',f1_m,precision_m, recall_m])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 60)                6960      
_________________________________________________________________
dense_1 (Dense)              (None, 40)                2440      
_________________________________________________________________
dropout (Dropout)            (None, 40)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 123       
Total params: 9,523
Trainable params: 9,523
Non-trainable params: 0
_________________________________________________________________


2021-11-29 08:01:01.510906: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
model.fit(train_f, train_y, verbose=1, epochs=10)

2021-11-29 08:01:01.857587: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x19ee913a0>

In [13]:
# Confusion matrix
# y_pred_class = model.predict_classes(test_f)
predict_x=model.predict(test_f) 
y_pred_class=np.argmax(predict_x,axis=1)

In [14]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(test_f)
y_test_class = np.argmax(test_y, axis=1)
confusion_matrix(y_test_class, y_pred_class)

array([[  367,  2517,    90],
       [  204, 15829,   128],
       [  142,  2413,   267]])

In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_class, y_pred_class).round(4)

0.7498

In [27]:
from sklearn.metrics import f1_score
f1_score(y_test_class, y_pred_class, average='weighted').round(4)

0.6788

In [28]:
from sklearn.metrics import recall_score
recall_score(y_test_class, y_pred_class, average='weighted').round(4)

0.7498

In [29]:
from sklearn.metrics import precision_score
precision_score(y_test_class, y_pred_class, average='weighted').round(4)

0.7017