In [3]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
import csv
from sklearn.neighbors import KDTree, NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

%matplotlib inline

## Get Training Inputs for the Model

In [21]:
def map_category_to_label():
    """Build the lookup tables between clothing category names and integer labels.

    Labels are assigned by position in the fixed category list, starting at 0.

    Returns:
        tuple: ``(name_to_label, label_to_name)`` -- a dict from category name
        to its integer label, and the inverse dict from label to category name.
    """
    category_names = (
        'dresses', 'handbags', 'hats', 'jackets', 'jeans',
        'jewelry', 'shorts', 'skirts', 'sunglasses', 'sweaters',
        'sweatshirts', 'womens-outerwear', 'womens-pants', 'womens-shoes', 'womens-tops',
    )
    name_to_label = {}
    label_to_name = {}
    # One pass fills both directions so the two tables cannot drift apart.
    for label, name in enumerate(category_names):
        name_to_label[name] = label
        label_to_name[label] = name
    return name_to_label, label_to_name

def make_data_inputs(product_file, category_mapping, prod_feat_mapping):
    """Assemble the feature matrix and label vector from a tab-separated product file.

    For each line, column 0 (stripped) is the product id looked up in
    ``prod_feat_mapping`` and column 2 (stripped) is the category text looked
    up in ``category_mapping``. A line that cannot be processed (too few
    columns, unknown product id or category, no feature row) is reported with
    ``print`` and skipped as a whole.

    Args:
        product_file: Path of the tab-separated product file to read.
        category_mapping: Mapping from category text to label.
        prod_feat_mapping: Mapping from product id to an indexable container
            whose element ``[0]`` is that product's feature row.

    Returns:
        tuple: ``(X, y)`` as numpy arrays. ``X`` holds one feature row per
        accepted line and ``y`` the matching labels, in file order.
    """
    X_product_features = []
    y_label = []
    print('Reading file ', product_file)
    with open(product_file, 'r') as f:
        # Iterate the file lazily instead of materialising and copying every line.
        for line in f:
            try:
                columns = line.split('\t')
                product_id = columns[0].strip()
                prod_feats = prod_feat_mapping[product_id]
                product_label = category_mapping[columns[2].strip()]
                feature_row = prod_feats[0]
            except Exception as e:
                # Best-effort load: report the problem and drop the whole line.
                print(e)
                continue
            # Append only after every lookup succeeded, so a failure can never
            # leave a label without its feature row (X and y stay row-aligned).
            X_product_features.append(feature_row)
            y_label.append(product_label)
    return np.array(X_product_features), np.array(y_label)

def get_product_to_features(product_feats_file='data-outfits/outfit_product_features.tsv'):
    """Load per-product feature vectors from a tab-separated file.

    Every non-empty row holds a product id in column 0 followed by the
    feature values, each parsed with ``float``.

    Args:
        product_feats_file: Path of the feature file. Defaults to the location
            the notebook has always used, so calls without arguments keep
            working.

    Returns:
        dict: Maps product id (str) to a single-row 2-D numpy array containing
        that row's feature values.

    Raises:
        ValueError: If a feature value cannot be parsed as a float.
    """
    product_to_feats = {}
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation recommends for file objects passed to csv.reader.
    with open(product_feats_file, 'r', newline='') as tsvfile:
        for row in csv.reader(tsvfile, delimiter='\t'):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing empty
                # line); there is no product id to index, so skip them.
                continue
            product_id, raw_values = row[0], row[1:]
            # The extra outer list makes each entry a single-row 2-D array.
            product_to_feats[product_id] = np.array([[float(n) for n in raw_values]])
    return product_to_feats

In [18]:
# Tab-separated product file that make_data_inputs reads later on.
product_file_path = 'data-outfits/outfit_products.tsv'
# Two lookup tables: category name -> integer label, and the inverse.
category_to_label, label_to_category = map_category_to_label()
# Product id -> feature array; the loader supplies the feature-file path itself.
prod_to_feats = get_product_to_features()

In [24]:
# Build the full feature matrix / label vector from the product file.
X, y = make_data_inputs(
    product_file=product_file_path,
    category_mapping=category_to_label,
    prod_feat_mapping=prod_to_feats,
)
# Stratify on the labels so each category keeps its share in both partitions;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=55,
)

Reading file  data-outfits/outfit_products.tsv


In [25]:
# Seed the forest (same seed as the train/test split) so the fitted model, its
# feature importances and the reported accuracy are reproducible on
# Restart & Run All instead of changing with every execution.
rf_model = RandomForestClassifier(n_estimators=1000, random_state=55)
rf_model.fit(X_train, y_train)

array([ 0.00027503,  0.00031707,  0.00074915, ...,  0.00050357,
        0.00053441,  0.00194463])

In [35]:
# Rank the features by importance, most important first. Each entry is
# (rank, (feature_index, importance)). Both index sequences are derived from
# the importances array itself rather than a hard-coded feature count, so the
# listing stays complete and correctly numbered whatever the feature width is.
list(
    enumerate(
        sorted(
            enumerate(rf_model.feature_importances_),
            key=lambda pair: pair[1],
            reverse=True,
        )
    )
)

[(0, (570, 0.0083312492148816825)),
 (1, (1010, 0.0079681707178899611)),
 (2, (317, 0.007867410589765908)),
 (3, (264, 0.0074436828044580306)),
 (4, (460, 0.0072360297275452104)),
 (5, (833, 0.0066879641655690041)),
 (6, (715, 0.0062825823780031246)),
 (7, (215, 0.0062473645937040989)),
 (8, (624, 0.0059789487679335141)),
 (9, (920, 0.0059179389323181335)),
 (10, (660, 0.0058698260317528988)),
 (11, (318, 0.0058687648862621108)),
 (12, (613, 0.0056648657532153068)),
 (13, (400, 0.005509892312851563)),
 (14, (62, 0.0054260196268603343)),
 (15, (336, 0.0053719740935621076)),
 (16, (537, 0.0052926963977602123)),
 (17, (767, 0.0052735246218314083)),
 (18, (606, 0.0052627672721452922)),
 (19, (170, 0.0049361615837032313)),
 (20, (787, 0.0047984962785503496)),
 (21, (772, 0.0046268482798611206)),
 (22, (820, 0.004539879497986777)),
 (23, (224, 0.0045151126490897116)),
 (24, (986, 0.0044060216923580914)),
 (25, (573, 0.0043882733694426829)),
 (26, (290, 0.0043144731433228511)),
 (27, (128, 0.

In [34]:
# Test-set accuracy: the fraction of held-out samples whose predicted label
# equals the true label.
(rf_model.predict(X_test) == y_test).mean()

0.91305329719963868