In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
import csv
from sklearn.neighbors import KDTree, NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

## Get Training Inputs Model

In [22]:
def map_category_to_label():
    clothes_categories = ['dresses', 'handbags', 'hats', 'jackets', 'jeans', 
                          'jewelry', 'shorts', 'skirts', 'sunglasses', 'sweaters', 
                          'sweatshirts', 'womens-outerwear', 'womens-pants', 
                          'womens-shoes', 'womens-tops']
    labels = range(0, len(clothes_categories))
    return dict(zip(clothes_categories, labels)), dict(zip(labels, clothes_categories))

def make_data_inputs(product_file, category_mapping, prod_feat_mapping, dropped_indexes):
    X_product_features = []
    y_label = []
    print('Reading file ', product_file)
    with open(product_file, 'r') as f:
        for i, line in enumerate(f.readlines()[:]):
            try:
                l = line.split('\t')
                product_id = l[0].strip()
                prod_feats = prod_feat_mapping[product_id][0]
                prod_feats_reduced = np.delete(prod_feats, dropped_indexes)
                product_category_text = l[2].strip()
                product_label = category_mapping[product_category_text]
                y_label.append(product_label)
                X_product_features.append(prod_feats_reduced)
            except Exception as e:
                print(e)
    return np.array(X_product_features), np.array(y_label)

def get_product_to_features():
    product_feats_file = 'data-outfits/outfit_product_features.tsv'
    product_to_feats = {}
    with open(product_feats_file, 'r') as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter='\t')
        for row in tsvreader:
            product_id = row[0]
            feats = np.array([[float(n) for n in row[1:]]])
            product_to_feats[product_id] = feats
    return product_to_feats

In [23]:
rf_feature_import_file = 'rf_feat_import.dat'
# rf_model.feature_importances_.dump(rf_feature_import_file)
all_feat_importances = np.load(rf_feature_import_file)
product_file_path = 'data-outfits/outfit_products.tsv'
top_features = sorted(list(zip(range(0, 1024), all_feat_importances)), key=lambda tup: tup[1], reverse=True)
category_to_label, label_to_category = map_category_to_label()
prod_to_feats = get_product_to_features()

In [39]:
bottom_feature_indexes = [f[0] for f in top_features[200:]]
X, y = make_data_inputs(product_file_path, category_to_label, prod_to_feats, bottom_feature_indexes)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=5)

Reading file  data-outfits/outfit_products.tsv


In [36]:
rf_model = RandomForestClassifier(n_estimators=1000)
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [37]:
np.mean(rf_model.predict(X_test) == y_test)

0.91117802032367334

In [38]:
len(X_test[0]), len(X_test)

(300, 5314)

In [34]:
a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
index = [0, 1]
new_a = np.delete(a, index)
new_a

array([3, 4, 5, 6, 7, 8, 9])