In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, MiniBatchKMeans
import csv
from sklearn.neighbors import KDTree, NearestNeighbors
import cv2
import matplotlib.pyplot as plt

%matplotlib inline

## Clustering Images

In [15]:
def get_product_to_features(product_feats_file,
                            rf_feature_import_file='rf_feat_import.dat',
                            n_top_features=150):
    """Load per-product feature vectors, keeping only the most important features.

    Feature importances are loaded with ``np.load`` from
    ``rf_feature_import_file``. The ``n_top_features`` positions with the
    highest importance are kept (in their original column order) and every
    other ranked position is deleted from each product's feature row.

    Parameters
    ----------
    product_feats_file : str
        Path to a tab-separated file. The first column of each row is the
        product id; the remaining columns are parsed as floats.
    rf_feature_import_file : str, optional
        Path to the feature-importance array readable by ``np.load``.
        Defaults to ``'rf_feat_import.dat'``.
    n_top_features : int, optional
        Number of highest-importance feature positions to keep. Defaults to 150.

    Returns
    -------
    dict
        Maps each product id (str) to a NumPy array of shape ``(1, k)``
        holding the retained feature values for that product.
    """
    all_feat_importances = np.load(rf_feature_import_file)

    # Rank feature positions by descending importance. sorted() is stable, so
    # positions with equal importance keep their original relative order.
    ranked_indexes = sorted(range(len(all_feat_importances)),
                            key=lambda idx: all_feat_importances[idx],
                            reverse=True)
    # Everything after the first n_top_features ranked positions is dropped.
    bottom_feature_indexes = ranked_indexes[n_top_features:]

    product_to_feats = {}
    # newline='' is the csv module's recommended way to open its input files.
    with open(product_feats_file, 'r', newline='') as tsvfile:
        for row in csv.reader(tsvfile, delimiter='\t'):
            if not row:
                # csv.reader yields [] for blank lines; there is no id to read.
                continue
            product_id = row[0]
            feats_stored = [float(n) for n in row[1:]]
            feats_reduced = np.delete(feats_stored, bottom_feature_indexes)
            # Stored as a single-row 2-D array; callers index [0] to get the vector.
            product_to_feats[product_id] = np.array([feats_reduced])
    return product_to_feats


def make_data_inputs(product_file, product_to_features):
    """Pair each product listed in ``product_file`` with its feature vector.

    Each line of ``product_file`` is split on tabs; column 0 is read as the
    product id, column 2 as the category and column 3 as the name. Lines with
    too few columns, or whose product id is not a key of
    ``product_to_features``, are reported on stdout and skipped.

    Parameters
    ----------
    product_file : str
        Path to the tab-separated product listing.
    product_to_features : mapping
        Maps product id to an indexable whose element ``[0]`` is the
        product's feature vector.

    Returns
    -------
    tuple
        ``(X_product_features, X_product_ids)``: two parallel lists. The first
        holds the feature vectors, the second holds
        ``(product_id, product_category, product_name)`` tuples.
    """
    print('Reading file ', product_file)
    X_product_features = []
    X_product_ids = []
    with open(product_file, 'r') as f:
        # Iterate the file lazily instead of materialising (and copying) all lines.
        for line_number, line in enumerate(f, start=1):
            # Drop the line terminator so it cannot leak into the last column.
            fields = line.rstrip('\n').split('\t')
            try:
                product_id = fields[0]
                product_category = fields[2]
                product_name = fields[3]
                features = product_to_features[product_id][0]
            except (IndexError, KeyError) as e:
                # Best effort: a short row or a product without features is
                # skipped, but any other error is a real bug and propagates.
                print('Skipping line {} of {}: {!r}'.format(line_number, product_file, e))
                continue
            X_product_features.append(features)
            X_product_ids.append((product_id, product_category, product_name))
    return X_product_features, X_product_ids
    

In [5]:
# Input files for the "small" outfit dataset, given as relative paths.
product_feats_file = 'data-outfits/outfit_product_features_small.tsv'
# Not referenced by the cells visible here; presumably consumed further down — TODO confirm.
outfit_permutations_file = 'data-outfits/outfit_permutations_small.tsv'
# Build the product id -> feature-array mapping used by the clustering and neighbor cells.
prods_to_feats = get_product_to_features(product_feats_file)

In [6]:
# Sanity check: length of the feature vector kept for one known product id.
len(prods_to_feats['641336745'][0])

150

In [20]:
# Pair every listed product with its feature vector; X_prods holds
# (product_id, category, name) tuples parallel to X_features.
X_features, X_prods = make_data_inputs('data-outfits/outfit_products_small.tsv', prods_to_feats)
# Split out the tuple columns needed for the cluster table below.
X_prod_ids = [p[0] for p in X_prods]
X_prod_categories = [p[1] for p in X_prods]
# Number of products that were matched to a feature vector.
len(X_features)

Reading file  data-outfits/outfit_products_small.tsv


18673

In [24]:
# Cluster the product feature vectors into 15 groups. A fixed random_state
# makes the cluster assignments reproducible across kernel restarts; without
# it MiniBatchKMeans can label the clusters differently on every run.
km = MiniBatchKMeans(n_clusters=15, random_state=0)
product_clusters = km.fit_predict(X_features)
# One row per product: its id, assigned cluster label and category.
df_product_cluster = pd.DataFrame({'Id': X_prod_ids, 'Cluster': product_clusters, 'Category': X_prod_categories})
df_product_cluster.head(100)

Unnamed: 0,Category,Cluster,Id
0,womens-shoes,6,650970985
1,handbags,2,454257705
2,womens-shoes,1,544695778
3,womens-tops,4,660051520
4,handbags,14,602367776
5,womens-tops,0,605272038
6,jewelry,13,313376737
7,skirts,8,624270178
8,jewelry,3,629465287
9,handbags,14,613159476


In [25]:
def display_images(image_paths):
    """Show the images at ``image_paths`` side by side in a single row.

    Parameters
    ----------
    image_paths : sequence of str
        Paths of the image files to display, left to right.

    Notes
    -----
    Nothing is drawn for an empty sequence. A path that OpenCV cannot read
    is reported on stdout and its panel is left blank instead of aborting
    the whole figure.
    """
    image_count = len(image_paths)
    if image_count == 0:
        # plt.subplots cannot build a grid with zero columns.
        return

    # squeeze=False always returns a 2-D array of Axes, so a single image is
    # indexed exactly like several (the default returns a bare Axes for 1x1).
    fig, ax = plt.subplots(1, image_count, figsize=(image_count*5, 4), squeeze=False)

    for axis, image_file in zip(ax[0], image_paths):
        image = cv2.imread(image_file)
        if image is None:
            # cv2.imread signals an unreadable/missing file by returning None.
            print('Could not read image: {}'.format(image_file))
            axis.axis("off")
            continue
        # OpenCV loads images as BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        axis.imshow(image, interpolation='nearest')
        axis.axis("off")
    plt.show()

def display_product_images(product_ids):
    """Display the stored JPEG of each product id, in the order given.

    Each id is turned into a path under ``data-outfits/images_small/`` with a
    ``.jpg`` extension, and the resulting list is handed to ``display_images``.
    """
    image_files = []
    for pid in product_ids:
        image_files.append('data-outfits/images_small/' + pid + '.jpg')
    display_images(image_files)

    
def get_prod_feat_list(product_to_features=None):
    """Flatten a product -> features mapping into a row-indexed feature matrix.

    Parameters
    ----------
    product_to_features : mapping, optional
        Maps product id to an indexable whose element ``[0]`` is the
        product's feature vector. When omitted, the notebook-level
        ``prods_to_feats`` mapping is used, as before.

    Returns
    -------
    tuple
        ``(index_to_prod, prod_features)``. ``index_to_prod`` maps a row
        number to the product id stored in that row; ``prod_features`` is a
        NumPy array built from the feature vectors in mapping iteration order.
    """
    if product_to_features is None:
        # Backward-compatible default: fall back to the notebook global.
        product_to_features = prods_to_feats

    index_to_prod = {}
    prod_features = []
    for i, (prod_id, feat) in enumerate(product_to_features.items()):
        index_to_prod[i] = prod_id
        # Each value holds its vector in element [0].
        prod_features.append(feat[0])
    return index_to_prod, np.array(prod_features)

# Flatten the product -> features mapping into a matrix, keeping a
# row-number -> product-id lookup alongside it.
index_to_prod, prod_features = get_prod_feat_list()
print('Making Nearest Neighbors ...')
# Alternative configuration kept for reference: cosine metric with brute-force search.
#neighbors_model = NearestNeighbors(n_neighbors=20, metric='cosine', algorithm='brute')
# Active configuration: 'l2' metric, 20 neighbors per query by default,
# search algorithm left to scikit-learn ('auto').
neighbors_model = NearestNeighbors(n_neighbors=20, metric='l2', algorithm='auto')
# Fit on the feature matrix; as the cell's last expression this also displays the model.
neighbors_model.fit(prod_features)

Making Nearest Neighbors ...


NearestNeighbors(algorithm='auto', leaf_size=30, metric='l2',
         metric_params=None, n_jobs=1, n_neighbors=20, p=2, radius=1.0)

In [21]:
# Scratch check: joining single-row 2-D arrays column-wise yields one longer row,
# here [[1, 2, 3, 4, 5, 6, 1, 2, 3]] with shape (1, 9).
a = np.array([[1, 2, 3]])
b = np.array([[4, 5, 6]])
ab = np.hstack((a, b, a))