## Cluster

In [46]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing
from sklearn.cluster import AgglomerativeClustering, KMeans, Birch
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [47]:
# Read the per-descriptor feature tables for both splits. The 'Unnamed: 0'
# column (presumably a row index saved alongside the data -- confirm against
# the script that wrote these CSVs) is discarded straight after loading.
train_df = pd.read_csv('train_features.csv').drop(columns='Unnamed: 0')
test_df = pd.read_csv('test_features.csv').drop(columns='Unnamed: 0')

In [49]:
# Preview the training table: one row per descriptor, carrying its picture
# name, material label, ids, and the numbered feature columns.
train_df

Unnamed: 0,picture_name,material,feature_id,picture_id,0,1,2,3,4,5,...,22,23,24,25,26,27,28,29,30,31
0,cardboard103.jpg,cardboard,0,5,44,92,116,95,69,156,...,46,189,6,235,112,128,219,253,69,241
1,cardboard103.jpg,cardboard,1,5,54,45,243,84,214,249,...,168,96,55,8,5,171,229,121,71,166
2,cardboard103.jpg,cardboard,2,5,15,62,180,161,189,191,...,75,242,206,85,5,247,218,237,9,69
3,cardboard103.jpg,cardboard,3,5,63,7,2,163,180,183,...,104,190,170,213,171,196,151,143,248,213
4,cardboard103.jpg,cardboard,4,5,15,150,113,139,252,181,...,45,215,14,30,58,206,158,142,169,213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216132,trash91.jpg,trash,495,2518,15,246,99,1,242,251,...,44,254,46,92,47,230,207,199,139,213
216133,trash91.jpg,trash,496,2518,64,155,159,221,40,202,...,35,13,197,182,114,16,64,162,69,121
216134,trash91.jpg,trash,497,2518,226,231,99,12,121,80,...,166,223,50,116,158,159,87,84,255,156
216135,trash91.jpg,trash,498,2518,190,143,74,188,242,146,...,4,255,242,247,254,195,181,244,174,215


In [50]:
# Preview the test table; it has the same column layout as the training table.
test_df

Unnamed: 0,picture_name,material,feature_id,picture_id,0,1,2,3,4,5,...,22,23,24,25,26,27,28,29,30,31
0,cardboard1.jpg,cardboard,0,0,170,149,97,138,213,243,...,125,159,40,84,138,238,159,14,244,245
1,cardboard1.jpg,cardboard,1,0,11,220,0,159,119,147,...,15,255,44,118,219,195,30,44,185,223
2,cardboard1.jpg,cardboard,2,0,129,216,120,89,116,201,...,37,119,70,14,178,68,95,152,96,233
3,cardboard1.jpg,cardboard,3,0,10,232,240,179,231,57,...,202,227,78,87,222,43,150,221,141,206
4,cardboard1.jpg,cardboard,4,0,131,238,240,147,230,56,...,142,227,14,86,222,174,134,213,129,141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814298,trash99.jpg,trash,495,2526,31,87,235,207,119,118,...,45,237,118,127,166,129,247,255,252,253
814299,trash99.jpg,trash,496,2526,123,16,244,204,45,85,...,115,209,147,249,248,187,95,140,124,119
814300,trash99.jpg,trash,497,2526,122,18,238,220,47,82,...,167,200,83,115,88,155,87,12,124,95
814301,trash99.jpg,trash,498,2526,143,166,113,235,110,57,...,111,211,94,217,171,236,247,207,113,225


In [52]:
# Separate each table into its descriptor matrix (the columns named '0'
# through '31') and its material label.
feat_nums = [str(col) for col in range(32)]
Xtrain, ytrain = train_df[feat_nums], train_df['material']
Xtest, ytest = test_df[feat_nums], test_df['material']

In [53]:
# Preview the training descriptor matrix (feature columns only).
Xtrain

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,44,92,116,95,69,156,165,178,84,94,...,46,189,6,235,112,128,219,253,69,241
1,54,45,243,84,214,249,166,194,187,176,...,168,96,55,8,5,171,229,121,71,166
2,15,62,180,161,189,191,155,210,199,189,...,75,242,206,85,5,247,218,237,9,69
3,63,7,2,163,180,183,74,121,21,77,...,104,190,170,213,171,196,151,143,248,213
4,15,150,113,139,252,181,186,219,31,89,...,45,215,14,30,58,206,158,142,169,213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216132,15,246,99,1,242,251,187,219,111,233,...,44,254,46,92,47,230,207,199,139,213
216133,64,155,159,221,40,202,33,51,236,28,...,35,13,197,182,114,16,64,162,69,121
216134,226,231,99,12,121,80,142,111,155,75,...,166,223,50,116,158,159,87,84,255,156
216135,190,143,74,188,242,146,94,231,57,15,...,4,255,242,247,254,195,181,244,174,215


In [54]:
# Preview the training labels: one material string per descriptor row.
ytrain

0         cardboard
1         cardboard
2         cardboard
3         cardboard
4         cardboard
            ...    
216132        trash
216133        trash
216134        trash
216135        trash
216136        trash
Name: material, Length: 216137, dtype: object

In [59]:
# Fit the scaler and the 25-component PCA on the training descriptors only,
# then run both splits through those same fitted transforms so the test data
# never influences the projection.
scaler = StandardScaler()
scaler.fit(Xtrain)
std_Xtrain, std_Xtest = (scaler.transform(split) for split in (Xtrain, Xtest))
pca = PCA(n_components=25).fit(std_Xtrain)
pca_Xtrain, pca_Xtest = (pca.transform(split) for split in (std_Xtrain, std_Xtest))

In [60]:
# Running total of the explained-variance ratio over the fitted components;
# the last entry is the share of variance retained by the PCA projection.
print(pca.explained_variance_ratio_.cumsum())

[0.1147732  0.21874557 0.29676284 0.36462422 0.41931737 0.46982177
 0.51572129 0.55432155 0.59054738 0.62433349 0.65380589 0.68159738
 0.7070253  0.73065149 0.75318034 0.77435708 0.79395408 0.81311517
 0.83095159 0.84775115 0.86314136 0.87801424 0.89218216 0.90590341
 0.91917458]


In [128]:
def add_cluster_cols(df, n):
    """Count, for every picture, how many of its rows fall in each cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per descriptor. Must contain a ``picture_name`` column and a
        ``cluster`` column holding that row's cluster label.
    n : int
        Number of clusters. The result always has the columns ``cluster_0``
        through ``cluster_{n-1}``, in that order.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``picture_name`` (in order of first appearance in
        ``df``), with ``picture_name`` as the first column followed by the
        ``n`` count columns. Clusters a picture never hits are reported as 0.
        Labels that do not map onto one of the ``n`` column names are ignored.
    """
    cluster_cols = ['cluster_%s' % i for i in range(n)]
    if df.empty:
        # Nothing to count: still hand back the full, correctly named schema.
        return pd.DataFrame(columns=['picture_name'] + cluster_cols)

    # Build the column tag for every row once instead of row-by-row.
    tags = 'cluster_' + df['cluster'].astype(str)
    counts = pd.crosstab(df['picture_name'], tags)
    # crosstab sorts its rows and only emits columns it actually saw, so
    # restore first-appearance picture order and force all n cluster columns
    # (the column list follows n rather than a fixed size of 50).
    counts = counts.reindex(
        index=df['picture_name'].unique(),
        columns=cluster_cols,
        fill_value=0,
    )
    counts.index.name = 'picture_name'
    counts.columns.name = None
    return counts.reset_index()

In [144]:
%%time
# Build a bag-of-visual-words table for several vocabulary sizes.
# For each n: fit KMeans on the PCA-reduced training descriptors, label every
# train/test descriptor with its cluster, count clusters per picture, re-attach
# the material label, and write one train and one test CSV.
materials = list(np.unique(train_df['material']))
# Fixed seed so the cluster assignments (and the CSVs derived from them) are
# the same on every Restart & Run All.
kmeans_seed = 0
# to_csv does not create missing directories; make sure the target exists
# before spending time on the clustering.
os.makedirs('cluster_dfs', exist_ok=True)
for n in [50, 100, 200, 300, 500]:
    print()
    print('################')
    print('%s clusters:'%n)
    # fit cluster
    print('fitting clusters')
    kmeans = KMeans(n_clusters=n, random_state=kmeans_seed).fit(pca_Xtrain)
    print('predicting cluster labels')
    # get train labels (stored as strings; overwritten on every pass of the loop)
    train_labels = kmeans.predict(pca_Xtrain)
    train_df['cluster'] = train_labels
    train_df['cluster'] = train_df['cluster'].astype(str)
    # get test labels
    test_labels = kmeans.predict(pca_Xtest)
    test_df['cluster'] = test_labels
    test_df['cluster'] = test_df['cluster'].astype(str)
    ## make visual bow
    print('making visual bow')
    train_dfdict = {}
    test_dfdict = {}

    for m in materials:
        print('  %s - rearranging cols'%m)
        # material-specific rows, re-indexed from 0 (single .loc lookup
        # instead of chained indexing)
        sub_train_df = train_df.loc[train_df['material'] == m, ['picture_name', 'cluster']].reset_index(drop=True)
        sub_test_df = test_df.loc[test_df['material'] == m, ['picture_name', 'cluster']].reset_index(drop=True)
        # translate the categorical cluster column into per-picture counts,
        # saved by material type
        train_dfdict[m] = add_cluster_cols(sub_train_df, n)
        test_dfdict[m] = add_cluster_cols(sub_test_df, n)

    # generate whole dataframe
    focus_cols = ['picture_name']
    focus_cols += ['cluster_%s'%i for i in range(n)]
    cluster_cols = focus_cols[1:]

    train_parts = []
    test_parts = []
    for m in materials:
        print(' %s - adding to main df'%m)
        train_parts.append(train_dfdict[m])
        test_parts.append(test_dfdict[m])

    # DataFrame.append was removed in pandas 2.0, so concatenate all materials
    # in one call. reindex guarantees every cluster_0..cluster_{n-1} column is
    # present, in order, even when a per-material table lacks some of them.
    model_train_df = pd.concat(train_parts, ignore_index=True).reindex(columns=focus_cols)
    model_test_df = pd.concat(test_parts, ignore_index=True).reindex(columns=focus_cols)
    # a cluster that never occurs in a picture is a count of zero; counts are
    # whole numbers, so store them as ints
    model_train_df = model_train_df.fillna(0).astype({col: int for col in cluster_cols})
    model_test_df = model_test_df.fillna(0).astype({col: int for col in cluster_cols})
    print(model_train_df.isnull().values.any())

    # remerge material labels (one row per picture/material pair)
    name_mat_train_df = train_df[['material', 'picture_name']].drop_duplicates(subset=['material', 'picture_name'])
    name_mat_test_df = test_df[['material', 'picture_name']].drop_duplicates(subset=['material', 'picture_name'])
    model_train_df = pd.merge(model_train_df, name_mat_train_df, how='left', on='picture_name')
    model_test_df = pd.merge(model_test_df, name_mat_test_df, how='left', on='picture_name')

    # write to csv
    print('writing to csv')
    model_train_df.to_csv('cluster_dfs/train_kmeans%s.csv'%n)
    model_test_df.to_csv('cluster_dfs/test_kmeans%s.csv'%n)


################
50 clusters:
fitting clusters
predicting cluster labels
making visual bow
  cardboard - rearranging cols
  glass - rearranging cols
  metal - rearranging cols
  paper - rearranging cols
  plastic - rearranging cols
  trash - rearranging cols
 cardboard - adding to main df
 glass - adding to main df
 metal - adding to main df
 paper - adding to main df
 plastic - adding to main df
 trash - adding to main df
False
writing to csv

################
100 clusters:
fitting clusters
predicting cluster labels
making visual bow
  cardboard - rearranging cols
  glass - rearranging cols
  metal - rearranging cols
  paper - rearranging cols
  plastic - rearranging cols
  trash - rearranging cols
 cardboard - adding to main df
 glass - adding to main df
 metal - adding to main df
 paper - adding to main df
 plastic - adding to main df
 trash - adding to main df
False
writing to csv

################
200 clusters:
fitting clusters
predicting cluster labels
making visual bow
  cardbo