In [1]:
import pandas as pd
import numpy as np
import os, sys, json, pickle, time
from collections import Counter

In [2]:
with open('../data/512_train.pickle','rb') as f:
  train_data = pickle.load(f)
  
with open('../data/512_test.pickle','rb') as f:
  test_data = pickle.load(f)
  
with open('../data/512_validation.pickle','rb') as f:
  validation_data = pickle.load(f)

In [12]:
Xtrain, ytrain = train_data[:,1:-1], train_data[:,-1]
Xvalidation, yvalidation = validation_data[:,1:-1], validation_data[:,-1]
Xtest = test_data[:,1:]
Xcombined = np.r_[Xtrain,Xvalidation]
ytrn = np.r_[ytrain,yvalidation]

In [35]:
Xcombined.shape, ytrn.shape

((198480, 512), (198480,))

In [6]:
test_id = [int(i) for i in test_data[:,0]]

In [61]:
# The submission requires 12800 entries
# Only a couple pictures are deleted
# So we randomly sample classes to fill up missing classes
# Potentially we can look a some kind of distribution of frequency to sample from


with open('../data/furniture_train.json') as f:
  jsn_file = json.load(f)

trn_labels = [i['label_id'] for i in jsn_file['annotations']]

prob_dist = np.array(list(Counter(trn_labels).values()))
prob_dist += 50
prob_dist = prob_dist/prob_dist.sum()

def generate_submision(test_id,prediction, prob_dist = prob_dist):
  test_id = [int(i) for i in test_id]
  prediction = [int(i) for i in prediction]
  all_id = set(range(1,12801))
  missing = list(all_id - set(test_id))
  rand_label = np.random.choice(range(1,len(prob_dist)+1), len(missing), p = prob_dist)

  pred = [int(i) for i in prediction]
  pred.extend(rand_label)
  test_id.extend(missing)

  answers = pd.DataFrame(columns = ['id','predicted'])
  answers['predicted'] = pred

  answers['id'] = test_id
  return answers

# Forward feature selection
---

Let first try what they did in the exercises. So forward feature selection:
Lets (as in the exercise) try to find 5 features that are "good" using a random sample of 10000.

---

In [9]:
from sklearn.neighbors import KNeighborsClassifier

#numpy.random.seed(1)
rnd_list = np.random.randint(0, train_data.shape[0],10000)
rnd_trn, rnd_val = rnd_list[:8000], rnd_list[8000:]

Xtrn, ytrn = Xtrain[rnd_trn], ytrain[rnd_trn]
Xval, yval = Xtrain[rnd_val], ytrain[rnd_val]

In [10]:
start = time.time()

good_features = []
val_scores = []

feature_list = list(range(Xtrain.shape[1]))

for j in range(1,6):
    scores = []
    for i in feature_list:
        feature_slice = good_features+[i]
        X = Xtrn[:,feature_slice]
        model = KNeighborsClassifier(n_neighbors=10)
        model.fit(X, ytrn)
        scores.append(1- model.score(Xval[:,feature_slice],yval))
    
    best_ind = np.argmin(scores)
    best_feature = feature_list[best_ind]
    
    val_scores.append(scores[best_ind])
    good_features.append(best_feature)
    del feature_list[best_ind]
    print(good_features)
    
print('Seconds to run: {}'.format(time.time() - start))

[175]
[175, 181]
[175, 181, 8]
[175, 181, 8, 227]
[175, 181, 8, 227, 38]
Seconds to run: 109.23191905021667


In [11]:
val_scores #Note these are error scores

[0.974, 0.9655, 0.9605, 0.9515, 0.944]

### Conclusion

It seems that 5 features is not enough to get a proper score. maybe 10000 is also not enough to do this. 
Also the exercise data had 54 features, we have 256, so maybe cranking up the number of features desired. 

---
# PCA

Also to furter reduce the number of features, lets look at PCA

----

In [13]:
from sklearn.decomposition import PCA

pca= PCA()
pca.fit(Xcombined)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [19]:
np.where(pca.explained_variance_ratio_.cumsum()> 0.85)[0][0]

68

About 85% of the variance in data can be expressed using only 68 principle components. This seems like a nice number.

So lets transform the data and run a couple classifiers.

In [26]:
TXtrain = pca.transform(Xtrain)[:,:68]
TXvalidation = pca.transform(Xvalidation)[:,:68]
TXtest = pca.transform(Xtest)[:,:68]
TXtrn = np.r_[TXtrain,TXvalidation]

### K-Nearest Neighbors

In [22]:
# K nearest neighbor classifier
# Score is the mean accuracy

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 15)
knn.fit(TXtrain,ytrain)
knn.score(TXvalidation, yvalidation)

0.3152639087018545

### Random Forest

In [23]:
# Random Forest Classifier
# Score is the mean accuracy

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 10)
rfc.fit(TXtrain, ytrain)
rfc.score(TXvalidation,yvalidation)

0.20684736091298145

### Multilayer Perceptron Classifier



In [24]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier((256), verbose =True, max_iter = 200, warm_start = True, batch_size = 500, early_stopping = True)

In [25]:
# A multilayer Preceptron
# Score is the mean accuracy again

mlp.fit(TXtrn, ytrn)


Iteration 1, loss = 3.31036376
Validation score: 0.298368
Iteration 2, loss = 2.72660043
Validation score: 0.331217
Iteration 3, loss = 2.61344649
Validation score: 0.348499
Iteration 4, loss = 2.54626077
Validation score: 0.355250
Iteration 5, loss = 2.49556186
Validation score: 0.359986
Iteration 6, loss = 2.45674818
Validation score: 0.367543
Iteration 7, loss = 2.42418411
Validation score: 0.371020
Iteration 8, loss = 2.39593239
Validation score: 0.372380
Iteration 9, loss = 2.37274409
Validation score: 0.376108
Iteration 10, loss = 2.35240834
Validation score: 0.377821
Iteration 11, loss = 2.33473572
Validation score: 0.379736
Iteration 12, loss = 2.31881972
Validation score: 0.382104
Iteration 13, loss = 2.30556054
Validation score: 0.383364
Iteration 14, loss = 2.29238666
Validation score: 0.385883
Iteration 15, loss = 2.28173481
Validation score: 0.385631
Iteration 16, loss = 2.27082250
Validation score: 0.385429
Iteration 17, loss = 2.26200357
Validation score: 0.386336
Iterat

MLPClassifier(activation='relu', alpha=0.0001, batch_size=500, beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=256, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=True)

In [27]:
with open('mlp_b500_l256_early_stopping_T68.pickle','wb') as f:
  pickle.dump(mlp,f)
  
prediction = mlp.predict(TXtest)

In [67]:
answers = generate_submision(te_data[:,0],prediction)

ValueError: Length of values does not match length of index

In [69]:
len(prediction), len(tes)

12704

In [31]:
answers.to_csv('512_prediction_b500_l256_early_T68.csv', index = False)

-----

# Running without PCA

In [32]:
mlp2 = MLPClassifier((256), verbose =True, max_iter = 200, warm_start = True, batch_size = 500, early_stopping = True)

In [36]:
mlp2.fit(Xcombined, ytrn)

Iteration 1, loss = 3.29350551
Validation score: 0.310107
Iteration 2, loss = 2.66253791
Validation score: 0.359180
Iteration 3, loss = 2.50039962
Validation score: 0.379484
Iteration 4, loss = 2.40327977
Validation score: 0.392533
Iteration 5, loss = 2.33633280
Validation score: 0.404575
Iteration 6, loss = 2.28257062
Validation score: 0.412183
Iteration 7, loss = 2.23922688
Validation score: 0.413392
Iteration 8, loss = 2.20296474
Validation score: 0.419690
Iteration 9, loss = 2.17168363
Validation score: 0.424929
Iteration 10, loss = 2.13950857
Validation score: 0.426743
Iteration 11, loss = 2.11361847
Validation score: 0.429565
Iteration 12, loss = 2.09076067
Validation score: 0.429414
Iteration 13, loss = 2.06803216
Validation score: 0.431782
Iteration 14, loss = 2.04532415
Validation score: 0.435611
Iteration 15, loss = 2.02631241
Validation score: 0.433797
Iteration 16, loss = 2.00783199
Validation score: 0.433746
Iteration 17, loss = 1.99024556
Validation score: 0.432487
Valida

MLPClassifier(activation='relu', alpha=0.0001, batch_size=500, beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=256, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=True)

It seems that we get still some nice improvement not using PCA. Still but we also took 7 times more features. So still maybe some kind of PCA might be wanted. 

In [59]:
predict = mlp2.predict(Xtest)

In [63]:
submission = generate_submision(test_data[:,0], predict)

In [64]:
answers.to_csv('512_prediction_b500_l256_early.csv', index = False)