In [1]:
from __future__ import division, print_function
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
%matplotlib inline

  (fname, cnt))
  (fname, cnt))


In [2]:
#In [2]:
DATA_DIR = ""
IMAGE_DIR = os.path.join(DATA_DIR, "holiday-photos/jpg")
WEIGHTS_DIR = os.path.join(DATA_DIR, "weights")

In [5]:
#In [3]:
def get_holiday_triples(image_dir):
    image_groups = {}
    for image_name in os.listdir(image_dir):
        base_name = image_name[0:-4]
        group_name = base_name[0:4]
        if group_name in image_groups:
            image_groups[group_name].append(image_name)
        else:
            image_groups[group_name] = [image_name]
    num_sims = 0
    image_triples = []
    group_list = sorted(list(image_groups.keys()))
    for i, g in enumerate(group_list):
        if num_sims % 100 == 0:
            print("Generated {:d} pos + {:d} neg = {:d} total image triples"
                  .format(num_sims, num_sims, 2*num_sims))
        images_in_group = image_groups[g]
        sim_pairs_it = itertools.combinations(images_in_group, 2)
        # for each similar pair, generate a corresponding different pair
        for ref_image, sim_image in sim_pairs_it:
            image_triples.append((ref_image, sim_image, 1))
            num_sims += 1
            while True:
                j = np.random.randint(low=0, high=len(group_list), size=1)[0]
                if j != i:
                    break
            dif_image_candidates = image_groups[group_list[j]]
            k = np.random.randint(low=0, high=len(dif_image_candidates), size=1)[0]
            dif_image = dif_image_candidates[k]
            image_triples.append((ref_image, dif_image, 0))
    print("Generated {:d} pos + {:d} neg = {:d} total image triples"
          .format(num_sims, num_sims, 2*num_sims))
    return image_triples

def load_vectors(vector_file):
    vec_dict = {}
    fvec = open(vector_file, "r")
    for line in fvec:
        image_name, image_vec = line.strip().split("\t")
        vec = np.array([float(v) for v in image_vec.split(",")])
        vec_dict[image_name] = vec
    fvec.close()
    return vec_dict

def preprocess_data(vector_file, train_size=0.7):
    xdata, ydata = [], []
    vec_dict = load_vectors(vector_file)
    for image_triple in image_triples:
        X1 = vec_dict[image_triple[0]]
        X2 = vec_dict[image_triple[1]]
#         xdata.append(np.multiply(X1, X2) / (np.linalg.norm(X1, 2) * np.linalg.norm(X2, 2)))
        xdata.append(np.power(np.subtract(X1, X2), 2))
        ydata.append(image_triple[2])
    X, y = np.array(xdata), np.array(ydata)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size)
    return Xtrain, Xtest, ytrain, ytest

def cross_validate(X, y, clf, k=10):
    best_score, best_clf = 0.0, None
    kfold = KFold(k)
    for kid, (train, test) in enumerate(kfold.split(X, y)):
        Xtrain, Xtest, ytrain, ytest = X[train], X[test], y[train], y[test]
        clf.fit(Xtrain, ytrain)
        ytest_ = clf.predict(Xtest)
        score = accuracy_score(ytest_, ytest)
        print("fold {:d}, score: {:.3f}".format(kid, score))
        if score > best_score:
            best_score = score
            best_clf = clf
    return best_clf, best_score

def test_report(clf, Xtest, ytest):
    ytest_ = clf.predict(Xtest)
    print("\nAccuracy Score: {:.3f}".format(accuracy_score(ytest_, ytest)))
    print("\nConfusion Matrix")
    print(confusion_matrix(ytest_, ytest))
    print("\nClassification Report")
    print(classification_report(ytest_, ytest))
    
def get_model_file(data_dir, vec_name, clf_name):
    return os.path.join(data_dir, "models", "H-{:s}-{:s}-l2.pkl"
                        .format(vec_name, clf_name))

def save_model(model, model_file):
    joblib.dump(model, model_file)

In [4]:
# In [4]:
image_triples = get_holiday_triples(IMAGE_DIR)
#In [5]:
NUM_VECTORIZERS = 5
NUM_CLASSIFIERS = 4
scores = np.zeros((NUM_VECTORIZERS, NUM_CLASSIFIERS))

Generated 0 pos + 0 neg = 0 total image triples
Generated 300 pos + 300 neg = 600 total image triples
Generated 1100 pos + 1100 neg = 2200 total image triples
Generated 1800 pos + 1800 neg = 3600 total image triples
Generated 2072 pos + 2072 neg = 4144 total image triples


In [None]:
#In [6]:
VECTOR_FILE = os.path.join(DATA_DIR, "vgg16-vectors.tsv")
Xtrain, Xtest, ytrain, ytest = preprocess_data(VECTOR_FILE)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)
#(2900, 4096) (1244, 4096) (2900,) (1244,)
#Naive Bayes
#In [7]:
clf = GaussianNB()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[0, 0] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "vgg16", "nb"))


# SVM
# In [8]:
clf = LinearSVC()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[0, 1] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "vgg16", "svm"))


# XGBoost
# In [9]:
clf = XGBClassifier()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[0, 2] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "vgg16", "xgb"))

# Random Forest
# In [10]:
clf = RandomForestClassifier()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[0, 3] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "vgg16", "rf"))



(2900, 4096) (1244, 4096) (2900,) (1244,)
fold 0, score: 0.834
fold 1, score: 0.793
fold 2, score: 0.800
fold 3, score: 0.800
fold 4, score: 0.807
fold 5, score: 0.845
fold 6, score: 0.803
fold 7, score: 0.800
fold 8, score: 0.807
fold 9, score: 0.803

Accuracy Score: 0.818

Confusion Matrix
[[512  91]
 [135 506]]

Classification Report
             precision    recall  f1-score   support

          0       0.79      0.85      0.82       603
          1       0.85      0.79      0.82       641

avg / total       0.82      0.82      0.82      1244

fold 0, score: 0.910
fold 1, score: 0.910
fold 2, score: 0.876
fold 3, score: 0.879
fold 4, score: 0.900
fold 5, score: 0.845
fold 6, score: 0.876
fold 7, score: 0.907
fold 8, score: 0.910
fold 9, score: 0.903

Accuracy Score: 0.881

Confusion Matrix
[[545  46]
 [102 551]]

Classification Report
             precision    recall  f1-score   support

          0       0.84      0.92      0.88       591
          1       0.92      0.84      0.88

  if diff:


fold 0, score: 0.897


  if diff:


fold 1, score: 0.897


  if diff:


fold 2, score: 0.917


  if diff:


fold 3, score: 0.934


  if diff:


fold 4, score: 0.917


  if diff:


fold 5, score: 0.917


  if diff:


fold 6, score: 0.948


  if diff:


fold 7, score: 0.928


  if diff:


fold 8, score: 0.924


  if diff:
  if diff:


fold 9, score: 0.938

Accuracy Score: 0.924

Confusion Matrix
[[596  44]
 [ 51 553]]

Classification Report
             precision    recall  f1-score   support

          0       0.92      0.93      0.93       640
          1       0.93      0.92      0.92       604

avg / total       0.92      0.92      0.92      1244

fold 0, score: 0.876
fold 1, score: 0.859
fold 2, score: 0.890
fold 3, score: 0.831
fold 4, score: 0.890
fold 5, score: 0.872
fold 6, score: 0.897
fold 7, score: 0.841
fold 8, score: 0.852
fold 9, score: 0.852

Accuracy Score: 0.867

Confusion Matrix
[[602 120]
 [ 45 477]]

Classification Report
             precision    recall  f1-score   support

          0       0.93      0.83      0.88       722
          1       0.80      0.91      0.85       522

avg / total       0.88      0.87      0.87      1244





(2900, 4096) (1244, 4096) (2900,) (1244,)
fold 0, score: 0.790
fold 1, score: 0.797
fold 2, score: 0.779
fold 3, score: 0.772
fold 4, score: 0.803
fold 5, score: 0.776
fold 6, score: 0.841
fold 7, score: 0.831
fold 8, score: 0.786
fold 9, score: 0.810

Accuracy Score: 0.809

Confusion Matrix
[[490 109]
 [128 517]]

Classification Report
             precision    recall  f1-score   support

          0       0.79      0.82      0.81       599
          1       0.83      0.80      0.81       645

avg / total       0.81      0.81      0.81      1244

fold 0, score: 0.876
fold 1, score: 0.886
fold 2, score: 0.852
fold 3, score: 0.890
fold 4, score: 0.886
fold 5, score: 0.876
fold 6, score: 0.872
fold 7, score: 0.869
fold 8, score: 0.876
fold 9, score: 0.900

Accuracy Score: 0.891

Confusion Matrix
[[521  38]
 [ 97 588]]

Classification Report
             precision    recall  f1-score   support

          0       0.84      0.93      0.89       559
          1       0.94      0.86      0.90

  if diff:


fold 0, score: 0.893


  if diff:


fold 1, score: 0.928


  if diff:


fold 2, score: 0.893


  if diff:


fold 3, score: 0.945


  if diff:


fold 4, score: 0.941


  if diff:


fold 5, score: 0.941


  if diff:


fold 6, score: 0.910


  if diff:


fold 7, score: 0.941


  if diff:


fold 8, score: 0.928


  if diff:
  if diff:


fold 9, score: 0.917

Accuracy Score: 0.913

Confusion Matrix
[[557  47]
 [ 61 579]]

Classification Report
             precision    recall  f1-score   support

          0       0.90      0.92      0.91       604
          1       0.92      0.90      0.91       640

avg / total       0.91      0.91      0.91      1244

fold 0, score: 0.831
fold 1, score: 0.862
fold 2, score: 0.852
fold 3, score: 0.869
fold 4, score: 0.845
fold 5, score: 0.876
fold 6, score: 0.872
fold 7, score: 0.866
fold 8, score: 0.855
fold 9, score: 0.886

Accuracy Score: 0.847

Confusion Matrix
[[558 130]
 [ 60 496]]

Classification Report
             precision    recall  f1-score   support

          0       0.90      0.81      0.85       688
          1       0.79      0.89      0.84       556

avg / total       0.85      0.85      0.85      1244





(2900, 2048) (1244, 2048) (2900,) (1244,)
fold 0, score: 0.866
fold 1, score: 0.838
fold 2, score: 0.855
fold 3, score: 0.883
fold 4, score: 0.883
fold 5, score: 0.852
fold 6, score: 0.855
fold 7, score: 0.859
fold 8, score: 0.848
fold 9, score: 0.886

Accuracy Score: 0.865

Confusion Matrix
[[558 118]
 [ 50 518]]

Classification Report
             precision    recall  f1-score   support

          0       0.92      0.83      0.87       676
          1       0.81      0.91      0.86       568

avg / total       0.87      0.86      0.87      1244

fold 0, score: 0.917
fold 1, score: 0.883
fold 2, score: 0.907
fold 3, score: 0.917
fold 4, score: 0.917
fold 5, score: 0.914
fold 6, score: 0.914
fold 7, score: 0.924
fold 8, score: 0.886
fold 9, score: 0.890

Accuracy Score: 0.912

Confusion Matrix
[[529  31]
 [ 79 605]]

Classification Report
             precision    recall  f1-score   support

          0       0.87      0.94      0.91       560
          1       0.95      0.88      0.92

  if diff:


fold 0, score: 0.931


  if diff:


fold 1, score: 0.917


  if diff:


fold 2, score: 0.897


  if diff:


fold 3, score: 0.921


  if diff:


fold 4, score: 0.931


  if diff:


fold 5, score: 0.897


  if diff:


fold 6, score: 0.903


  if diff:


fold 7, score: 0.945


  if diff:


fold 8, score: 0.903


  if diff:
  if diff:


fold 9, score: 0.921

Accuracy Score: 0.916

Confusion Matrix
[[572  69]
 [ 36 567]]

Classification Report
             precision    recall  f1-score   support

          0       0.94      0.89      0.92       641
          1       0.89      0.94      0.92       603

avg / total       0.92      0.92      0.92      1244

fold 0, score: 0.852
fold 1, score: 0.848
fold 2, score: 0.797
fold 3, score: 0.810
fold 4, score: 0.834
fold 5, score: 0.807
fold 6, score: 0.831
fold 7, score: 0.838
fold 8, score: 0.807
fold 9, score: 0.876

Accuracy Score: 0.811

Confusion Matrix
[[559 186]
 [ 49 450]]

Classification Report
             precision    recall  f1-score   support

          0       0.92      0.75      0.83       745
          1       0.71      0.90      0.79       499

avg / total       0.83      0.81      0.81      1244





(2900, 2048) (1244, 2048) (2900,) (1244,)
fold 0, score: 0.838
fold 1, score: 0.855
fold 2, score: 0.866
fold 3, score: 0.862
fold 4, score: 0.862
fold 5, score: 0.852
fold 6, score: 0.845
fold 7, score: 0.821
fold 8, score: 0.869
fold 9, score: 0.866

Accuracy Score: 0.862

Confusion Matrix
[[553  97]
 [ 75 519]]

Classification Report
             precision    recall  f1-score   support

          0       0.88      0.85      0.87       650
          1       0.84      0.87      0.86       594

avg / total       0.86      0.86      0.86      1244

fold 0, score: 0.890
fold 1, score: 0.883
fold 2, score: 0.893
fold 3, score: 0.907
fold 4, score: 0.872
fold 5, score: 0.897
fold 6, score: 0.910
fold 7, score: 0.872
fold 8, score: 0.886
fold 9, score: 0.876

Accuracy Score: 0.875

Confusion Matrix
[[514  41]
 [114 575]]

Classification Report
             precision    recall  f1-score   support

          0       0.82      0.93      0.87       555
          1       0.93      0.83      0.88

  if diff:


fold 0, score: 0.921


  if diff:


fold 1, score: 0.948


  if diff:


fold 2, score: 0.931


  if diff:


fold 3, score: 0.945


  if diff:


fold 4, score: 0.931


  if diff:


fold 5, score: 0.948


  if diff:


fold 6, score: 0.941


  if diff:


fold 7, score: 0.934


  if diff:


fold 8, score: 0.938


  if diff:
  if diff:


fold 9, score: 0.962

Accuracy Score: 0.949

Confusion Matrix
[[598  33]
 [ 30 583]]

Classification Report
             precision    recall  f1-score   support

          0       0.95      0.95      0.95       631
          1       0.95      0.95      0.95       613

avg / total       0.95      0.95      0.95      1244

fold 0, score: 0.883
fold 1, score: 0.869
fold 2, score: 0.876
fold 3, score: 0.848
fold 4, score: 0.838
fold 5, score: 0.838
fold 6, score: 0.841
fold 7, score: 0.814
fold 8, score: 0.845
fold 9, score: 0.866

Accuracy Score: 0.863

Confusion Matrix
[[571 114]
 [ 57 502]]

Classification Report
             precision    recall  f1-score   support

          0       0.91      0.83      0.87       685
          1       0.81      0.90      0.85       559

avg / total       0.87      0.86      0.86      1244





(2900, 2048) (1244, 2048) (2900,) (1244,)
fold 0, score: 0.872
fold 1, score: 0.869
fold 2, score: 0.841
fold 3, score: 0.859
fold 4, score: 0.834
fold 5, score: 0.886
fold 6, score: 0.845
fold 7, score: 0.848
fold 8, score: 0.834
fold 9, score: 0.848

Accuracy Score: 0.861

Confusion Matrix
[[569 118]
 [ 55 502]]

Classification Report
             precision    recall  f1-score   support

          0       0.91      0.83      0.87       687
          1       0.81      0.90      0.85       557

avg / total       0.87      0.86      0.86      1244

fold 0, score: 0.941
fold 1, score: 0.948
fold 2, score: 0.928
fold 3, score: 0.928
fold 4, score: 0.955
fold 5, score: 0.945
fold 6, score: 0.924
fold 7, score: 0.903
fold 8, score: 0.934
fold 9, score: 0.928

Accuracy Score: 0.921

Confusion Matrix
[[563  37]
 [ 61 583]]

Classification Report
             precision    recall  f1-score   support

          0       0.90      0.94      0.92       600
          1       0.94      0.91      0.92

  if diff:


fold 0, score: 0.941


  if diff:


fold 1, score: 0.921


  if diff:


fold 2, score: 0.910


  if diff:


fold 3, score: 0.890


  if diff:


fold 4, score: 0.934


  if diff:


fold 5, score: 0.938


  if diff:


fold 6, score: 0.931


  if diff:


fold 7, score: 0.917


  if diff:


fold 8, score: 0.914


  if diff:
  if diff:


fold 9, score: 0.921

Accuracy Score: 0.924

Confusion Matrix
[[583  53]
 [ 41 567]]

Classification Report
             precision    recall  f1-score   support

          0       0.93      0.92      0.93       636
          1       0.91      0.93      0.92       608

avg / total       0.92      0.92      0.92      1244

fold 0, score: 0.848


In [None]:
# VGG-19
# In [11]:
VECTOR_FILE = os.path.join(DATA_DIR, "vgg19-vectors.tsv")
Xtrain, Xtest, ytrain, ytest = preprocess_data(VECTOR_FILE)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)
# (2900, 4096) (1244, 4096) (2900,) (1244,)
# Naive Bayes
# In [12]:
clf = GaussianNB()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[1, 0] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "vgg19", "nb"))


# SVM
# In [13]:
clf = LinearSVC()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[1, 1] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "vgg19", "svm"))

# XGBoost
# In [14]:
clf = XGBClassifier()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[1, 2] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "vgg19", "xgb"))


# Random Forest
# In [15]:
clf = RandomForestClassifier()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[1, 3] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "vgg19", "rf"))

In [6]:
# Inception V3
# In [16]:
VECTOR_FILE = os.path.join(DATA_DIR, "inception-vectors.tsv")
Xtrain, Xtest, ytrain, ytest = preprocess_data(VECTOR_FILE)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)
# (2900, 2048) (1244, 2048) (2900,) (1244,)
# Naive Bayes
# In [17]:
clf = GaussianNB()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[2, 0] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "inceptionv3", "nb"))


# SVM
# In [18]:
clf = LinearSVC()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[2, 1] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "inceptionv3", "svm"))


# XGBoost
# In [19]:
clf = XGBClassifier()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[2, 2] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "inceptionv3", "xgb"))

# Random Forest
# In [20]:
clf = RandomForestClassifier()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[2, 3] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "inceptionv3", "rf"))



(2900, 2048) (1244, 2048) (2900,) (1244,)
fold 0, score: 0.821
fold 1, score: 0.879
fold 2, score: 0.821
fold 3, score: 0.852
fold 4, score: 0.828
fold 5, score: 0.866
fold 6, score: 0.879
fold 7, score: 0.838
fold 8, score: 0.852
fold 9, score: 0.876

Accuracy Score: 0.873

Confusion Matrix
[[575 104]
 [ 54 511]]

Classification Report
             precision    recall  f1-score   support

          0       0.91      0.85      0.88       679
          1       0.83      0.90      0.87       565

avg / total       0.88      0.87      0.87      1244

fold 0, score: 0.893
fold 1, score: 0.924
fold 2, score: 0.903
fold 3, score: 0.903
fold 4, score: 0.900
fold 5, score: 0.914
fold 6, score: 0.921
fold 7, score: 0.890
fold 8, score: 0.886
fold 9, score: 0.897

Accuracy Score: 0.912

Confusion Matrix
[[560  40]
 [ 69 575]]

Classification Report
             precision    recall  f1-score   support

          0       0.89      0.93      0.91       600
          1       0.93      0.89      0.91

  if diff:


fold 0, score: 0.907


  if diff:


fold 1, score: 0.934


  if diff:


fold 2, score: 0.907


  if diff:


fold 3, score: 0.938


  if diff:


fold 4, score: 0.928


  if diff:


fold 5, score: 0.931


  if diff:


fold 6, score: 0.959


  if diff:


fold 7, score: 0.893


  if diff:


fold 8, score: 0.907


  if diff:
  if diff:


fold 9, score: 0.872

Accuracy Score: 0.916

Confusion Matrix
[[588  63]
 [ 41 552]]

Classification Report
             precision    recall  f1-score   support

          0       0.93      0.90      0.92       651
          1       0.90      0.93      0.91       593

avg / total       0.92      0.92      0.92      1244

fold 0, score: 0.831
fold 1, score: 0.831
fold 2, score: 0.821
fold 3, score: 0.848
fold 4, score: 0.855
fold 5, score: 0.879
fold 6, score: 0.862
fold 7, score: 0.838
fold 8, score: 0.817
fold 9, score: 0.817

Accuracy Score: 0.834

Confusion Matrix
[[582 160]
 [ 47 455]]

Classification Report
             precision    recall  f1-score   support

          0       0.93      0.78      0.85       742
          1       0.74      0.91      0.81       502

avg / total       0.85      0.83      0.84      1244



In [None]:
# ResNet 50
# In [21]:
VECTOR_FILE = os.path.join(DATA_DIR, "resnet-vectors.tsv")
Xtrain, Xtest, ytrain, ytest = preprocess_data(VECTOR_FILE)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)
# (2900, 2048) (1244, 2048) (2900,) (1244,)
# Naive Bayes
# In [22]:
clf = GaussianNB()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[3, 0] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "resnet50", "nb"))


# SVM
# In [23]:
clf = LinearSVC()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[3, 1] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "resnet50", "svm"))


# XGBoost
# In [24]:
clf = XGBClassifier()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[3, 2] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "resnet50", "xgb"))


# Random Forest
# In [25]:
clf = RandomForestClassifier()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[3, 3] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "resnet50", "rf"))

In [None]:
# Xception
# In [26]:
VECTOR_FILE = os.path.join(DATA_DIR, "xception-vectors.tsv")
Xtrain, Xtest, ytrain, ytest = preprocess_data(VECTOR_FILE)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)
# (2900, 2048) (1244, 2048) (2900,) (1244,)
# Naive Bayes
# In [27]:
clf = GaussianNB()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[4, 0] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "xception", "nb"))


# SVM
# In [28]:
clf = LinearSVC()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[4, 1] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "xception", "svm"))


# XGBoost
# In [29]:
clf = XGBClassifier()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[4, 2] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "xception", "xgb"))


# Random Forest
# In [30]:
clf = RandomForestClassifier()
best_clf, best_score = cross_validate(Xtrain, ytrain, clf)
scores[4, 3] = best_score
test_report(best_clf, Xtest, ytest)
save_model(best_clf, get_model_file(DATA_DIR, "xception", "rf"))

In [None]:
# In [31]:
scores
# Out[31]:

# In [32]:
width=0.15
plt.bar(np.arange(NUM_CLASSIFIERS), scores[0], width, color="r", label="VGG-16")
plt.bar(np.arange(NUM_CLASSIFIERS)+width, scores[1], width, color="b", label="VGG-19")
plt.bar(np.arange(NUM_CLASSIFIERS)+2*width, scores[2], width, color="g", label="Inception-V3")
plt.bar(np.arange(NUM_CLASSIFIERS)+3*width, scores[3], width, color="y", label="ResNet-50")
plt.bar(np.arange(NUM_CLASSIFIERS)+4*width, scores[4], width, color="cyan", label="XCeption")
plt.legend(loc=4)
plt.ylabel("accuracy")
plt.xticks(np.arange(NUM_CLASSIFIERS)+2*width, ["NaiveBayes", "SvM", "XGBoost", "RandomForest"],
          rotation=30)
plt.title("squared difference of vectors")