### Install keras and tensorflow:

In [0]:
!conda install -c conda-forge keras --yes
!conda install -c conda-forge tensorflow --yes

### Import libraries:

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
import os

from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

### Data import:

In [0]:
folder_path = 'food/'

img_width, img_height = 224, 224

# load all images into a list
file_list = sorted(os.listdir(folder_path))
images = []
for img in file_list:
    if img == '.DS_Store':  # ignore stupid fookin mac file that wont go away
        continue
    img = os.path.join(folder_path, img)
    img = image.load_img(img, target_size=(img_width, img_height))
    img = image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    images.append(img)

# stack up images list to pass for prediction
images = np.vstack(images)

### Load pre-trained model:

In [0]:
model = ResNet50(weights='imagenet')

Instructions for updating:
Colocations handled automatically by placer.


### Predict:

In [0]:
predictions = model.predict(images, batch_size=64, verbose=1)



### Save:

In [0]:
# Save top 3 label probabilities as the 3 features describing image:
predictions_decoded = np.array(decode_predictions(predictions, top=3))
predictions_decoded = predictions_decoded[:, :, 1]

df = pd.DataFrame(predictions_decoded, columns=['first', 'second', 'third'])
print(df)

df.to_csv('food_features', index=False)

           first         second         third
0        thimble      trilobite        packet
1         potpie         bakery         dough
2      ice_cream  mashed_potato         dough
3      ice_cream          dough     meat_loaf
4        burrito        pretzel     carbonara
...          ...            ...           ...
9995   ice_cream     neck_brace      lipstick
9996  strawberry      ice_cream      baseball
9997   meat_loaf          pizza  wooden_spoon
9998   soup_bowl     frying_pan         loupe
9999       dough           wool        mortar

[10000 rows x 3 columns]


# Load food features and one hot encode:

Now we need to load the newly generated features, train a neural net on the triplets with the new features and see what happens:)

Load food features:

In [72]:
df_features = pd.read_csv("food_features") 
print(df_features)

           first         second         third
0        thimble      trilobite        packet
1         potpie         bakery         dough
2      ice_cream  mashed_potato         dough
3      ice_cream          dough     meat_loaf
4        burrito        pretzel     carbonara
...          ...            ...           ...
9995   ice_cream     neck_brace      lipstick
9996  strawberry      ice_cream      baseball
9997   meat_loaf          pizza  wooden_spoon
9998   soup_bowl     frying_pan         loupe
9999       dough           wool        mortar

[10000 rows x 3 columns]


Load triplets:

In [73]:
df_triplets = pd.read_csv('train_triplets.txt', sep=" ", header=None)
print(df_triplets)

          0     1     2
0      2461  3450  2678
1      2299  2499  4987
2      4663  1056  3029
3      4532  1186  1297
4      3454  3809  2204
...     ...   ...   ...
59510   466  2952  2530
59511  2646  3580  2359
59512  3255  4844  4334
59513  2136  4619   161
59514  2509  2552  3406

[59515 rows x 3 columns]


Somewhere around here we need to split into training and validation set. 

The issue is that the validation set triplets and training set triplets should not include any of the same images for the evaluation to be accurate. 

Maybe the easiest way to do this is to take some portion of the data as train set, and then iterate through the rest and if none of the three images are in the train set it goes in the validation set, otherwise if none of the three images are in the validation set it goes in the train set, otherwise it is discarded? Then tune the size of the initial train set to get ish the percent of the split we want? Very sub-optimal...

In [74]:
df_triplets_train = pd.DataFrame(columns = [0, 1, 2])
df_triplets_val = pd.DataFrame(columns = [0, 1, 2])

N = 325
for idx, row in df_triplets.iterrows():
  if idx < N:  # first N rows go in train
    df_triplets_train = df_triplets_train.append(row)
  elif all(x not in df_triplets_train.values for x in [row[0], row[1], row[2]]):  # else if images not in train add to val
    df_triplets_val = df_triplets_val.append(row)
  elif all(x not in df_triplets_val.values for x in [row[0], row[1], row[2]]):  # else if images not in val add to train
    df_triplets_train = df_triplets_train.append(row)
  # else we discard triplet
  
df_triplets_train = df_triplets_train.reset_index(drop=True)
df_triplets_val = df_triplets_val.reset_index(drop=True)

print(df_triplets_train)
print(df_triplets_val)

          0     1     2
0      2461  3450  2678
1      2299  2499  4987
2      4663  1056  3029
3      4532  1186  1297
4      3454  3809  2204
...     ...   ...   ...
15887   450   219  4712
15888   236  1229  4090
15889   880  1818   621
15890   466  2952  2530
15891  2646  3580  2359

[15892 rows x 3 columns]
         0     1     2
0      647  4571  3011
1      683  2848  2039
2     4404  2384  2478
3     1019  4241  4114
4     4108  1020  1523
...    ...   ...   ...
5000  4020  2390  1285
5001  4990  4028  2647
5002  1716  4431  3398
5003  1550  1222  1140
5004  1940    38  2869

[5005 rows x 3 columns]


Problem: all the training data has label 1 right now.

Attempt at solution: swap B and C for every other row, such that the labels are [1 0 1 0 1 ...].

In [75]:
for i, row in df_triplets_train.iterrows():
  if i % 2 == 1:
    temp = row[1]
    df_triplets_train.at[i,1] = row[2]
    df_triplets_train.at[i,2] = temp
print(df_triplets_train)

for i, row in df_triplets_val.iterrows():
  if i % 2 == 1:
    temp = row[1]
    df_triplets_val.at[i,1] = row[2]
    df_triplets_val.at[i,2] = temp
print(df_triplets_val)

          0     1     2
0      2461  3450  2678
1      2299  4987  2499
2      4663  1056  3029
3      4532  1297  1186
4      3454  3809  2204
...     ...   ...   ...
15887   450  4712   219
15888   236  1229  4090
15889   880   621  1818
15890   466  2952  2530
15891  2646  2359  3580

[15892 rows x 3 columns]
         0     1     2
0      647  4571  3011
1      683  2039  2848
2     4404  2384  2478
3     1019  4114  4241
4     4108  1020  1523
...    ...   ...   ...
5000  4020  2390  1285
5001  4990  2647  4028
5002  1716  4431  3398
5003  1550  1140  1222
5004  1940    38  2869

[5005 rows x 3 columns]


In [76]:
df_triplets = pd.concat([df_triplets_train, df_triplets_val])  # Put the two dataframes back on top of each other

df_triplets = df_triplets.reset_index(drop=True)  # Reset index column

print(df_triplets)

          0     1     2
0      2461  3450  2678
1      2299  4987  2499
2      4663  1056  3029
3      4532  1297  1186
4      3454  3809  2204
...     ...   ...   ...
20892  4020  2390  1285
20893  4990  2647  4028
20894  1716  4431  3398
20895  1550  1140  1222
20896  1940    38  2869

[20897 rows x 3 columns]


Generate dataset:

In [84]:
def generate_dataset(df_triplets, df_features):
  #a = np.empty(shape=(df_triplets.shape[0], 9), dtype='object')
  a = np.empty(shape=(df_triplets.shape[0], 3), dtype='object')

  for i in range(3):
    for idx, x in df_triplets[i].iteritems():
      a[idx, i] = df_features.iloc[x, 0]
      #a[idx, 3*i] = df_features.iloc[x, 0]
      #a[idx, 3*i+1] = df_features.iloc[x, 1]
      #a[idx, 3*i+2] = df_features.iloc[x, 2]

  #df_triplets['A_1'], df_triplets['A_2'], df_triplets['A_3'] = [a[:, 0], a[:, 1], a[:, 2]]
  #df_triplets['B_1'], df_triplets['B_2'], df_triplets['B_3'] = [a[:, 3], a[:, 4], a[:, 5]]
  #df_triplets['C_1'], df_triplets['C_2'], df_triplets['C_3'] = [a[:, 6], a[:, 7], a[:, 8]]
  df_triplets['A'] = a[:, 0]
  df_triplets['B'] = a[:, 1]
  df_triplets['C'] = a[:, 2]

  return df_triplets

df_triplets = generate_dataset(df_triplets, df_features)
print(df_triplets)

          0     1     2              A                 B                 C
0      2461  3450  2678      ice_lolly   chocolate_sauce              tray
1      2299  4987  2499  mashed_potato        frying_pan            switch
2      4663  1056  3029         orange         barometer       cauliflower
3      4532  1297  1186        pretzel  hen-of-the-woods           toaster
4      3454  3809  2204          acorn               ant             plate
...     ...   ...   ...            ...               ...               ...
20892  4020  2390  1285          dough         meat_loaf       French_loaf
20893  4990  2647  4028     neck_brace         meat_loaf  hen-of-the-woods
20894  1716  4431  3398      meat_loaf       French_loaf            bolete
20895  1550  1140  1222           corn          bolo_tie         artichoke
20896  1940    38  2869            ear            potpie         guacamole

[20897 rows x 6 columns]


*One*-hot encode:

In [90]:
df_triplets = df_triplets.drop(columns=[0, 1, 2])  # remove original image columns
print(df_triplets)

A    coral_fungus
B           pizza
C          shovel
Name: 15891, dtype: object


In [86]:
df_triplets_oh = pd.get_dummies(data=df_triplets)
print(df_triplets_oh)

       A_African_chameleon  A_African_crocodile  ...  C_zebra  C_zucchini
0                        0                    0  ...        0           0
1                        0                    0  ...        0           0
2                        0                    0  ...        0           0
3                        0                    0  ...        0           0
4                        0                    0  ...        0           0
...                    ...                  ...  ...      ...         ...
20892                    0                    0  ...        0           0
20893                    0                    0  ...        0           0
20894                    0                    0  ...        0           0
20895                    0                    0  ...        0           0
20896                    0                    0  ...        0           0

[20897 rows x 909 columns]


In [93]:
df_triplets_train_oh = df_triplets_oh.iloc[:df_triplets_train.shape[0], :]
df_triplets_val_oh = df_triplets_oh.iloc[df_triplets_train.shape[0]:, :]

print(df_triplets_train_oh)
print(df_triplets_val_oh)

       A_African_chameleon  A_African_crocodile  ...  C_zebra  C_zucchini
0                        0                    0  ...        0           0
1                        0                    0  ...        0           0
2                        0                    0  ...        0           0
3                        0                    0  ...        0           0
4                        0                    0  ...        0           0
...                    ...                  ...  ...      ...         ...
15887                    0                    0  ...        0           0
15888                    0                    0  ...        0           0
15889                    0                    0  ...        0           0
15890                    0                    0  ...        0           0
15891                    0                    0  ...        0           0

[15892 rows x 909 columns]
       A_African_chameleon  A_African_crocodile  ...  C_zebra  C_zucchini
15892     

In [0]:
# Save completed train and validation data sets:
df_triplets_train_oh.to_csv('train', index=False)
df_triplets_val_oh.to_csv('val', index=False)

# Train net:

Load:

In [95]:
x_train = pd.read_csv('train')
print(x_train)

x_val = pd.read_csv('val')
print(x_val)

       A_African_chameleon  A_African_crocodile  ...  C_zebra  C_zucchini
0                        0                    0  ...        0           0
1                        0                    0  ...        0           0
2                        0                    0  ...        0           0
3                        0                    0  ...        0           0
4                        0                    0  ...        0           0
...                    ...                  ...  ...      ...         ...
15887                    0                    0  ...        0           0
15888                    0                    0  ...        0           0
15889                    0                    0  ...        0           0
15890                    0                    0  ...        0           0
15891                    0                    0  ...        0           0

[15892 rows x 909 columns]
      A_African_chameleon  A_African_crocodile  ...  C_zebra  C_zucchini
0          

Generate labels:

In [101]:
y_train = np.empty((x_train.shape[0], 1))
y_train[::2] = 1
y_train[1::2] = 0
print(y_train)

y_val = np.empty((x_val.shape[0], 1))
y_val[::2] = 1
y_val[1::2] = 0
print(y_val)

[[1.]
 [0.]
 [1.]
 ...
 [0.]
 [1.]
 [0.]]
[[1.]
 [0.]
 [1.]
 ...
 [1.]
 [0.]
 [1.]]


Train:

In [106]:
model = Sequential()
model.add(layers.Dense(400, input_dim=x_train.shape[1], activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(400, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(x_train.values, y_train, epochs=30, batch_size=32, shuffle=True, validation_data=(x_val.values, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f0ed2df8588>

Hard: 0.688633839319

Medium: 	0.627444380869

Easy: 	0.578405304433

Evaluate:

In [64]:
eval_results = model.evaluate(x_val, y_val, verbose=0) 
print("\nLoss, accuracy on test data: ")
print("%0.4f %0.2f%%" % (eval_results[0], \
  eval_results[1]*100))

predictions = model.predict_classes(x_val)
print(predictions)


Loss, accuracy on test data: 
2.8106 50.97%
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
[[1]
 [0]
 [1]
 ...
 [0]
 [1]
 [0]]


# Predict on test set:

### TODO: this must be done at the same time as the train data, such that test and train data have the exact same features, otherwise it is not compatable to use the model on the test data!!!

Load test triplets:

In [67]:
df_triplets_test = pd.read_csv('test_triplets.txt', sep=" ", header=None)
print(df_triplets_test)

          0     1     2
0      9896  9640  9177
1      6592  9283  7104
2      8655  6174  6400
3      9223  8187  8678
4      7317  5392  9470
...     ...   ...   ...
59539  6113  8042  6277
59540  8851  6075  8549
59541  6299  7843  7940
59542  7652  5620  5416
59543  8475  6082  9044

[59544 rows x 3 columns]


Generate data set with imagenet features:

In [68]:
df_triplets_test = generate_dataset(df_triplets_test, df_features)
df_triplets_test = df_triplets_test.drop(columns=[0, 1, 2])  # remove original image columns
print(df_triplets_test)

               A_1         A_2  ...              C_2               C_3
0      French_loaf      bakery  ...         bolo_tie          envelope
1          burrito       plate  ...     wooden_spoon  butternut_squash
2             chow     pretzel  ...            dough     custard_apple
3            acorn         ear  ...           mortar       toilet_seat
4           potpie       pizza  ...   Dungeness_crab         rock_crab
...            ...         ...  ...              ...               ...
59539  mixing_bowl   ice_cream  ...        meat_loaf     mashed_potato
59540    ice_cream      banana  ...       Petri_dish          bolo_tie
59541       chiton       conch  ...        soup_bowl               bib
59542        plate   guacamole  ...           hotdog   chocolate_sauce
59543       switch  saltshaker  ...  chocolate_sauce            candle

[59544 rows x 9 columns]


One hot:

In [69]:
df_triplets_test_oh = pd.get_dummies(data=df_triplets_test)
print(df_triplets_test_oh)

       A_1_American_lobster  A_1_Band_Aid  ...  C_3_wreck  C_3_zucchini
0                         0             0  ...          0             0
1                         0             0  ...          0             0
2                         0             0  ...          0             0
3                         0             0  ...          0             0
4                         0             0  ...          0             0
...                     ...           ...  ...        ...           ...
59539                     0             0  ...          0             0
59540                     0             0  ...          0             0
59541                     0             0  ...          0             0
59542                     0             0  ...          0             0
59543                     0             0  ...          0             0

[59544 rows x 3067 columns]


Save:

In [0]:
df_triplets_test_oh.to_csv('test', index=False)

Predict:

In [71]:
predictions = model.predict_classes(df_triplets_test_oh.values)
print(predictions)

ValueError: ignored

Save: