In [1]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier, LinearRegression

from sklearn import metrics

sys.path.append("../Library/")
import image_download as imd
import image_manipulation as ima
import db_connection as dbcon

#%matplotlib inline
%load_ext autoreload
%autoreload 2

  from numpy.core.umath_tests import inner1d


In [2]:
# parameters
DATASET = 'gmaps'
IMG_FOLDER = os.getenv('MFP_IMG_FOLDER') + DATASET + "/"


# connect to db
db = dbcon.connect("../credentials/mlab_db.txt","mfp")
images_lib_col = db["images_lib"]

DB Credentials from file
DB connected successfully!!!


## Load labelled images with GIST computed

In [3]:
query = {
    "$and" : [
        {"dataset":DATASET },
        {"gist": {"$exists": True} },
        {"label_multi_er": {"$exists": True} }
    ]  
}
images_metadata = [img_metadata for img_metadata in images_lib_col.find(query)]
print(len(images_metadata), "query matches")

512 query matches


In [4]:
all_metadata = pd.DataFrame(images_metadata)
print(all_metadata.shape)
all_metadata.head()

(512, 19)


Unnamed: 0,_id,comments,dataset,filename,gist,img_size,label,label_binary_er,label_binary_pw,label_multi_er,label_multi_pw,lat,lon,meters_per_px,name,pixels,saved_dt,url,zoom
0,5c883dda5015d5e57d590bd2,,gmaps,Andorra la Vella_42.5588_1.5956_13_600.png,"[0.0423380583524704, 0.06505296379327774, 0.07...",8445.33947,,1,,6,,42.5588,1.5956,14.075566,Andorra la Vella,600,2019-03-13 00:16:42.553,https://maps.google.com/maps/api/staticmap?key...,13
1,5c883ddb5015d5e57d590bd7,,gmaps,Andorra la Vella_42.5588_1.5956_14_600.png,"[0.05163280665874481, 0.06631343066692352, 0.0...",4222.669735,,1,,7,,42.5588,1.5956,7.037783,Andorra la Vella,600,2019-03-13 00:16:43.001,https://maps.google.com/maps/api/staticmap?key...,14
2,5c883ddb5015d5e57d590bda,,gmaps,Andorra la Vella_42.5588_1.5956_15_600.png,"[0.07334117591381073, 0.06448617577552795, 0.0...",2111.334867,,1,,7,,42.5588,1.5956,3.518891,Andorra la Vella,600,2019-03-13 00:16:43.476,https://maps.google.com/maps/api/staticmap?key...,15
3,5c883ddc5015d5e57d590bdd,,gmaps,Andorra la Vella_42.5588_1.5956_16_600.png,"[0.07424785196781158, 0.07135545462369919, 0.0...",1055.667434,,1,,7,,42.5588,1.5956,1.759446,Andorra la Vella,600,2019-03-13 00:16:43.912,https://maps.google.com/maps/api/staticmap?key...,16
4,5c744a7b3603900f5521fd73,,gmaps,Barcelona_41.45181606138525_2.0916025645537917...,"[0.10248155891895294, 0.09609375149011612, 0.1...",2291.620933,,1,,8,,41.451816,2.091603,3.580658,Barcelona,640,2019-02-25 21:05:15.113,https://maps.google.com/maps/api/staticmap?key...,15


In [5]:
all_metadata = all_metadata[all_metadata["label_multi_er"] != '10'].copy()
print(all_metadata.shape)
all_metadata.head()

(501, 19)


Unnamed: 0,_id,comments,dataset,filename,gist,img_size,label,label_binary_er,label_binary_pw,label_multi_er,label_multi_pw,lat,lon,meters_per_px,name,pixels,saved_dt,url,zoom
0,5c883dda5015d5e57d590bd2,,gmaps,Andorra la Vella_42.5588_1.5956_13_600.png,"[0.0423380583524704, 0.06505296379327774, 0.07...",8445.33947,,1,,6,,42.5588,1.5956,14.075566,Andorra la Vella,600,2019-03-13 00:16:42.553,https://maps.google.com/maps/api/staticmap?key...,13
1,5c883ddb5015d5e57d590bd7,,gmaps,Andorra la Vella_42.5588_1.5956_14_600.png,"[0.05163280665874481, 0.06631343066692352, 0.0...",4222.669735,,1,,7,,42.5588,1.5956,7.037783,Andorra la Vella,600,2019-03-13 00:16:43.001,https://maps.google.com/maps/api/staticmap?key...,14
2,5c883ddb5015d5e57d590bda,,gmaps,Andorra la Vella_42.5588_1.5956_15_600.png,"[0.07334117591381073, 0.06448617577552795, 0.0...",2111.334867,,1,,7,,42.5588,1.5956,3.518891,Andorra la Vella,600,2019-03-13 00:16:43.476,https://maps.google.com/maps/api/staticmap?key...,15
3,5c883ddc5015d5e57d590bdd,,gmaps,Andorra la Vella_42.5588_1.5956_16_600.png,"[0.07424785196781158, 0.07135545462369919, 0.0...",1055.667434,,1,,7,,42.5588,1.5956,1.759446,Andorra la Vella,600,2019-03-13 00:16:43.912,https://maps.google.com/maps/api/staticmap?key...,16
4,5c744a7b3603900f5521fd73,,gmaps,Barcelona_41.45181606138525_2.0916025645537917...,"[0.10248155891895294, 0.09609375149011612, 0.1...",2291.620933,,1,,8,,41.451816,2.091603,3.580658,Barcelona,640,2019-02-25 21:05:15.113,https://maps.google.com/maps/api/staticmap?key...,15


In [6]:
X = pd.DataFrame(
    all_metadata["gist"].values.tolist(), 
    index=all_metadata["filename"]
)
print(X.shape)
X.head()

(501, 960)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,950,951,952,953,954,955,956,957,958,959
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Andorra la Vella_42.5588_1.5956_13_600.png,0.042338,0.065053,0.076905,0.067586,0.025289,0.059068,0.073122,0.072113,0.058878,0.061868,...,0.073852,0.06666,0.071045,0.069527,0.077991,0.064831,0.076635,0.077642,0.065814,0.057197
Andorra la Vella_42.5588_1.5956_14_600.png,0.051633,0.066313,0.061655,0.074285,0.06634,0.06687,0.084149,0.071402,0.065488,0.075183,...,0.079491,0.076265,0.082247,0.08305,0.060888,0.071314,0.06036,0.057672,0.067417,0.080375
Andorra la Vella_42.5588_1.5956_15_600.png,0.073341,0.064486,0.09378,0.082437,0.067744,0.07007,0.093607,0.080887,0.07211,0.056294,...,0.084885,0.064084,0.098096,0.076448,0.058322,0.070702,0.077405,0.081924,0.054292,0.053267
Andorra la Vella_42.5588_1.5956_16_600.png,0.074248,0.071355,0.073486,0.082614,0.047767,0.05261,0.078055,0.09317,0.037689,0.047594,...,0.084673,0.080415,0.070755,0.068472,0.08658,0.051285,0.100472,0.076807,0.074039,0.046782
Barcelona_41.45181606138525_2.0916025645537917_15_640.png,0.102482,0.096094,0.107501,0.115606,0.120622,0.12905,0.156943,0.134135,0.129487,0.151448,...,0.072872,0.099454,0.085735,0.062055,0.075148,0.086189,0.08106,0.073832,0.070125,0.079958


In [7]:
y = all_metadata[["label_multi_er"]]
print(y.shape)
y.head()

(501, 1)


Unnamed: 0,label_multi_er
0,6
1,7
2,7
3,7
4,8


In [8]:
y.label_multi_er.value_counts()

9    209
8     98
0     50
7     42
6     25
2     20
5     18
3     15
4     15
1      9
Name: label_multi_er, dtype: int64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y["label_multi_er"], test_size=0.3, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(350, 960) (350,)
(151, 960) (151,)


In [12]:
#reg = LinearRegression()
#reg.fit(X_train,y_train)
#reg_predict = reg.predict(X_test)
#print("LinearRegressionClassifier:", metrics.accuracy_score(reg_predict, y_test))

knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn_predict = knn.predict(X_test)
print("KNeighborsClassifier:", metrics.accuracy_score(knn_predict, y_test))
print(metrics.confusion_matrix(y_test, knn_predict))
print()

rf = RandomForestClassifier()
rf.fit(X_train,y_train)
rf_predict = rf.predict(X_test)
print("RandomForestClassifier:", metrics.accuracy_score(rf_predict, y_test))
print(metrics.confusion_matrix(y_test, rf_predict))
print()

ab = AdaBoostClassifier()
ab.fit(X_train,y_train)
ab_predict = ab.predict(X_test)
print("AdaBoostClassifier:", metrics.accuracy_score(ab_predict, y_test))
print(metrics.confusion_matrix(y_test, ab_predict))
print()

sgd = SGDClassifier(max_iter=20, tol=None)
sgd.fit(X_train,y_train)
sgd_predict = sgd.predict(X_test)
print("SGDClassifier:", metrics.accuracy_score(sgd_predict, y_test))
print(metrics.confusion_matrix(y_test, sgd_predict))
print()

print("Done")

KNeighborsClassifier: 0.4370860927152318
[[13  0  0  0  0  0  0  0  4  1]
 [ 0  3  0  0  0  0  0  1  0  0]
 [ 2  0  1  1  0  0  0  0  1  4]
 [ 0  1  0  0  0  0  0  1  2  0]
 [ 0  0  0  0  0  0  0  0  0  4]
 [ 1  0  0  1  1  0  0  0  2  1]
 [ 0  0  0  1  0  1  0  1  3  0]
 [ 1  0  0  0  0  1  0  2  3  3]
 [ 0  1  0  0  1  1  1  4  6 20]
 [ 1  0  0  0  1  0  0  1 12 41]]

RandomForestClassifier: 0.423841059602649
[[12  0  0  0  1  0  1  0  1  3]
 [ 1  0  1  0  0  0  0  1  0  1]
 [ 2  0  0  0  2  0  1  0  3  1]
 [ 2  0  0  0  0  0  0  1  0  1]
 [ 1  0  0  1  0  0  0  0  2  0]
 [ 0  1  0  0  0  0  0  2  1  2]
 [ 1  0  0  0  0  0  2  1  0  2]
 [ 0  0  0  0  0  0  0  2  2  6]
 [ 2  0  0  0  1  2  2  2  6 19]
 [ 1  0  0  0  0  0  1  2 10 42]]

AdaBoostClassifier: 0.2052980132450331
[[ 0  0  0 10  0  1  0  0  2  5]
 [ 0  0  0  0  0  0  0  0  1  3]
 [ 1  0  0  1  1  1  0  0  2  3]
 [ 0  0  0  1  0  1  0  0  0  2]
 [ 1  0  0  1  0  0  1  0  0  1]
 [ 1  0  0  0  0  0  0  0  0  5]
 [ 0  0  0  1  0