In [225]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier, LinearRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

sys.path.append("../Library/")
import image_download as imd
import image_manipulation as ima
import db_connection as dbcon
import IO as io
import machine_learning as ml

#%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Parameters

In [58]:
# IMAGE PARAMETERS
SIZE = 1000 # in pixels
BASE_RESOLUTION = 0.3 # in meter
LABELS = [0, 1, 2]

# THESE ARE ONLY APPROXIMATE -->  integer(SIZE/DEGRADED_RESOLUTION)
DEGRADED_RESOLUTIONS = [0.6, 1, 2, 3, 5, 10, 15, 20, 30] # in meter

DEGRADED_SIZES = [SIZE/(res/BASE_RESOLUTION) for res in DEGRADED_RESOLUTIONS]
print("Degraded sizes are", DEGRADED_SIZES, "pixels. This is rounded to the nearest integer!")
# THESE ARE ONLY APPROXIMATE -->  integer(SIZE/DEGRADED_RESOLUTION)

# FOLDER PARAMETERS
GDRIVE_FOLDER = os.getenv('GDRIVE_FOLDER')
RAW_IMAGE_FOLDER = GDRIVE_FOLDER + 'MFP - Satellogic/images/raw_images_usgs_0.3m/'
PROCESSED_IMAGE_FOLDER = RAW_IMAGE_FOLDER + 'processed/'
MFP_IMG_FOLDER = GDRIVE_FOLDER + 'MFP - Satellogic/images/'
CATEGORIES = ['agriculture', 'shrubland-grassland', 'city', 'forest-woodland', 'semi-desert']

# Compute more parameters
params = {'size': SIZE, 'res': BASE_RESOLUTION, 'res_degr': DEGRADED_RESOLUTIONS}
subfolder_size = MFP_IMG_FOLDER + 'usgs_' + str(SIZE) + "/"
subfolder_base_res = subfolder_size + "usgs_" + str(SIZE) + "_" + str(BASE_RESOLUTION) + "m/"

Degraded sizes are [500.0, 300.0, 150.0, 100.0, 59.99999999999999, 29.999999999999996, 20.0, 14.999999999999998, 10.0] pixels. This is rounded to the nearest integer!


### DB connection

In [86]:
db = dbcon.connect("../credentials/mlab_db.txt", "mfp")
images_usgs_col = db["images_lib_usgs"]

DB Credentials from file
DB connected successfully!!!


### Load csv with labelled images

In [87]:
df_labels = pd.DataFrame()
for category in CATEGORIES:
    df_labels = df_labels.append(pd.read_csv(subfolder_base_res + "labels-" + category + ".csv"))                               

### Query GIST

In [134]:
GISTs = ['gist_' + factor for factor in ['0.3', '0_6', '1', '2', '3', '5', '10', '15', '20', '30']]
DATASET = 'usgs_res0.3m_size1000'

query = {
    "$and" : [
        {"dataset":  DATASET},
        {"gist_0_6": {"$exists": True} }
    ]  
}
images_metadata = [img_metadata for img_metadata in images_usgs_col.find(query)]
print(len(images_metadata), "query matches")

2858 query matches


### Create DataFrame from GIST

In [135]:
def create_df_from_metadata_usgs(metadata, columns):
    metadata = [{ column: data[column] for column in columns } for data in metadata]
    df = pd.DataFrame(metadata)
    return df

df_gist = create_df_from_metadata_usgs(images_metadata, ['filename'] + GISTs)

### Join GIST with Labels

In [136]:
df = df_gist.merge(df_labels, on = "filename")

### Prepare data for training classifier 

In [251]:
def df_array(df, gist):
    return np.array([row for row in df[gist]])

def get_cross_val_accuracy(df, classifier, parameters):

    dict_out = {}
    dict_out['gist'] = GISTs
    dict_out['accuracy'] = []
    dict_out['std'] = []
    for gist in GISTs:
        _, _, mean, std = ml.cross_validate_clf(df_array(df, gist), df.label, classifier, parameters)
        dict_out['accuracy'].append(mean)
        dict_out['std'].append(std)
    return pd.DataFrame(dict_out)

def get_confusion_matrix(df, gist_string, test_size, clf, params):
    X_train, X_test, y_train, y_test = train_test_split(df_array(df, gist_string), df.label, test_size = test_size)
    clf = clf(**params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return metrics.confusion_matrix(y_pred, y_test)
    

### Multi Class Classification

In [255]:
params_svm = {
    'multi_class': "crammer_singer",
    'nfold': 10,
    'C': 1
    }

get_cross_val_accuracy(df, LinearSVC, params_svm)

Unnamed: 0,gist,accuracy,std
0,gist_0.3,0.702662,0.107876
1,gist_0_6,0.576727,0.112886
2,gist_1,0.577579,0.106249
3,gist_2,0.549686,0.121911
4,gist_3,0.548842,0.121216
5,gist_5,0.542087,0.119606
6,gist_10,0.549712,0.09976
7,gist_15,0.559022,0.083039
8,gist_20,0.568327,0.071653
9,gist_30,0.589475,0.043527


In [256]:
# confusion matrix for base resolution
get_confusion_matrix(df, 'gist_0.3', 0.3, LinearSVC, params_svm)

array([[ 52,  10,   4],
       [  0,   0,   0],
       [ 38,  34, 218]])

### Binary Classification encoding 1 as 0

In [268]:
def zero_encoding(x):
    if x == 0: return 0
    if x == 1: return 0
    if x == 2: return 1
    
def two_encoding(x):
    if x == 0: return 0
    if x == 1: return 1
    if x == 2: return 1
    
def convert_encoding(y, encoding):
    y = [encoding(el) for el in y]
    return y
    
X0 = df.copy()
X0.label = convert_encoding(X0.label, zero_encoding)

params_svm = {
    'nfold': 10,
    'C': 1 
}

get_cross_val_accuracy(X0, LinearSVC, params_svm)

Unnamed: 0,gist,accuracy,std
0,gist_0.3,0.746756,0.225347
1,gist_0_6,0.624852,0.13189
2,gist_1,0.629915,0.136681
3,gist_2,0.645974,0.125334
4,gist_3,0.64092,0.112455
5,gist_5,0.635055,0.081383
6,gist_10,0.613134,0.055843
7,gist_15,0.622427,0.041066
8,gist_20,0.628349,0.036265
9,gist_30,0.633404,0.04421


In [273]:
# confusion matrix for base resolution
get_confusion_matrix(X0, 'gist_0.3', 0.2, LinearSVC, params_svm)

array([[ 47,  17],
       [ 34, 139]])

In [274]:
# confusion matrix for base resolution
get_confusion_matrix(X0, 'gist_0_6', 0.2, LinearSVC, params_svm)

array([[ 60,  18],
       [ 39, 120]])

### Binary Classification encoding 1 as 2

In [275]:
X2 = df.copy()
X2.label = convert_encoding(X2.label, two_encoding)

params_svm = {
    'nfold': 10,
    'C': 1 
}

get_cross_val_accuracy(X2, LinearSVC, params_svm)

Unnamed: 0,gist,accuracy,std
0,gist_0.3,0.805291,0.145574
1,gist_0_6,0.76935,0.066025
2,gist_1,0.776947,0.068394
3,gist_2,0.713577,0.088262
4,gist_3,0.700056,0.094889
5,gist_5,0.709366,0.078161
6,gist_10,0.716145,0.059523
7,gist_15,0.710227,0.064311
8,gist_20,0.718685,0.054171
9,gist_30,0.717837,0.05713


In [276]:
# confusion matrix for base resolution
get_confusion_matrix(X2, 'gist_0.3', 0.2, LinearSVC, params_svm)

array([[ 23,  15],
       [ 29, 170]])

In [277]:
# confusion matrix for base resolution
get_confusion_matrix(X2, 'gist_0_6', 0.2, LinearSVC, params_svm)

array([[ 34,  10],
       [ 25, 168]])