In [80]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier, LinearRegression
from sklearn.svm import LinearSVC
from sklearn import metrics

sys.path.append("../Library/")
import image_download as imd
import image_manipulation as ima
import db_connection as dbcon
import IO as io

#%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Parameters

In [58]:
# IMAGE PARAMETERS
SIZE = 1000 # in pixels
BASE_RESOLUTION = 0.3 # in meter
LABELS = [0, 1, 2]

# THESE ARE ONLY APPROXIMATE -->  integer(SIZE/DEGRADED_RESOLUTION)
DEGRADED_RESOLUTIONS = [0.6, 1, 2, 3, 5, 10, 15, 20, 30] # in meter

DEGRADED_SIZES = [SIZE/(res/BASE_RESOLUTION) for res in DEGRADED_RESOLUTIONS]
print("Degraded sizes are", DEGRADED_SIZES, "pixels. This is rounded to the nearest integer!")
# THESE ARE ONLY APPROXIMATE -->  integer(SIZE/DEGRADED_RESOLUTION)

# FOLDER PARAMETERS
GDRIVE_FOLDER = os.getenv('GDRIVE_FOLDER')
RAW_IMAGE_FOLDER = GDRIVE_FOLDER + 'MFP - Satellogic/images/raw_images_usgs_0.3m/'
PROCESSED_IMAGE_FOLDER = RAW_IMAGE_FOLDER + 'processed/'
MFP_IMG_FOLDER = GDRIVE_FOLDER + 'MFP - Satellogic/images/'
CATEGORIES = ['agriculture', 'shrubland-grassland', 'city', 'forest-woodland', 'semi-desert']

# Compute more parameters
params = {'size': SIZE, 'res': BASE_RESOLUTION, 'res_degr': DEGRADED_RESOLUTIONS}
subfolder_size = MFP_IMG_FOLDER + 'usgs_' + str(SIZE) + "/"
subfolder_base_res = subfolder_size + "usgs_" + str(SIZE) + "_" + str(BASE_RESOLUTION) + "m/"

Degraded sizes are [500.0, 300.0, 150.0, 100.0, 59.99999999999999, 29.999999999999996, 20.0, 14.999999999999998, 10.0] pixels. This is rounded to the nearest integer!


### DB connection

In [86]:
db = dbcon.connect("../credentials/mlab_db.txt", "mfp")
images_usgs_col = db["images_lib_usgs"]

DB Credentials from file
DB connected successfully!!!


### Load csv with labelled images

In [87]:
df_labels = pd.DataFrame()
for category in CATEGORIES:
    df_labels = df_labels.append(pd.read_csv(subfolder_base_res + "labels-" + category + ".csv"))                               

### Query GIST

In [134]:
GISTs = ['gist_' + factor for factor in ['0.3', '0_6', '1', '2', '3', '5', '10', '15', '20', '30']]
DATASET = 'usgs_res0.3m_size1000'

query = {
    "$and" : [
        {"dataset":  DATASET},
        {"gist_0_6": {"$exists": True} }
    ]  
}
images_metadata = [img_metadata for img_metadata in images_usgs_col.find(query)]
print(len(images_metadata), "query matches")

2858 query matches


### Create DataFrame from GIST

In [135]:
def create_df_from_metadata_usgs(metadata, columns):
    metadata = [{ column: data[column] for column in columns } for data in metadata]
    df = pd.DataFrame(metadata)
    return df

df_gist = create_df_from_metadata_usgs(images_metadata, ['filename'] + GISTs)

### Join GIST with Labels

In [136]:
df = df_gist.merge(df_labels, on = "filename")

def df_array(df, gist):
    return np.array([row for row in df[gist]])

### Classification

In [196]:
X_train, X_test, y_train, y_test = train_test_split(df_array(df,'gist_10'), df.label, test_size = 0.2)

In [197]:
clf = LinearSVC(multi_class="crammer_singer")# for multi class labels use multi_class="crammer_singer"

In [198]:
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='crammer_singer', penalty='l2', random_state=None,
     tol=0.0001, verbose=0)

In [199]:
y_pred = clf.predict(X_test)

In [200]:
pd.DataFrame({
        'y_pred': y_pred.tolist(),
        'y_test': y_test
})

metrics.confusion_matrix(y_pred, y_test)

array([[  2,   4,   0],
       [  0,   0,   0],
       [ 51,  32, 148]])

In [201]:
metrics.accuracy_score(y_pred, y_test)

0.6329113924050633