# Clustering and Viz Experimentation

In [1]:
import rasterio
import numpy as np
from rasterio.plot import adjust_band
import matplotlib.pyplot as plt
from rasterio.plot import reshape_as_raster, reshape_as_image
from rasterio.plot import show
from rasterio.windows import Window
from pyproj import Proj, transform
import random
import math
import itertools
import os
import sys

# Put the absolute path of the local 'rcnn/' directory on sys.path (once)
# before the project-module imports that follow.
module_path = os.path.abspath('rcnn/')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
import utilities as util
import importlib
import rnn_tiles
import rnn_pixels

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import sklearn.metrics as metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression,Lasso

In [2]:
# Re-execute the project modules in place so that edits made to their source
# files since the first import are picked up without restarting the kernel.
importlib.reload(rnn_pixels)
importlib.reload(rnn_tiles)
# Last expression of the cell: the module object it returns is what the notebook displays.
importlib.reload(util)

<module 'utilities' from '/host/Code/florence_mapping/utilities.py'>

### Importing Datasets

In [3]:
# Label rasters: NLCD 2011 land cover and NLCD 2011 canopy.
lc_labels = rasterio.open('/deep_data/recurrent_data/NLCD_DATA/landcover/NLCD_2011_Land_Cover_L48_20190424.img')
canopy_labels = rasterio.open('/deep_data/recurrent_data/NLCD_DATA/canopy/CONUSCartographic_2_8_16/Cartographic/nlcd2011_usfs_conus_canopy_cartographic.img')

class_dict = util.indexed_dictionary

# Tile id -> date strings of the combined scenes to open for that tile.
# For a quick single-date run, replace each list with ['20110831'].
tiles = {
    '028012': ['20110324', '20110612', '20110831', '20111103'],
    '029011': ['20110308', '20110425', '20110831', '20111103'],
    '028011': ['20110308', '20110628', '20110831', '20111103'],
}

# Open every combined scene, keeping one list of open datasets per tile
# in the same order as the tile's date list.
landsat_datasets = {}
for tile_number, dates in tiles.items():
    l8_image_paths = [
        '/deep_data/recurrent_data/tile{}/combined/combined{}.tif'.format(tile_number, date)
        for date in dates
    ]
    tile_datasets = [rasterio.open(fp) for fp in l8_image_paths]
    landsat_datasets[tile_number] = tile_datasets

# Sampling configuration passed to rnn_pixels.balanced_pix_data in the next cell.
tile_size = 5
tile_list = list(tiles.keys())
class_count = len(class_dict)
clean_pixels_count = 3000000
max_count_per_class = 1001

### Training Data 

In [None]:
# Build the balanced pixel sample from the opened rasters. Arguments are kept
# positional (in the original order) because the callee's parameter names are
# not visible from this notebook.
(sk_data, sk_labels, class_count_dict) = rnn_pixels.balanced_pix_data(
    landsat_datasets,
    lc_labels,
    canopy_labels,
    tile_size,
    tile_list,
    clean_pixels_count,
    class_count,
    max_count_per_class,
    class_dict,
    buffer_pix=1,
)

# Cell output: both array shapes plus the returned class_count_dict.
(sk_data.shape, sk_labels.shape, class_count_dict)

Beginning balanced data creation.


### Testing Data

In [None]:
def sk_train_test_split(max_per_test_class, max_per_train_class, sk_data, sk_labels, classes=None):
    """Split samples into per-class-capped test and train sets.

    Samples are visited in their existing order (nothing is shuffled). The
    class of a sample is the first element of its label row. A sample goes to
    the test set until that class holds ``max_per_test_class`` test samples,
    then to the train set until the class holds ``max_per_train_class`` train
    samples; any further samples of that class are dropped.

    Parameters
    ----------
    max_per_test_class : int
        Maximum number of test samples kept per class.
    max_per_train_class : int
        Maximum number of train samples kept per class.
    sk_data : sequence
        One entry (feature row) per sample.
    sk_labels : sequence
        One label row per sample, aligned with ``sk_data``; element 0 of each
        row is the class key.
    classes : iterable, optional
        Class keys to track. Defaults to the keys of the notebook-level
        ``class_dict`` so existing four-argument calls behave as before.

    Returns
    -------
    tuple
        ``(train_data, train_labels, test_data, test_labels,
        train_class_count_dict, test_class_count_dict)`` where the first four
        items are NumPy arrays and the last two map class key -> kept count.

    Raises
    ------
    ValueError
        If ``sk_data`` and ``sk_labels`` have different lengths.
    KeyError
        If a label row names a class that is not in ``classes``.
    """
    if classes is None:
        classes = class_dict
    if len(sk_data) != len(sk_labels):
        raise ValueError(
            'sk_data and sk_labels must have the same length, got {} and {}'.format(
                len(sk_data), len(sk_labels)))

    # Every known class starts at zero so the returned counts always list all of them.
    test_class_count_dict = {key: 0 for key in classes}
    train_class_count_dict = {key: 0 for key in classes}

    # Testing
    sk_data_test = []
    sk_labels_test = []

    # Training
    sk_data_train = []
    sk_labels_train = []

    for pop_data, pop_label in zip(sk_data, sk_labels):
        pop_class = pop_label[0]
        # The test set is filled first; only the overflow reaches the train set.
        if test_class_count_dict[pop_class] < max_per_test_class:
            sk_data_test.append(pop_data)
            sk_labels_test.append(pop_label)
            test_class_count_dict[pop_class] += 1
        elif train_class_count_dict[pop_class] < max_per_train_class:
            sk_data_train.append(pop_data)
            sk_labels_train.append(pop_label)
            train_class_count_dict[pop_class] += 1

    return (np.array(sk_data_train), np.array(sk_labels_train),
            np.array(sk_data_test), np.array(sk_labels_test),
            train_class_count_dict, test_class_count_dict)



In [None]:
# Per-class caps for the baseline split used by the model cells below.
max_per_test_class = 100
max_per_train_class = 300

(sk_data_train, sk_labels_train,
 sk_data_test, sk_labels_test,
 train_class_count_dict, test_class_count_dict) = sk_train_test_split(
    max_per_test_class, max_per_train_class, sk_data, sk_labels)

# Per-class counts first, then the shapes of the four resulting arrays,
# one item per output line.
print(
    test_class_count_dict,
    train_class_count_dict,
    sk_data_train.shape,
    sk_data_test.shape,
    sk_labels_train.shape,
    sk_labels_test.shape,
    sep='\n',
)


In [None]:
# Display the class mapping: its keys are the class indices counted by the
# split above, and its values are used as plot labels later in the notebook.
class_dict

## Testing Training-Set Size (SVM Accuracy vs. Training Samples per Class)

In [None]:

clean_pixels_count_train = 2000000

# x values (train cap per class) and the matching SVM test accuracies.
max_count_per = []
scores = []

# NOTE(review): this loop rebinds the notebook-level split variables
# (sk_data_train, sk_labels_train, tiles_train, ...). Cells run afterwards
# therefore see the split from the final iteration (890 train samples per
# class), not the 300-per-class split created earlier.
max_per_test_class = 100
for x in range(10, 900, 10):
    max_count_per.append(x)

    # Re-split with the current train cap; the test cap stays fixed.
    max_per_train_class = x
    (sk_data_train, sk_labels_train,
     sk_data_test, sk_labels_test,
     train_class_count_dict, test_class_count_dict) = sk_train_test_split(
        max_per_test_class, max_per_train_class, sk_data, sk_labels)

    # Features, and the two label columns (column 0 -> landcover_*, column 1 -> canopy_*).
    tiles_train, tiles_test = sk_data_train, sk_data_test
    landcover_train, canopy_train = sk_labels_train[:, 0], sk_labels_train[:, 1]
    landcover_test, canopy_test = sk_labels_test[:, 0], sk_labels_test[:, 1]

    # Default SVC scored on the test set.
    clf = svm.SVC()
    clf.fit(tiles_train, landcover_train.astype('int'))
    pred_clf = clf.predict(tiles_test)
    scores.append(accuracy_score(landcover_test.astype('int'), pred_clf))

In [None]:
%matplotlib inline 
plt.figure(1);
plt.plot(max_count_per,scores)
plt.xlabel('Size of Test Data')
plt.ylabel('Testing Accuracy')
plt.title('0 to 1000')

In [None]:
# Data Prep
# Features are used as-is; label column 0 feeds the landcover_* targets and
# label column 1 feeds the canopy_* targets.
tiles_train, tiles_test = sk_data_train, sk_data_test
landcover_train, canopy_train = sk_labels_train[:, 0], sk_labels_train[:, 1]
landcover_test, canopy_test = sk_labels_test[:, 0], sk_labels_test[:, 1]



## SVM (Landcover)

In [None]:
# SVM with default settings; land-cover labels are cast to int for fitting and scoring.
clf = svm.SVC().fit(tiles_train, landcover_train.astype('int'))
pred_clf = clf.predict(tiles_test)
print('Classification Report',
      classification_report(landcover_test.astype('int'), pred_clf),
      sep='\n')
print('Confusion Matrix',
      confusion_matrix(landcover_test.astype('int'), pred_clf),
      sep='\n')

## RFC (Landcover)

In [None]:
# Random forest classifier on the land-cover labels (cast to int).
# random_state is pinned so repeated runs of the notebook produce the same
# forest and therefore the same report; the value itself is arbitrary.
rfc = RandomForestClassifier(n_estimators = 350, random_state = 0)
rfc.fit(tiles_train,landcover_train.astype('int'))
pred_rfc = rfc.predict(tiles_test)
print('Classification Report')
print(classification_report(landcover_test.astype('int'), pred_rfc))
print('Confusion Matrix')
print(confusion_matrix(landcover_test.astype('int'), pred_rfc))


## KNN (Landcover)

In [None]:
# k-nearest-neighbours (k = 23) on the land-cover labels, cast to int.
knn = KNeighborsClassifier(n_neighbors=23).fit(tiles_train, landcover_train.astype('int'))
pred_knn = knn.predict(tiles_test)
print('Classification Report',
      classification_report(landcover_test.astype('int'), pred_knn),
      sep='\n')
print('Confusion Matrix',
      confusion_matrix(landcover_test.astype('int'), pred_knn),
      sep='\n')

## Linear Regression (Canopy)

In [None]:
LR = LinearRegression()
LR.fit(tiles_train,canopy_train)
predLR = LR.predict(tiles_test)

print('Mean Absolute Error:',metrics.mean_absolute_error(canopy_test, predLR))
print('Mean Squared Error:',metrics.mean_squared_error(canopy_test, predLR))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(canopy_test,predLR)))

%matplotlib inline 
plt.figure(1);
plt.scatter(canopy_test, predLR, alpha = .1)
plt.plot(canopy_test, canopy_test,color = 'r')
plt.xlabel('Canopy Values')
plt.ylabel('Predicted Values')
plt.title('Linear Regression')

## Lasso (Canopy)

In [None]:
las= Lasso(alpha = 0.05)
las.fit(tiles_train,canopy_train)
predLas = las.predict(tiles_test)


print('Mean Absolute Error:',metrics.mean_absolute_error(canopy_test, predLas))
print('Mean Squared Error:',metrics.mean_squared_error(canopy_test, predLas))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(canopy_test, predLas)))

%matplotlib inline 
plt.figure(1);
plt.scatter(canopy_test, predLas)
plt.plot(canopy_test, canopy_test,color = 'r')
plt.xlabel('Canopy Values')
plt.ylabel('Predicted Values')
plt.title('Lasso')

## Random Forest Regression (Canopy)

In [None]:
rfr = RandomForestRegressor()
rfr.fit(tiles_train,canopy_train)
pred_rfr = rfr.predict(tiles_test)

print('Mean Absolute Error:',metrics.mean_absolute_error(canopy_test, pred_rfr))
print('Mean Squared Error:',metrics.mean_squared_error(canopy_test, pred_rfr))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(canopy_test,pred_rfr)))

%matplotlib inline 
plt.figure(1);
plt.scatter(canopy_test, pred_rfr, alpha = .1)
plt.plot(canopy_test, canopy_test,color = 'r')
plt.xlabel('Canopy Values')
plt.ylabel('Predicted Values')
plt.title('Random Forest Regression')

## Spectral Signatures

Landsat 5 band specifications from USGS: https://www.usgs.gov/land-resources/nli/landsat/landsat-5

In [None]:
fig, ax = plt.subplots(1,1, figsize=[8,8])

# One x position per feature column of sk_data, numbered from 1.
band_count = np.arange(1, sk_data.shape[1] + 1)

# sk_labels carries two columns per sample (the notebook reads column 0 as
# land cover and column 1 as canopy), so the class mask has to be built from
# column 0 only; comparing the whole 2-D array does not give a valid row mask.
landcover_all = sk_labels[:, 0]

# Plot the mean feature vector of each class as one line.
for class_index in class_dict:
    class_mask = landcover_all == class_index
    if not class_mask.any():
        # No samples for this class: skip it instead of plotting a NaN mean.
        continue
    band_intensity = np.mean(sk_data[class_mask, :], axis=0)
    ax.plot(band_count, band_intensity, label=class_dict[class_index])

# Add some axis labels
ax.set_xlabel('Band #')
ax.set_ylabel('Reflectance Value')
# Add a title
ax.set_title('Band Intensities Full Overview')
ax.legend(loc='upper left')


In [None]:
# Shape check: view the flat feature columns as 4 groups of 7 values per sample
# (presumably 4 dates x 7 bands — TODO confirm the column ordering against rnn_pixels).
sk_data.reshape(-1,4,7).shape

In [None]:
fig, ax = plt.subplots(1,1, figsize=[8,8])

# x positions 1-7: one per value within a time step.
band_count = np.arange(1,8)

# Number of time steps per sample, taken from the date list of one tile
# (every tile in `tiles` lists the same number of dates).
time_steps = len(tiles['028012'])

# sk_labels carries two columns per sample (the notebook reads column 0 as
# land cover and column 1 as canopy), so the class mask has to be built from
# column 0 only; comparing the whole 2-D array does not give a valid row mask.
landcover_all = sk_labels[:, 0]

for class_index in class_dict:
    class_mask = landcover_all == class_index
    if not class_mask.any():
        # No samples for this class: skip it instead of plotting a NaN mean.
        continue
    # Reshape each sample to (time_steps, 7) and average over samples and time steps.
    band_intensity = np.mean(sk_data[class_mask, :].reshape(-1,time_steps,7), axis=(0,1))
    ax.plot(band_count, band_intensity, label=class_dict[class_index])

# Add some axis labels
ax.set_xlabel('Band #')
ax.set_ylabel('Reflectance Value')
# Add a title (distinct from the all-features figure above it in the notebook)
ax.set_title('Band Intensities Averaged Over Time Steps')
ax.legend(loc='upper left')


### Dendrogram

In [None]:
from scipy.cluster import hierarchy

# sk_labels carries two columns per sample (the notebook reads column 0 as
# land cover and column 1 as canopy), so the class mask has to be built from
# column 0 only; comparing the whole 2-D array does not give a valid row mask.
landcover_all = sk_labels[:, 0]

# Mean feature vector per class. Names are collected alongside the means so
# the dendrogram labels stay aligned even when a class has no samples and is skipped.
image_avgs = []
dendrogram_labels = []
for class_index in class_dict:
    class_mask = landcover_all == class_index
    if not class_mask.any():
        # An empty class would contribute a NaN row, which linkage cannot cluster.
        continue
    image_avgs.append(np.mean(sk_data[class_mask, :], axis=0))
    dendrogram_labels.append(class_dict[class_index])

ytdist = np.array(image_avgs)

# Single-linkage hierarchical clustering of the class means.
Z = hierarchy.linkage(ytdist, 'single')
plt.figure(figsize=(10,10))
dn = hierarchy.dendrogram(Z, labels=dendrogram_labels)

### PCA and Clustering

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

# Project every sample onto the first three principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(sk_data)

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

# A DataFrame column must be 1-D. sk_labels carries two columns per sample
# (the notebook reads column 0 as land cover), so only column 0 is stored as
# the class label 'y' used to colour the plots below.
df = pd.DataFrame({'pca-one':pca_result[:,0],'pca-two':pca_result[:,1],'pca-three':pca_result[:,2], 'y' : sk_labels[:, 0]})

In [None]:
# First two principal components, coloured by class label.
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="pca-one", y="pca-two",
    hue="y",
    # One colour per distinct value of the plotted hue column. Counting unique
    # values over all of sk_labels would also count its second (canopy) column.
    palette=sns.color_palette("hls", df["y"].nunique()),
    data=df,
    legend="full",
    alpha=0.3
)

In [None]:
# 3-D scatter of the three principal components, coloured by class label.
# Figure.gca() no longer accepts a `projection` keyword in current Matplotlib,
# so the 3-D axes are created explicitly with add_subplot instead.
fig = plt.figure(figsize=(16,10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    xs=df["pca-one"], 
    ys=df["pca-two"], 
    zs=df["pca-three"], 
    c=df["y"], 
    cmap='tab10'
)
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()

In [None]:
from time import time
from sklearn.manifold import TSNE

# t-SNE is expensive, so only the first N samples are embedded.
N = 10000
data_subset = sk_data[:N, :]

time_start = time()
# random_state is pinned so the embedding is reproducible between runs (the
# value itself is arbitrary). The iteration count is left at the library
# default of 1000: the keyword for it was renamed between scikit-learn
# releases, so omitting it keeps this cell working on old and new versions.
tsne = TSNE(n_components=2, verbose=1, perplexity=50, random_state=0)
tsne_results = tsne.fit_transform(data_subset)
print('t-SNE done! Time elapsed: {} seconds'.format(time()-time_start))

In [None]:
# Take the rows that were embedded (index labels 0 .. N-1, inclusive) and
# attach the two t-SNE coordinates as new columns; assign() returns a new
# frame, so df itself is left untouched.
df_subset = df.loc[:N-1, :].assign(**{
    'tsne-2d-one': tsne_results[:, 0],
    'tsne-2d-two': tsne_results[:, 1],
})

In [None]:
# t-SNE embedding of the subset, coloured by class label.
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="y",
    # One colour per distinct label actually present in the plotted subset.
    # Counting unique values over all of sk_labels would also count its second
    # (canopy) column and any class missing from the subset.
    palette=sns.color_palette("hls", df_subset["y"].nunique()),
    data=df_subset,
    legend="full",
    alpha=0.3
)