# SVD Python


In this notebook the reader finds code to read a GeoTiff file, single- or multi-band, from HDFS. It reads the GeoTiff as a **ByteArray** and then stores the GeoTiff in memory using **MemoryFile** from the **rasterio** Python package. SciPy is then used to compute the SVD of the matrix product of two phenology products.

With this example the user can load GeoTiffs from HDFS and then explore all the features of Python packages such as [rasterio](https://github.com/mapbox/rasterio).

## Dependencies

In [1]:
#Make the Spark, py4j and system-wide Python packages importable
import sys
sys.path.extend([
    "/usr/lib/spark/python",
    "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip",
    "/usr/lib/python3/dist-packages",
])

#Environment variables for the Hadoop configuration and the PySpark interpreters
import os
os.environ.update({
    "HADOOP_CONF_DIR": "/etc/hadoop/conf",
    "PYSPARK_PYTHON": "python3",
    "PYSPARK_DRIVER_PYTHON": "ipython",
})

#Load PySpark to connect to a Spark cluster
from pyspark import SparkConf, SparkContext

#from osgeo import gdal
#To read GeoTiffs as a ByteArray
from io import BytesIO
from rasterio.io import MemoryFile

import numpy as np
import pandas
import datetime
import matplotlib.pyplot as plt
import rasterio
from rasterio import plot
from os import listdir
from os.path import isfile, join
from numpy import exp, log
from numpy.random import standard_normal
from scipy.linalg import norm, qr, svd
#from lowrankproduct import lowrankproduct
from sklearn.utils.extmath import randomized_svd

## Connect to Spark

In [2]:
appName = "plot_GeoTiff"
masterURL = "spark://pheno0.phenovari-utwente.surf-hosted.nl:7077"

#Stop the context left over from an earlier run of this cell. On the first
#run the name sc is not bound yet, which raises NameError.
try:
    sc.stop()
except NameError:
    print("A new Spark Context will be created.")

sc = SparkContext(
    conf=(
        SparkConf()
        .setMaster(masterURL)
        .setAppName(appName)
    )
)

A new Spark Context will be created.


# Configuration

In [3]:
#HDFS directories with the GeoTiffs of the two products. Each directory is
#passed to getDataSet() further down.
dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomFinalLow/"
dataDirectory2 = "hdfs:///user/hadoop/avhrr/SOSTLow/"

## Read GeoTiffs

In [None]:
def getDataSet(directoryPath):
    """Load the first band of every GeoTiff in a directory as one matrix.

    Every file found under ``directoryPath`` is read through the global
    SparkContext ``sc``, opened in memory with rasterio, and its first band
    is flattened into a vector. The vectors become the columns of a table
    with one row per flattened cell position. Rows that contain a missing
    value (NaN) in any column are removed.

    Parameters
    ----------
    directoryPath : str
        Path handed to ``sc.binaryFiles``.

    Returns
    -------
    tuple
        ``(data, rowIndex, maxDimension)``: ``data`` is a NumPy array with
        the rows that passed the NaN filter (one column per file),
        ``rowIndex`` is the pandas index of those rows in the unfiltered
        table, and ``maxDimension`` is the larger of the two dimensions of
        the unfiltered table.
    """
    files = sc.binaryFiles(directoryPath)
    dataArray = []
    #Stream the (path, content) pairs to the driver in a single pass. Looking
    #up every key separately starts a new Spark job for each file.
    for _, content in files.toLocalIterator():
        #The context managers release the in-memory file and the dataset,
        #also when a file cannot be decoded.
        with MemoryFile(bytearray(content)) as memfile:
            with memfile.open() as dataset:
                #Only the first band is used, so only that band is read
                relevantBand = np.array(dataset.read(1))
        dataArray.append(relevantBand.flatten())

    #DataFrame() stores each vector as a row; transpose to get one column per file
    dataSet = pandas.DataFrame(dataArray).T
    print(dataSet.shape)
    maxDimension = max(dataSet.shape)
    #Keep complete rows only. The filter runs on the data columns themselves:
    #counting a helper index column towards a threshold would let rows with
    #one missing value through.
    dataSetWithoutNan = dataSet.dropna(axis = 0, how = "any")
    dataSetIndex = dataSetWithoutNan.index
    return np.array(dataSetWithoutNan), dataSetIndex, maxDimension

In [None]:
#Load both products. Each call returns the NaN-filtered data matrix, the index
#of the rows that were kept and the largest dimension of the unfiltered table.
dataSet1, dataSetIndex1, maxDimension1 = getDataSet(dataDirectory1)
dataSet2, dataSetIndex2, maxDimension2 = getDataSet(dataDirectory2)
print("dataSet1.shape: " + str(dataSet1.shape))
print("dataSet2.shape: " + str(dataSet2.shape))

# SVD

In [None]:
#Matrix product of the first data set with the transpose of the second. The
#result has one row per row of dataSet1 and one column per row of dataSet2.
#Both data sets need the same number of columns for the product to be defined.
fullProduct = dataSet1 @ dataSet2.T

In [None]:
#Show the dimensions of the product matrix
fullProduct.shape

In [None]:
#The rank of the product cannot exceed the smallest dimension of either data
#set, so that is the number of components requested.
minDimension = min(dataSet1.shape + dataSet2.shape)
#Pin the seed so that rerunning the cell gives the same factors; the default
#of random_state has changed between scikit-learn releases.
randU, randS, randVt = randomized_svd(fullProduct, n_components=minDimension, random_state=0)

In [None]:
#SVD of the product matrix with scipy.linalg.svd. With full_matrices = True
#normU and normVt are returned as square matrices; the commented-out call
#requests the reduced factors instead.
#normU, normS, normVt = svd(fullProduct, full_matrices = False)
normU, normS, normVt = svd(fullProduct, full_matrices = True)