# Product SVD in Python


In this notebook the reader finds code to read a GeoTiff file, single- or multi-band, from HDFS. It reads the GeoTiff as a **ByteArray** and then stores the GeoTiff in memory using **MemoryFile** from the **RasterIO** Python package. Subsequently, the Python module _productsvd_ is used to compute the SVD of the product of two phenology datasets.

## Dependencies

In [None]:
# Extend the module search path with the Spark, py4j, system and
# JupyterHub module directories. This must run before the imports
# further down in this cell.
import sys
sys.path.append("/usr/lib/spark/python")
sys.path.append("/usr/lib/spark/python/lib/py4j-0.10.4-src.zip")
sys.path.append("/usr/lib/python3/dist-packages")
sys.path.append("/data/local/jupyterhub/modules/python")

# Define environment variables for this process.
# NOTE(review): presumably these are read by Hadoop/PySpark when the
# SparkContext is created later in the notebook - confirm.
import os
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_PYTHON"] = "python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "ipython"

import subprocess

#Load PySpark to connect to a Spark cluster
from pyspark import SparkConf, SparkContext
from hdfs import InsecureClient
from tempfile import TemporaryFile

#from osgeo import gdal
#To read GeoTiffs as a ByteArray
from io import BytesIO
from rasterio.io import MemoryFile

import numpy as np
import pandas
import datetime
import matplotlib.pyplot as plt
import rasterio
from rasterio import plot
from os import listdir
from os.path import isfile, join
from numpy import exp, log
from numpy.random import standard_normal
from scipy.linalg import norm, qr, svd
from productsvd import qrproductsvd
from sklearn.utils.extmath import randomized_svd

## Configuration

In [None]:
# Global switch read by dprint() and progressBar(): when it is falsy
# those helpers produce no output.
debugMode = True

## Connect to Spark

In [None]:
# Name under which the application registers, and the URL of the Spark master.
appName = "plot_GeoTiff"
masterURL="spark://pheno0.phenovari-utwente.surf-hosted.nl:7077"

# If a context named `sc` already exists (e.g. the cell is re-run) it is
# stopped first; a NameError means `sc` was never defined.
try:
    sc.stop()
except NameError:
    print("A new Spark Context will be created.")

# Create the context used by the rest of the notebook and keep its configuration.
sc = SparkContext(conf = SparkConf().setAppName(appName).setMaster(masterURL))
conf = sc.getConf()

## Support functions

In [None]:
def dprint(msg):
    """Print msg, but only while the global debugMode flag is truthy."""
    if not debugMode:
        return
    print(msg)

In [None]:
def get_hdfs_client():
    """Return an InsecureClient for pheno0 (port 50070), acting as user "pheno" with root "/"."""
    address = "pheno0.phenovari-utwente.surf-hosted.nl:50070"
    client = InsecureClient(address, user="pheno", root="/")
    return client

In [None]:
def progressBar(message, value, endvalue, bar_length = 20):
    """Redraw a one-line text progress bar on stdout.

    Nothing is written unless the global debugMode flag is truthy.

    Args:
        message: label printed in front of the bar.
        value: current progress count.
        endvalue: count at which the bar is complete; a newline is emitted
            once value equals it.
        bar_length: width of the bar in characters.
    """
    if not debugMode:
        return

    fraction = float(value) / endvalue
    # The bar is a run of dashes ending in '>', padded with blanks to bar_length.
    dashCount = int(round(fraction * bar_length) - 1)
    bar = '-' * dashCount + '>'
    bar = bar + ' ' * (bar_length - len(bar))
    percentage = int(round(fraction * 100))

    # The leading carriage return makes each call overwrite the previous bar.
    sys.stdout.write("\r" + message + ": [" + bar + "] " + str(percentage) + "%")
    if value == endvalue:
        sys.stdout.write("\n")
    sys.stdout.flush()

In [None]:
def getDataSet(directoryPath):
    """Load the first band of every .tif file in a directory into one matrix.

    Each file becomes one column (its first band, flattened); each row is one
    pixel. Rows that contain a NaN in any column are removed.

    Args:
        directoryPath: directory passed to sc.binaryFiles with "/*.tif" appended.

    Returns:
        A tuple (data, rowIndex, maxDimension, plotShape):
        data is a numpy array holding the rows without NaN,
        rowIndex holds the original row labels of those rows,
        maxDimension is the larger dimension of the unfiltered table,
        plotShape is the 2-D shape of the first band of the first file.

    Raises:
        ValueError: if no .tif file is found, or if the files do not all have
            the same band shape.
    """
    dprint("-------------------------------")
    dprint("Running getDataSet(directoryPath)")
    dprint("Start time: " + str(datetime.datetime.now()))
    dprint("-------------------------------")
    dprint("directoryPath: " + directoryPath)
    dprint("-------------------------------")
    files = sc.binaryFiles(directoryPath + "/*.tif")
    fileList = files.keys().collect()
    dprint("Number of files: " + str(len(fileList)))
    if not fileList:
        # Without this the function would only fail at plotShapes[0] below.
        raise ValueError("No .tif files found in " + directoryPath)
    dataArray = []
    plotShapes = []
    for i, f in enumerate(fileList):
        progressBar("Reading files", i + 1, len(fileList))
        # NOTE(review): lookup() is issued once per file; if the data fits on
        # the driver, fetching all files in one pass may be cheaper - confirm.
        data = files.lookup(f)
        # Context managers close the dataset and the in-memory file even
        # when reading raises.
        with MemoryFile(bytearray(data[0])) as memfile:
            with memfile.open() as dataset:
                relevantBand = np.array(dataset.read()[0])
        plotShapes.append(relevantBand.shape)
        dataArray.append(relevantBand.flatten())
    # All rasters must share one shape; otherwise pandas would pad the shorter
    # columns with NaN and pixels would no longer line up between files.
    if len(set(plotShapes)) > 1:
        raise ValueError("GeoTiff files in " + directoryPath
                         + " do not all have the same shape: "
                         + str(sorted(set(plotShapes))))
    # Pandas stores each vector as a row, so transpose to get one column per file.
    dataSet = pandas.DataFrame(dataArray).T
    maxDimension = max(dataSet.shape)
    # Drop every pixel row with a NaN. The filter runs on the data columns
    # only, so no helper column can count as a valid value.
    dataSetWithoutNan = dataSet.dropna(axis = 0, how = "any")
    dataSetIndex = dataSetWithoutNan.index
    dataSetWithoutIndex = np.array(dataSetWithoutNan)
    dprint("-------------------------------")
    dprint("End time: " + str(datetime.datetime.now()))
    dprint("Ending getDataSet(directoryPath)")
    dprint("-------------------------------")
    return dataSetWithoutIndex, dataSetIndex, maxDimension, plotShapes[0]

In [None]:
def normDifferenceUpToSign(vector1, vector2):
    """Return the distance between two vectors, ignoring an overall sign flip.

    Necessary because the algorithm sometimes gives back the negative of the
    expected result. The smaller of ||vector1 - vector2|| and
    ||vector1 + vector2|| is returned, so the result does not depend on the
    scale of the vectors.

    Args:
        vector1: first vector (numpy array).
        vector2: second vector, same shape as vector1.

    Returns:
        min(norm(vector1 - vector2), norm(vector1 + vector2)).
    """
    return min(norm(vector1 - vector2), norm(vector1 + vector2))

In [None]:
def validateNorms(dataSet1, dataSet2, U, s, V, tolerance = 1e-10):
    """Check that every singular triplet satisfies the product-SVD relations.

    For each i the vectors
        dataSet1 @ dataSet2.T @ V[:, i] / s[i]   and
        dataSet2 @ dataSet1.T @ U[:, i] / s[i]
    are compared, up to sign, with U[:, i] and V[:, i].

    Args:
        dataSet1: first data matrix.
        dataSet2: second data matrix.
        U: matrix whose columns are the left singular vectors.
        s: sequence of singular values.
        V: matrix whose columns are the right singular vectors.
        tolerance: largest deviation that still counts as valid.

    Returns:
        True if the largest deviation is below tolerance (also when s is
        empty), otherwise False.
    """
    length = len(s)
    norms = []
    for i in range(length):
        progressBar("Validating norms", i + 1, length)
        u = dataSet1 @ (dataSet2.T @ V.T[i]) / s[i]
        v = dataSet2 @ (dataSet1.T @ U.T[i]) / s[i]
        norms.append(normDifferenceUpToSign(U.T[i], u))
        norms.append(normDifferenceUpToSign(V.T[i], v))
    # The threshold is the float 1e-10. The previous expression 10^-10 is a
    # bitwise XOR that evaluates to -4, which made the check always fail.
    return max(norms, default = 0.0) < tolerance

In [None]:
def writeMode(resultDir, fileName, i, U, s, V):
    """Write mode i to a text file and upload it to resultDir.

    The file has three lines: column i of U, the singular value s[i] and
    column i of V, each comma-separated. It is written to /tmp first, copied
    with `hadoop dfs -copyFromLocal -f`, and the local copy is removed
    afterwards, also when writing or uploading fails.

    Args:
        resultDir: destination directory; fileName is appended to it directly.
        fileName: name of the file, used both locally and remotely.
        i: index of the mode to write.
        U: matrix whose columns are the left singular vectors.
        s: array of singular values.
        V: matrix whose columns are the right singular vectors.

    Raises:
        subprocess.CalledProcessError: if the upload command exits with a
            non-zero status.
    """
    inFile = "/tmp/" + fileName
    outFile = resultDir + fileName

    try:
        with open(inFile, "w") as decompositionFile:
            U.T[i].tofile(decompositionFile, sep = ",")
        # The remaining lines are written through a second handle opened in
        # append mode, the same two-step sequence the notebook always used.
        with open(inFile, "a") as decompositionFile:
            decompositionFile.write("\n")
            s[i].tofile(decompositionFile, sep = ",")
            decompositionFile.write("\n")
            V.T[i].tofile(decompositionFile, sep = ",")

        #Upload to HDFS; check = True surfaces a failed upload instead of
        #silently losing the result.
        subprocess.run(['hadoop', 'dfs', '-copyFromLocal', '-f', inFile, outFile], check = True)
    finally:
        #Remove from /tmp/
        if os.path.exists(inFile):
            os.remove(inFile)

In [None]:
def writeCSV(resultDir, fileName, res):
    """Write the transpose of res as comma-separated text and upload it to resultDir.

    The file is written to /tmp first, copied with
    `hadoop dfs -copyFromLocal -f`, and the local copy is removed afterwards,
    also when writing or uploading fails.

    Args:
        resultDir: destination directory; fileName is appended to it directly.
        fileName: name of the file, used both locally and remotely.
        res: numpy array to write.

    Raises:
        subprocess.CalledProcessError: if the upload command exits with a
            non-zero status.
    """
    inFile = "/tmp/" + fileName
    outFile = resultDir + fileName

    try:
        with open(inFile, "w") as decompositionFile:
            res.T.tofile(decompositionFile, sep = ",")

        #Upload to HDFS; check = True surfaces a failed upload instead of
        #silently losing the result.
        subprocess.run(['hadoop', 'dfs', '-copyFromLocal', '-f', inFile, outFile], check = True)
    finally:
        #Remove from /tmp/
        if os.path.exists(inFile):
            os.remove(inFile)

In [None]:
def plotMode(singularVector, shape):
    """Show a singular vector as an image with a horizontal colour bar.

    Args:
        singularVector: flat vector with one value per pixel; may contain NaN
            for pixels that have no value.
        shape: 2-D shape the vector is reshaped to before it is drawn
            transposed.
    """
    data = np.reshape(singularVector, shape)
    plt.figure(1)
    plt.imshow(data.T, cmap = 'YlGn')
    plt.colorbar(orientation = 'horizontal')
    # NaN-aware limits: plain min/max return NaN as soon as one pixel is NaN,
    # which would leave the colour scale undefined.
    plt.clim(float(np.nanmin(data)), float(np.nanmax(data)))
    plt.axis('off')
    plt.show()

In [None]:
def runTest(dataDirectory1, dataDirectory2, resultDirectory):
    """Run the product SVD on two GeoTiff directories and store and plot the modes.

    Both directories are loaded with getDataSet, qrproductsvd is applied to
    the two matrices, the result is checked with validateNorms, and every
    mode is written to resultDirectory (with and without NaN padding) and
    plotted. Finally U, s and V are written as CSV files.

    Args:
        dataDirectory1: directory with the .tif files of the first dataset.
        dataDirectory2: directory with the .tif files of the second dataset.
        resultDirectory: directory the result files are uploaded to; file
            names are appended to it directly.
    """
    dprint("-------------------------------")
    dprint("Running runTest(dataDirectory1, dataDirectory2, resultDirectory)")
    dprint("Start time: " + str(datetime.datetime.now()))
    dprint("-------------------------------")
    dprint("dataDirectory1: " + dataDirectory1)
    dprint("dataDirectory2: " + dataDirectory2)
    dprint("resultDirectory: " + resultDirectory)
    dprint("-------------------------------")

    dataSet1, dataSetIndex1, maxDimension1, plotShape1 = getDataSet(dataDirectory1)
    dataSet2, dataSetIndex2, maxDimension2, plotShape2 = getDataSet(dataDirectory2)
    dprint("dataSet1.shape: " + str(dataSet1.shape))
    dprint("dataSet2.shape: " + str(dataSet2.shape))
    minDimension = min(min(dataSet1.shape), min(dataSet2.shape))
    U, s, Vt = qrproductsvd(dataSet1, dataSet2)
    V = Vt.T
    new_index1 = pandas.Index(range(maxDimension1), name = "index")
    new_index2 = pandas.Index(range(maxDimension2), name = "index")
    # Row k of U (V) belongs to the k-th row that survived NaN filtering, so
    # label the rows with the surviving original indices first and only then
    # expand to the full range; dropped pixels become NaN.
    UWithNan = np.array(pandas.DataFrame(U, index = dataSetIndex1).reindex(new_index1))
    VWithNan = np.array(pandas.DataFrame(V, index = dataSetIndex2).reindex(new_index2))
    dprint("U.shape: " + str(U.shape))
    dprint("s.shape: " + str(s.shape))
    dprint("V.shape: " + str(V.shape))
    dprint("UWithNan.shape: " + str(UWithNan.shape))
    dprint("VWithNan.shape: " + str(VWithNan.shape))
    dprint("Singular values of product: ")
    dprint(s)
    dprint("U.T[0][:minDimension]: ")
    dprint(U.T[0][:minDimension])
    dprint("V.T[0][:minDimension]: ")
    dprint(V.T[0][:minDimension])
    validNorms = validateNorms(dataSet1, dataSet2, U, s, V)
    dprint("Valid norms: " + str(validNorms))
    for i in range(len(s)):
        # Modes are numbered from 01 in the file names.
        iString = str(i + 1).zfill(2)
        writeMode(resultDirectory, "ModeWithoutNan" + iString + ".txt", i, U, s, V)
        writeMode(resultDirectory, "ModeWithNan" + iString + ".txt", i, UWithNan, s, VWithNan)
        plotMode(UWithNan.T[i], plotShape1)
    writeCSV(resultDirectory, "U.csv", U)
    writeCSV(resultDirectory, "s.csv", s)
    writeCSV(resultDirectory, "V.csv", V)

    dprint("-------------------------------")
    dprint("Ending test")
    dprint("End time: " + str(datetime.datetime.now()))
    dprint("-------------------------------")

## Tests

### Test 1

In [None]:
# Test 1: product SVD of the BloomGridmet and LeafGridmet datasets.
dprint("-------------------------------")
dprint("Running test 1")
dprint("Start time: " + str(datetime.datetime.now()))
dprint("-------------------------------")

# Input directories and the directory the results are uploaded to.
dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomGridmet/"
dataDirectory2 = "hdfs:///user/hadoop/spring-index/LeafGridmet/"
resultDirectory = "hdfs:///user/pheno/svd/BloomGridmetLeafGridmet/"

#Create Result dir (the exit status is not checked, so a failing mkdir
#does not stop the cell)
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

dprint("-------------------------------")
dprint("Ending test 1")
dprint("End time: " + str(datetime.datetime.now()))
dprint("-------------------------------")

### Test 2

In [None]:
# Test 2: product SVD of the BloomFinalLow and LeafFinalLow datasets.
dprint("-------------------------------")
dprint("Running test 2")
dprint("Start time: " + str(datetime.datetime.now()))
dprint("-------------------------------")

# Input directories and the directory the results are uploaded to.
dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomFinalLow/"
dataDirectory2 = "hdfs:///user/hadoop/spring-index/LeafFinalLow/"
resultDirectory = "hdfs:///user/pheno/svd/BloomFinalLowLeafFinalLow/"

#Create Result dir (the exit status is not checked, so a failing mkdir
#does not stop the cell)
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

dprint("-------------------------------")
dprint("Ending test 2")
dprint("End time: " + str(datetime.datetime.now()))
dprint("-------------------------------")

### Test 3

In [None]:
# Test 3: product SVD of the BloomFinalLow and SOSTLow datasets.
dprint("-------------------------------")
dprint("Running test 3")
dprint("Start time: " + str(datetime.datetime.now()))
dprint("-------------------------------")

# Input directories and the directory the results are uploaded to.
dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomFinalLow/"
dataDirectory2 = "hdfs:///user/hadoop/avhrr/SOSTLow/"
resultDirectory = "hdfs:///user/pheno/svd/BloomFinalLowSOSTLow/"

#Create Result dir (the exit status is not checked, so a failing mkdir
#does not stop the cell)
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

dprint("-------------------------------")
dprint("Ending test 3")
dprint("End time: " + str(datetime.datetime.now()))
dprint("-------------------------------")

### Test 4

In [None]:
# Test 4: product SVD of the LeafFinalLow and SOSTLow datasets.
dprint("-------------------------------")
dprint("Running test 4")
dprint("Start time: " + str(datetime.datetime.now()))
dprint("-------------------------------")

# Input directories and the directory the results are uploaded to.
dataDirectory1 = "hdfs:///user/hadoop/spring-index/LeafFinalLow/"
dataDirectory2 = "hdfs:///user/hadoop/avhrr/SOSTLow/"
resultDirectory = "hdfs:///user/pheno/svd/LeafFinalLowSOSTLow/"

#Create Result dir (the exit status is not checked, so a failing mkdir
#does not stop the cell)
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

dprint("-------------------------------")
dprint("Ending test 4")
dprint("End time: " + str(datetime.datetime.now()))
dprint("-------------------------------")

### Test 5

In [None]:
# Test 5: product SVD of the BloomFinalLowPR and SOSTLowPR datasets.
dprint("-------------------------------")
dprint("Running test 5")
dprint("Start time: " + str(datetime.datetime.now()))
dprint("-------------------------------")

# Input directories and the directory the results are uploaded to.
# NOTE(review): dataDirectory1 reads BloomFinalLowPR, but the result
# directory is named LeafFinalLowPRSOSTLowPR - one of the two looks like a
# copy/paste slip; confirm which dataset this test is meant to use.
dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomFinalLowPR/"
dataDirectory2 = "hdfs:///user/hadoop/avhrr/SOSTLowPR/"
resultDirectory = "hdfs:///user/pheno/svd/LeafFinalLowPRSOSTLowPR/"

#Create Result dir (the exit status is not checked, so a failing mkdir
#does not stop the cell)
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

dprint("-------------------------------")
dprint("Ending test 5")
dprint("End time: " + str(datetime.datetime.now()))
dprint("-------------------------------")