In [1]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark import SparkConf

from pyspark.mllib.linalg import Vectors,DenseVector
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg.distributed import RowMatrix
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [2]:
SparkContext.setSystemProperty('spark.executor.memory', '6g')
sc = SparkContext("local", "App Name")

In [3]:
class SVD(JavaModelWrapper):
    """Wrapper around the SVD scala case class"""
    @property
    def U(self):
        """ Returns a RowMatrix whose columns are the left singular vectors of the SVD if computeU was set to be True."""
        u = self.call("U")
        if u is not None:
        	return RowMatrix(u)

    @property
    def s(self):
        """Returns a DenseVector with singular values in descending order."""
        return self.call("s")

    @property
    def V(self):
        """ Returns a DenseMatrix whose columns are the right singular vectors of the SVD."""
        return self.call("V")


In [4]:
def computeSVD(row_matrix, k, computeU=False, rCond=1e-9):
    """
    Computes the singular value decomposition of the RowMatrix.
    The given row matrix A of dimension (m X n) is decomposed into U * s * V'T where
    * s: DenseVector consisting of square root of the eigenvalues (singular values) in descending order.
    * U: (m X k) (left singular vectors) is a RowMatrix whose columns are the eigenvectors of (A X A')
    * v: (n X k) (right singular vectors) is a Matrix whose columns are the eigenvectors of (A' X A)
    :param k: number of singular values to keep. We might return less than k if there are numerically zero singular values.
    :param computeU: Whether of not to compute U. If set to be True, then U is computed by A * V * sigma^-1
    :param rCond: the reciprocal condition number. All singular values smaller than rCond * sigma(0) are treated as zero, where sigma(0) is the largest singular value.
    :returns: SVD object
    """
    java_model = row_matrix._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond))
    return SVD(java_model)

def pre_process(line):
    return ' '.join([stemmer.stem(word) for word in word_tokenize(line) if word not in stopwords.words('english')])

In [14]:
def topDocsInTopConcepts(svd, numConcepts, numDocs, docIds):
    u = svd.U
    for i in range(numConcepts):
        docWeights = u.rows.map(lambda x: x.toArray[i])
    return docWeights

In [9]:
k = 200
stemmer = SnowballStemmer('english')    
documents = sc.textFile("anarchism.txt").map(pre_process)
hashingTF = HashingTF(numFeatures=500)
tf = hashingTF.transform(documents)
idf = IDF().fit(tf)
tfidf = idf.transform(tf).cache()

In [10]:
mat = RowMatrix(tfidf)
svd = computeSVD(mat,k,True)

In [15]:
topDocsInTopConcepts(svd, len())

TypeError: len() takes exactly one argument (0 given)