In [None]:
from pyspark.mllib.linalg import Vectors,DenseVector
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg.distributed import RowMatrix
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


In [None]:
class SVD(JavaModelWrapper):
    """Wrapper around the SVD Scala case class.

    Every property forwards to the wrapped JVM object through ``self.call``,
    so each access is a round trip to the JVM.
    """

    @property
    def U(self):
        """Left singular vectors of the decomposition.

        :returns: a RowMatrix whose columns are the left singular vectors,
            or None when the JVM side has no U (i.e. the decomposition was
            computed with computeU set to False).
        """
        u = self.call("U")
        if u is None:
            return None
        return RowMatrix(u)

    @property
    def s(self):
        """Returns a DenseVector with singular values in descending order."""
        return self.call("s")

    @property
    def V(self):
        """Returns a DenseMatrix whose columns are the right singular vectors of the SVD."""
        return self.call("V")

In [None]:
def computeSVD(row_matrix, k, computeU=False, rCond=1e-9):
    """
    Computes the singular value decomposition of the RowMatrix.

    The given row matrix A of dimension (m X n) is decomposed into
    U * diag(s) * V^T where
    * s: DenseVector consisting of square root of the eigenvalues (singular values) in descending order.
    * U: (m X k) (left singular vectors) is a RowMatrix whose columns are the eigenvectors of (A X A')
    * V: (n X k) (right singular vectors) is a Matrix whose columns are the eigenvectors of (A' X A)

    :param row_matrix: the matrix to decompose; it must expose a
        ``_java_matrix_wrapper`` attribute, on which the JVM ``computeSVD``
        method is invoked.
    :param k: number of singular values to keep. We might return less than k if there are numerically zero singular values.
    :param computeU: Whether or not to compute U. If set to be True, then U is computed by A * V * sigma^-1
    :param rCond: the reciprocal condition number. All singular values smaller than rCond * sigma(0) are treated as zero, where sigma(0) is the largest singular value.
    :returns: SVD object
    """
    # Coerce every argument to the exact primitive type before it crosses
    # into the JVM, so truthy/numeric look-alikes (e.g. 1, numpy scalars)
    # resolve to the same Scala method signature as plain int/bool/float.
    java_model = row_matrix._java_matrix_wrapper.call(
        "computeSVD", int(k), bool(computeU), float(rCond)
    )
    return SVD(java_model)

In [None]:
# Number of singular values to request from computeSVD; within this cell it
# is only referenced by the commented-out computeSVD call at the bottom.
k = 1000
# WordNet lemmatizer instance; no active code in this cell uses it yet.
lem = WordNetLemmatizer()

# Built once instead of once per token: stopwords.words() returns a fresh list
# on every call, and a frozenset gives O(1) membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def nomequalquer(line):
    """Strip English stop words from one line of text.

    The line is tokenized with NLTK's ``word_tokenize``; every token that is
    an English stop word is dropped and the remaining tokens are joined back
    together with single spaces.

    :param line: one line (document) of raw text.
    :returns: a string containing the surviving tokens separated by spaces.
    """
    # Lemmatization is not applied; tokens keep their original form.
    # The comparison is done on the lower-cased token so that capitalised
    # stop words ("The", "And") are removed as well, while the tokens that
    # are kept retain their original casing.
    kept = [
        word for word in word_tokenize(line)
        if word.lower() not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(kept)
    
# One document per input line, with English stop words removed.
documents = sc.textFile("anarchism.txt").map(nomequalquer)

# HashingTF hashes every element of the iterable it is given. A plain string
# iterates character by character, so each cleaned document is split back
# into a list of word tokens before hashing; otherwise the term frequencies
# would describe single characters rather than words.
tokenized_documents = documents.map(lambda text: text.split())

hashingTF = HashingTF()
tf = hashingTF.transform(tokenized_documents)
#tf.take(1)
# tf is consumed twice below (IDF.fit, then idf.transform), so keep it cached.
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
# Convert each TF-IDF vector to an explicit DenseVector.
dense_tfidf = tfidf.map(lambda l: DenseVector(l.toArray()))
dense_tfidf.take(1)
# mat = RowMatrix(tfidf)
# svd = computeSVD(mat,k,True)