In [None]:
import findspark
findspark.init()

from operator import add
from pyspark import SparkContext
from pyspark import SparkConf

from pyspark.mllib.linalg import Vectors,DenseVector
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg.distributed import RowMatrix
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from math import log

In [None]:
# Reuse the running context when this cell is executed again: constructing a
# second SparkContext in the same kernel raises an error, so a plain
# SparkContext(...) call makes the cell non-idempotent.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("App Name"))

In [None]:
class SVD(JavaModelWrapper):
    """Python-side handle on the object returned by the JVM ``computeSVD`` call.

    Each property forwards to the attribute of the same name on the wrapped
    Java model.
    """

    @property
    def U(self):
        """Left singular vectors wrapped in a RowMatrix.

        Returns None when the wrapped model has no U (the original docstring
        ties this to ``computeU`` being False).
        """
        left_vectors = self.call("U")
        return None if left_vectors is None else RowMatrix(left_vectors)

    @property
    def s(self):
        """Singular values, as a DenseVector in descending order."""
        return self.call("s")

    @property
    def V(self):
        """Right singular vectors, as a DenseMatrix with one vector per column."""
        return self.call("V")


In [None]:
def computeSVD(row_matrix, k, computeU=False, rCond=1e-9):
    """
    Run a singular value decomposition on a RowMatrix through its JVM counterpart.

    An (m x n) matrix A is factored as U * diag(s) * V^T, where
    * s is a DenseVector of singular values in descending order,
    * U is an (m x k) RowMatrix of left singular vectors,
    * V is an (n x k) matrix of right singular vectors.

    :param row_matrix: the RowMatrix to decompose.
    :param k: number of singular values to keep (coerced with ``int``). Fewer
        than k may come back if some singular values are numerically zero.
    :param computeU: whether U should be computed (as A * V * sigma^-1).
    :param rCond: reciprocal condition number (coerced with ``float``).
        Singular values below rCond * sigma(0) are treated as zero, sigma(0)
        being the largest singular value.
    :returns: an SVD wrapper around the Java result.
    """
    java_call = row_matrix._java_matrix_wrapper.call
    decomposition = java_call("computeSVD", int(k), computeU, float(rCond))
    return SVD(decomposition)

def pre_process(line):
    """Tokenise one line of text, drop English stop words and stem the rest.

    Relies on the module-level ``stemmer`` being defined before this function
    is first invoked.

    :param line: raw text of one document.
    :returns: list of stemmed tokens, in their original order.
    """
    # Build the stop-word set once per line. The previous version called
    # stopwords.words('english') for every token and scanned the resulting
    # list linearly each time.
    stop_words = set(stopwords.words('english'))
    # The NLTK stop-word list is lowercase, so compare case-insensitively;
    # otherwise "The"/"It" at the start of a sentence slip through and end up
    # in the vocabulary after stemming lowercases them.
    return [stemmer.stem(token)
            for token in word_tokenize(line)
            if token.lower() not in stop_words]

In [None]:
def map_tf(document):
    """Tally how often each term occurs in a single tokenised document.

    :param document: iterable of (hashable) terms.
    :returns: list of ``(term, count)`` tuples, one per distinct term, in the
        iteration order of the underlying dict.
    """
    counts = {}
    for token in document:
        counts[token] = counts.get(token, 0) + 1
    return list(counts.items())

In [None]:
# Number of singular values (concepts) requested from the SVD further down.
k = 200
# Module-level stemmer used by pre_process.
stemmer = SnowballStemmer('english')
# One element per input line: the list of stemmed, stop-word-filtered tokens.
documents = sc.textFile("anarchism_clean.txt").map(pre_process)
# One element per document: list of (term, count-in-this-document) pairs.
# Cached because it is traversed several times below.
docTermFreqs = documents.map(map_tf).cache()

# Document frequency: in how many documents each term appears. map_tf emits
# each distinct term once per document, so every entry contributes 1.
# (Summing the per-document counts instead would give the corpus-wide term
# count, which can exceed num_docs and drive log(num_docs / df) negative.)
docFreqs = (docTermFreqs
            .flatMap(lambda doc: [(term, 1) for term, _ in doc])
            .reduceByKey(add))
num_docs = docTermFreqs.count()

In [None]:
# (term, log(num_docs / count)) for every term in docFreqs. float() keeps this
# a true division under Python 2 as well.
idfs = docFreqs.map(lambda x: (x[0], log(float(num_docs) / x[1]))).cache()
# NOTE: despite its name, idTerms holds (term, id) pairs, and term_ids holds
# the reversed (id, term) pairs. The names are kept because other cells use them.
idTerms = idfs.keys().zipWithIndex().cache()
term_ids = idTerms.map(lambda x: tuple(reversed(x)))

# term -> id
dict_id_terms = dict(idTerms.collect())
# id -> term. Derived locally from the mapping above rather than from a second
# Spark job: indices assigned by zipWithIndex on a shuffled RDD are not
# guaranteed to be stable if the RDD is re-evaluated, so two independent
# collects could yield maps that disagree with each other.
dict_terms_id = {term_id: term for term, term_id in dict_id_terms.items()}
# term -> the per-term count held in docFreqs
dict_term_freqs = dict(docFreqs.collect())
# term -> idf
dict_idfs = dict(idfs.collect())
# Vocabulary size; also the length of every document vector.
num_terms = len(dict_id_terms)

In [None]:
def map_funcao1(termFreqs):
    """Build the sparse TF-IDF vector for a single document.

    Reads the module-level ``num_terms``, ``dict_id_terms`` (term -> column id)
    and ``dict_idfs`` (term -> idf).

    :param termFreqs: list of ``(term, count)`` pairs for one document, as
        produced by ``map_tf``.
    :returns: sparse vector of length ``num_terms`` whose entry for a term is
        ``idf(term) * count / total_terms_in_document``.
    """
    # float() so the normalisation is a true division on Python 2 as well.
    docTotalTerms = float(sum(count for _, count in termFreqs))
    # Weight by the term's count in *this* document; using the corpus-wide
    # count would give a term the same numerator in every document.
    return Vectors.sparse(num_terms,
                          [(dict_id_terms[term], dict_idfs[term] * count / docTotalTerms)
                           for term, count in termFreqs])

# One vector per document. This must map over docTermFreqs (lists of
# (term, count) pairs); docFreqs holds single (term, int) pairs, which
# map_funcao1 cannot iterate. Cached because the SVD makes repeated passes
# over the rows.
vecs = docTermFreqs.map(map_funcao1).cache()

In [None]:
# Each row of the matrix is one document vector of length num_terms.
mat = RowMatrix(vecs)
# computeSVD requires 0 < k <= number of columns, so clamp k to the vocabulary
# size; otherwise a small corpus makes the call fail. U is requested (third
# argument) because the document-side analysis needs it.
svd = computeSVD(mat, min(k, num_terms), True)

In [None]:
def topTerms(svd, numConcepts, numTerms, termsIds):
    """List the highest-weighted terms of the leading concepts.

    :param svd: object exposing ``V``, a matrix with ``toArray()``,
        ``numRows`` and ``numCols`` (one column per concept, one row per term).
    :param numConcepts: how many leading concepts (columns of V) to inspect;
        clamped to the number of columns actually present.
    :param numTerms: how many terms to keep per concept.
    :param termsIds: mapping from row index to term.
    :returns: flat list of ``(term, weight)`` tuples. Concepts appear in
        order, each contributing its (at most) ``numTerms`` largest-weight
        terms sorted by descending weight.
    """
    v = svd.V
    # 2-D (numRows x numCols) array; index columns directly instead of
    # flattening, so each concept reads its own column.
    weights = v.toArray()
    selected = []
    for concept in range(min(numConcepts, v.numCols)):
        column = weights[:, concept]
        termWeights = [(termsIds[row], float(column[row])) for row in range(v.numRows)]
        # Rank by the weight itself (largest first).
        termWeights.sort(key=lambda pair: pair[1], reverse=True)
        selected.extend(termWeights[:numTerms])
    return selected
        
    

def topDocsInTopConcepts(svd, numConcepts, numDocs, docIds):
    """List the highest-weighted documents of the leading concepts.

    :param svd: object exposing ``U`` (a RowMatrix-like object with a ``rows``
        RDD, one row per document) and ``s`` (the singular values).
    :param numConcepts: how many leading concepts (columns of U) to inspect;
        clamped to the number of singular values present.
    :param numDocs: how many documents to keep per concept.
    :param docIds: mapping from row index to a document identifier.
        Assumes the rows of U are in the same order as the rows of the
        decomposed matrix -- TODO confirm for the Spark version in use.
    :returns: flat list of ``(doc_id, weight)`` tuples. Concepts appear in
        order, each contributing its (at most) ``numDocs`` largest-weight
        documents sorted by descending weight.
    :raises ValueError: if the decomposition carries no U.
    """
    u = svd.U
    if u is None:
        raise ValueError("svd.U is not available; run computeSVD with computeU=True")
    # Attach a row index once and reuse it for every concept.
    indexed_rows = u.rows.zipWithIndex().cache()
    topDocs = []
    for concept in range(min(numConcepts, len(svd.s))):
        # Bind the column through a default argument: RDD transformations are
        # lazy, so a plain closure over the loop variable is fragile.
        docWeights = indexed_rows.map(
            lambda pair, c=concept: (float(pair[0].toArray()[c]), pair[1]))
        # Tuples compare by weight first, so top() yields the heaviest documents.
        topDocs.extend((docIds[row], weight) for weight, row in docWeights.top(numDocs))
    indexed_rows.unpersist()
    return topDocs



In [None]:
# Show the 10 strongest terms for each of the k concepts.
topTerms(svd, numConcepts=k, numTerms=10, termsIds=dict_terms_id)