In [2]:
import os, sys

from random import randrange
from operator import itemgetter

from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
from pyspark.sql.types import Row
from pyspark.sql import functions as F

In [3]:
spark = SparkSession.builder.master("spark://spark-master:7077")\
                            .appName("convtype")\
                            .config("spark.executor.memory", "6g")\
                            .config("spark.jars", "spark-xml_2.12-0.11.0.jar")\
                            .getOrCreate()

In [4]:
df = spark.read.format("xml") \
                .option("rowTag", "page") \
                .load("data/Wikipedia-20210208092423.xml")

In [5]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _bytes: long (nullable = true)
 |    |    |-- _space: string (nullable = true)
 |    |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)



In [6]:
parsedDF = df.select("title", "revision.text._VALUE")

revision: struct (nullable = true)
 |    |-- comment: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _bytes: long (nullable = true)
 |    |    |-- _space: string (nullable = true)
 |    |-- timestamp: string (nullable = true)

In [7]:
from pyspark.sql.functions import *
parsedDF = parsedDF.withColumn("parsed", trim(col("_VALUE"))).drop("_VALUE")

In [8]:
from pyspark.sql.functions import udf, col, lower, regexp_replace

In [9]:
df_clean = parsedDF.select('title', (lower(regexp_replace('parsed', "[^a-zA-Z0-9\\s]", " ")).alias('text')))

In [10]:
df_clean.show(truncate=False, n=2)

+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|title                      |text                                                                                                                                                                 |
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Category:Symmetry          |  commons cat symmetry  
  cat main  

  category geometry  
  category theoretical physics  
  category artistic techniques  
  category aesthetics  
  catautotoc  |
|Category:Homogeneous spaces|  cat main homogeneous space  

  category geometry  
  category group actions  mathematics   
  category mathematical structures                                    |
+-------------------

In [11]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [12]:
tokenizer = Tokenizer(inputCol='text', outputCol='words_token')
df_words_token = tokenizer.transform(df_clean).select("title", "words_token")

In [13]:
df_words_token.show(truncate=False, n=2)

+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|title                      |words_token                                                                                                                                                                                                    |
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Category:Symmetry          |[, , commons, cat, symmetry, , , , , cat, main, , , , , , category, geometry, , , , , category, theoretical, physics, , , , , category, artistic, techniques, , , , , category, aesthetics, , , , , catautotoc]|
|Category:Homogeneous spaces|[, , cat, main, hom

In [14]:
stopWords = spark.sparkContext.textFile('data/stopwords.txt')
stopWordList = stopWords.collect()

In [15]:
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean', stopWords=stopWordList)

In [16]:
df_no_stopWords = remover.transform(df_words_token).select('title', 'words_clean')

In [17]:
df_no_stopWords.show(truncate=False, n=2)

+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|title                      |words_clean                                                                                                                                                                                                    |
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Category:Symmetry          |[, , commons, cat, symmetry, , , , , cat, main, , , , , , category, geometry, , , , , category, theoretical, physics, , , , , category, artistic, techniques, , , , , category, aesthetics, , , , , catautotoc]|
|Category:Homogeneous spaces|[, , cat, main, hom

In [18]:
from pyspark.sql.functions import pandas_udf,PandasUDFType
from pyspark.sql.types import *
import pandas as pd
import numpy as np
@pandas_udf(ArrayType(StringType()))
def func(v: pd.Series) -> pd.Series:
    res = []
    for row in v:
        res.append(row[row != ''])
    return pd.Series(res)

In [19]:
words_clean_remove_empty = df_no_stopWords.withColumn('text',func(df_no_stopWords.words_clean)).select('title', 'text')

In [20]:
words_clean_remove_empty.show(truncate=False, n=2)

+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|title                      |text                                                                                                                                                     |
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|Category:Symmetry          |[commons, cat, symmetry, cat, main, category, geometry, category, theoretical, physics, category, artistic, techniques, category, aesthetics, catautotoc]|
|Category:Homogeneous spaces|[cat, main, homogeneous, space, category, geometry, category, group, actions, mathematics, category, mathematical, structures]                           |
+---------------------------+---------------------------------------------------

In [21]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer(language='english')
stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
stemmed_words = words_clean_remove_empty.withColumn("SnowballStemmed", stemmer_udf("text"))

In [22]:
stemmed_words.show(truncate=False, n=2)

+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+
|title                      |text                                                                                                                                                     |SnowballStemmed                                                                                                                             |
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+
|Category:Symmetry       

In [23]:
@pandas_udf(ArrayType(StringType()))
def func(v: pd.Series) -> pd.Series:
    res = []
    for tokens in v:
        res.append([stemmer.stem(token) for token in tokens])
    return pd.Series(res)

In [24]:
stemmed_words = stemmed_words.withColumn('pandas_udf_stemmed',func(words_clean_remove_empty.text))

In [25]:
stemmed_words_with_size = stemmed_words.withColumn("token_count", size(col("pandas_udf_stemmed")))

In [26]:
filtered_stemmed_words = stemmed_words_with_size.where(col("token_count") > 1)

In [27]:
print(stemmed_words_with_size.count())

150


In [28]:
print(filtered_stemmed_words.count())

150


In [29]:
filtered_stemmed_words.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- SnowballStemmed: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pandas_udf_stemmed: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- token_count: integer (nullable = false)



In [30]:
from pyspark.ml.feature import CountVectorizer

countVectorizer = CountVectorizer(inputCol="pandas_udf_stemmed",
                                      outputCol="termFreqs",
                                      vocabSize=20000)
vocabModel = countVectorizer.fit(filtered_stemmed_words)
docTermFreqs = vocabModel.transform(filtered_stemmed_words)

In [31]:
docTermFreqs.cache()

DataFrame[title: string, text: array<string>, SnowballStemmed: array<string>, pandas_udf_stemmed: array<string>, token_count: int, termFreqs: vector]

In [32]:
docTermFreqs.show(n=1)

+-----------------+--------------------+--------------------+--------------------+-----------+--------------------+
|            title|                text|     SnowballStemmed|  pandas_udf_stemmed|token_count|           termFreqs|
+-----------------+--------------------+--------------------+--------------------+-----------+--------------------+
|Category:Symmetry|[commons, cat, sy...|[common, cat, sym...|[common, cat, sym...|         16|(13550,[7,56,91,1...|
+-----------------+--------------------+--------------------+--------------------+-----------+--------------------+
only showing top 1 row



In [33]:
from pyspark.ml.feature import IDF

idf = IDF(inputCol="termFreqs", 
          outputCol="tfidfVec")
idfModel = idf.fit(docTermFreqs)
docTermMatrix = idfModel.transform(docTermFreqs).select("title", "tfidfVec")

In [34]:
docTermMatrix.show()

+--------------------+--------------------+
|               title|            tfidfVec|
+--------------------+--------------------+
|   Category:Symmetry|(13550,[7,56,91,1...|
|Category:Homogene...|(13550,[7,14,24,3...|
|       Ambient space|(13550,[0,1,2,3,5...|
|Category:Duality ...|(13550,[1,3,7,38,...|
|          Superspace|(13550,[0,1,2,3,4...|
|     Geometry Center|(13550,[0,1,2,5,6...|
|          Dehn plane|(13550,[0,1,3,5,6...|
|Complex reflectio...|(13550,[0,1,2,3,4...|
|    Lipschitz domain|(13550,[0,1,2,3,5...|
|        Complex line|(13550,[2,7,8,10,...|
|Visibility (geome...|(13550,[2,5,6,7,8...|
|   Spacetime diagram|(13550,[0,1,2,3,4...|
|Partial linear space|(13550,[0,1,3,5,7...|
| Geometry processing|(13550,[0,1,2,3,5...|
|          Hatch mark|(13550,[0,1,2,3,5...|
|       Lattice plane|(13550,[2,5,7,12,...|
|Tarski's plank pr...|(13550,[0,1,2,3,4...|
|     Visual calculus|(13550,[0,2,3,5,6...|
|          Benz plane|(13550,[0,1,2,3,5...|
|Infinitely near p...|(13550,[0,

In [35]:
from pyspark.sql.functions import monotonically_increasing_id

docTermFreqswithID = docTermFreqs.withColumn('id', monotonically_increasing_id()).cache()

In [36]:
docTermFreqswithID.show()

+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+---+
|               title|                text|     SnowballStemmed|  pandas_udf_stemmed|token_count|           termFreqs| id|
+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+---+
|   Category:Symmetry|[commons, cat, sy...|[common, cat, sym...|[common, cat, sym...|         16|(13550,[7,56,91,1...|  0|
|Category:Homogene...|[cat, main, homog...|[cat, main, homog...|[cat, main, homog...|         13|(13550,[7,14,24,3...|  1|
|       Ambient space|[short, descripti...|[short, descript,...|[short, descript,...|        312|(13550,[0,1,2,3,5...|  2|
|Category:Duality ...|[portal, mathemat...|[portal, mathemat...|[portal, mathemat...|         48|(13550,[1,3,7,38,...|  3|
|          Superspace|[superspace, coor...|[superspac, coord...|[superspac, coord...|       1512|(13550,[0,1,2,3,4...|  4|
|     Geometry C

In [37]:
docTermMatrix.printSchema()

root
 |-- title: string (nullable = true)
 |-- tfidfVec: vector (nullable = true)



In [38]:
from pyspark.mllib.linalg.distributed import RowMatrix

[ml 과 mllib 벡터](https://stackoverflow.com/questions/41074182/cannot-convert-type-class-pyspark-ml-linalg-sparsevector-into-vector)

In [39]:
from pyspark.mllib.util import MLUtils
vecDF = MLUtils.convertVectorColumnsFromML(docTermMatrix, "tfidfVec")
vecDF.show()

+--------------------+--------------------+
|               title|            tfidfVec|
+--------------------+--------------------+
|   Category:Symmetry|(13550,[7,56,91,1...|
|Category:Homogene...|(13550,[7,14,24,3...|
|       Ambient space|(13550,[0,1,2,3,5...|
|Category:Duality ...|(13550,[1,3,7,38,...|
|          Superspace|(13550,[0,1,2,3,4...|
|     Geometry Center|(13550,[0,1,2,5,6...|
|          Dehn plane|(13550,[0,1,3,5,6...|
|Complex reflectio...|(13550,[0,1,2,3,4...|
|    Lipschitz domain|(13550,[0,1,2,3,5...|
|        Complex line|(13550,[2,7,8,10,...|
|Visibility (geome...|(13550,[2,5,6,7,8...|
|   Spacetime diagram|(13550,[0,1,2,3,4...|
|Partial linear space|(13550,[0,1,3,5,7...|
| Geometry processing|(13550,[0,1,2,3,5...|
|          Hatch mark|(13550,[0,1,2,3,5...|
|       Lattice plane|(13550,[2,5,7,12,...|
|Tarski's plank pr...|(13550,[0,1,2,3,4...|
|     Visual calculus|(13550,[0,2,3,5,6...|
|          Benz plane|(13550,[0,1,2,3,5...|
|Infinitely near p...|(13550,[0,

In [40]:
vecRDD = vecDF.select("tfidfVec").rdd.flatMap(lambda x:x)

mat = RowMatrix(vecRDD)

In [41]:
svd = mat.computeSVD(50, computeU=True)

In [42]:
termIds = vocabModel.vocabulary

In [43]:
docIds = docTermFreqswithID.select(create_map('id', 'title').alias('map'))

In [44]:
docIds.show(n=1)

+--------------------+
|                 map|
+--------------------+
|[0 -> Category:Sy...|
+--------------------+
only showing top 1 row



In [45]:
docIds.collect()

[Row(map={0: 'Category:Symmetry'}),
 Row(map={1: 'Category:Homogeneous spaces'}),
 Row(map={2: 'Ambient space'}),
 Row(map={3: 'Category:Duality theories'}),
 Row(map={4: 'Superspace'}),
 Row(map={5: 'Geometry Center'}),
 Row(map={6: 'Dehn plane'}),
 Row(map={7: 'Complex reflection group'}),
 Row(map={8: 'Lipschitz domain'}),
 Row(map={9: 'Complex line'}),
 Row(map={10: 'Visibility (geometry)'}),
 Row(map={11: 'Spacetime diagram'}),
 Row(map={12: 'Partial linear space'}),
 Row(map={13: 'Geometry processing'}),
 Row(map={14: 'Hatch mark'}),
 Row(map={15: 'Lattice plane'}),
 Row(map={16: "Tarski's plank problem"}),
 Row(map={17: 'Visual calculus'}),
 Row(map={18: 'Benz plane'}),
 Row(map={19: 'Infinitely near point'}),
 Row(map={20: 'Isogonal'}),
 Row(map={21: 'Axis-aligned object'}),
 Row(map={22: 'Coplanarity'}),
 Row(map={23: 'Real tree'}),
 Row(map={24: 'Manipulability ellipsoid'}),
 Row(map={25: "Cavalieri's principle"}),
 Row(map={26: 'Flatness (mathematics)'}),
 Row(map={27: 'Corn

In [46]:
v = svd.V

In [47]:
arr = v.toArray()

In [48]:
arr

array([[-2.82703518e-01,  1.64697759e-01,  1.47379783e-01, ...,
        -2.68032486e-02,  7.88174405e-02, -9.40086762e-02],
       [-1.08884980e-01,  4.19537196e-02,  2.17882971e-02, ...,
         2.76171395e-02, -2.58898615e-02, -4.08647334e-02],
       [-5.24200208e-02,  3.51223888e-02, -3.53020490e-02, ...,
         5.43403774e-03,  2.41877855e-02,  3.98727601e-03],
       ...,
       [-1.43369658e-04,  5.81407408e-05, -8.93747465e-05, ...,
        -5.57202907e-04, -2.35575798e-04, -1.67956605e-04],
       [-9.26131114e-05,  3.12016253e-05,  9.60880822e-05, ...,
        -4.52348849e-04, -3.38346245e-04, -4.05417903e-04],
       [-5.03847318e-05,  2.00135761e-05, -4.67463228e-06, ...,
         2.96339230e-04, -1.43502647e-05, -6.19633881e-04]])

In [49]:
transposedArr = arr.transpose()

In [51]:
arr[0]
len(arr[0])

50

In [52]:
len(arr)

13550

In [53]:
def topTermsInTopConcepts(svd, numConcepts, numTerms, termIds):
    arr = svd.V.toArray().transpose()
    res = []
    for i,v  in enumerate(arr):
        if( i > numConcepts ): break

        v = list(enumerate(v))
        v.sort(key=lambda x : x[1], reverse=True)
        v = v[0:numTerms]
        v = list((termIds[termId], score) for termId, score in v)
        res.append(v)
    return res

In [54]:
topTermsInTopConcepts(svd, 4, 10, termIds)

[[('geometri', 1.5678831981049865e-19),
  ('categori', -6.094395078157064e-19),
  ('wpss', -4.974050824577774e-08),
  ('newstub', -4.9740508245792666e-08),
  ('catrel', -1.8113050476640275e-07),
  ('spaceflight', -5.381884622449594e-07),
  ('disambig', -6.997036659610312e-07),
  ('catautotoc', -8.329525916104524e-07),
  ('gem', -9.392279543308255e-07),
  ('cs348b', -9.392279543312185e-07)],
 [('phi', 0.18799956042473137),
  ('math', 0.1646977587223749),
  ('t', 0.1303945111415621),
  ('diffeomorph', 0.11638211453415771),
  ('anatomi', 0.10731195094533721),
  ('v', 0.10024116137011954),
  ('match', 0.0758741837413897),
  ('x', 0.07161059203462791),
  ('sub', 0.06966893576247948),
  ('varphi', 0.0690792482638371)],
 [('sub', 0.5352921187049191),
  ('y', 0.15643338031422663),
  ('math', 0.14737978347414654),
  ('h', 0.13018148632796064),
  ('gamma', 0.1256484507454903),
  ('x', 0.1117757223295788),
  ('busemann', 0.09606925447307893),
  ('hadamard', 0.09496404110893532),
  ('sup', 0.08947

In [55]:
def topDocsInTopConcept(svd, numConcepts, numDocs, docIds):
    u = svd.U
    res = []

    for i, u in enumerate(u.rows.map(lambda i : i.toArray()).collect()):
        if( i > numConcepts ): break
        u = list(enumerate(u))
        u.sort(key=lambda x: x[1], reverse=True)
        u = u[0:numDocs]
        u = list((docIds.collect()[docId][0][docId], score) for docId, score in u)
        res.append(u)
    return res

In [56]:
topDocsInTopConcept(svd, 4, 10, docIds)

[[('Hatch mark', 0.002544294286500924),
  ('Transversality (mathematics)', 0.0010153629251844684),
  ('Supporting line', 0.0009310251030893441),
  ("Cavalieri's principle", 0.000765911098599488),
  ('Geometry processing', 0.0007434422443675006),
  ('Timeline of geometry', 0.0006033744774238724),
  ('Outline of geometry', 0.0005058739268164644),
  ('Mathematical visualization', 0.0003962247715992711),
  ('Quasisymmetric map', 0.00036315619310208283),
  ("Tarski's plank problem", 0.0003349396551846728)],
 [("Cavalieri's principle", 0.0011419559849047338),
  ('Outline of geometry', 0.0010811961779002002),
  ('Mathematical visualization', 0.0009448482480736536),
  ('Supporting line', 0.0008373239410025579),
  ('Isogonal', 0.0006324171236660299),
  ('Unit hyperbola', 0.0004245118471315644),
  ('Infinitely near point', 0.00042350087679640343),
  ('Category:History of geometry', 0.00031575871801271696),
  ('Complex line', 0.0002953242113434343),
  ('Geometry and topology', 0.00028865409890449

In [58]:
u = svd.U
s = svd.s
v = svd.V

print(type(u))
print(type(s))
print(type(v))

<class 'pyspark.mllib.linalg.distributed.RowMatrix'>
<class 'pyspark.mllib.linalg.DenseVector'>
<class 'pyspark.mllib.linalg.DenseMatrix'>


In [65]:
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

s_distributed = spark.sparkContext.parallelize(np.diag(s.toArray())).zipWithIndex() # 대각 행렬
s_distributed = IndexedRowMatrix(s_distributed.map(lambda row: IndexedRow(row[1], row[0]))).toBlockMatrix()

In [74]:
v_distributed = spark.sparkContext.parallelize(v.toArray()).zipWithIndex()
v_distributed = IndexedRowMatrix(v_distributed.map(lambda row: IndexedRow(row[1], row[0]))).toBlockMatrix()

In [75]:
print(type(s_distributed))
print(type(v_distributed))

<class 'pyspark.mllib.linalg.distributed.BlockMatrix'>
<class 'pyspark.mllib.linalg.distributed.BlockMatrix'>


In [76]:
sv_distributed = v_distributed.multiply(s_distributed)

In [84]:
sv_dotted = sv_distributed.blocks.collect()

In [85]:
np.shape(sv_dotted)

  return array(a, dtype, copy=False, order=order)


(14, 2)

In [82]:
# local
local_v = v.toArray()
local_s = np.diag(s.toArray())
sv = np.dot(local_v, local_s)

print(f"{np.shape(local_v)} dot {np.shape(local_s)} -> {np.shape(sv)}" )

(13550, 50) dot (50, 50) -> (13550, 50)


In [83]:
from sklearn.preprocessing import normalize

ModuleNotFoundError: No module named 'sklearn'