In [14]:
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StringType, DateType, TimestampType

import plotly.graph_objects as go
import plotly.figure_factory as ff

In [2]:
# Location of the JSON input; kept in one place so it is easy to repoint.
DATA_PATH = "/data/new/"

# Load every JSON record under DATA_PATH into a Spark DataFrame
# (schema is inferred by the reader).
df = spark.read.json(DATA_PATH)

In [3]:
# Number of rows in the loaded DataFrame; being the last expression of the
# cell, the value is displayed as the cell output.
df.count()

35483

In [12]:
# Row count per `category`, collected to the driver as a pandas DataFrame
# with columns `category` and `cnt`.
categories = (
    df.groupBy("category")
    .count()
    .withColumnRenamed("count", "cnt")
    .toPandas()
)

In [13]:
# Bar chart of the per-category row counts computed in `categories`.
# Bracket indexing is used instead of attribute access so the lookup can never
# be shadowed by a pandas DataFrame attribute of the same name.
data = [go.Bar(x=categories["category"], y=categories["cnt"])]
fig = go.Figure(data=data)

# Title and axis labels so the figure is readable on its own when skimmed.
fig.update_layout(
    title="Number of records per category",
    xaxis_title="category",
    yaxis_title="count",
)
fig.show()

In [22]:
from pyspark.ml.feature import RegexTokenizer, Word2Vec, StopWordsRemover
from pyspark.ml import Pipeline

In [20]:
# Tokenise the `abstract` column into `abstract_token`.
# With gaps=False the pattern describes the tokens themselves (runs of word
# characters) rather than the separators between them.
# The pattern is a raw string: '\w' inside a plain literal is an invalid
# escape sequence that Python warns about; the runtime regex is unchanged.
regexTokenizer = RegexTokenizer(
    gaps=False,
    pattern=r'\w+',
    inputCol='abstract',
    outputCol='abstract_token',
)

In [21]:
# Stop-word filter: reads the token lists from `abstract_token` and writes the
# filtered lists to `abstract_sw_removed` (default stop-word list is used).
swr = (
    StopWordsRemover()
    .setInputCol('abstract_token')
    .setOutputCol('abstract_sw_removed')
)

In [23]:
# Word2Vec estimator over the stop-word-filtered tokens: 10-dimensional
# vectors, ignoring tokens that occur fewer than 5 times in the corpus.
# Output goes to `abstract_embedding`.
# The seed is pinned so the embeddings are repeatable across kernel restarts;
# without it every fresh run can produce different vectors.
word2vec = Word2Vec(
    vectorSize=10,
    minCount=5,
    seed=42,
    inputCol='abstract_sw_removed',
    outputCol='abstract_embedding',
)

In [25]:
# Stages run in order: tokenise -> drop stop words -> embed.
pipeline_stages = [regexTokenizer, swr, word2vec]
pipeline = Pipeline(stages=pipeline_stages)

In [26]:
# Fit the pipeline on the full DataFrame, then apply the fitted model to the
# same DataFrame to attach the token, filtered-token and embedding columns.
pipeline_model = pipeline.fit(df)
result = pipeline_model.transform(df)

In [27]:
# Preview the embedding column: first 20 rows, long values truncated
# (these are the defaults of `show`, spelled out here).
embedding_preview = result.select('abstract_embedding')
embedding_preview.show(n=20, truncate=True)

+--------------------+
|  abstract_embedding|
+--------------------+
|[-0.1273724277537...|
|[0.15270241799477...|
|[-0.3203889724994...|
|[-0.1457175488610...|
|[0.03539156706188...|
|[-0.0373621375461...|
|[-0.1193792162585...|
|[-0.1660933268182...|
|[-0.1168947401928...|
|[0.04187856778914...|
|[0.14161779064131...|
|[0.26135577482637...|
|[0.35225615823641...|
|[0.16570435597158...|
|[-0.0915137479999...|
|[-0.0147896639713...|
|[-0.0751811000613...|
|[-0.1277646767615...|
|[0.10511150366840...|
|[-0.0643902558871...|
+--------------------+
only showing top 20 rows

