In [0]:
import org.apache.spark.sql.SparkSession

// Create (or reuse) the SparkSession for this notebook. The spark-xml jar is
// passed through `spark.jars` so the XML data source can be loaded.
val spark = SparkSession.builder()
    .appName("spark get xml data from hdfs")
    .config("spark.jars", "/opt/jar/spark-xml_2.12-0.9.0.jar")
    .getOrCreate()

In [1]:
import com.databricks.spark.xml._

// Load the MEDLINE sample from HDFS; `rowTag` selects <MedlineCitation> as the row element.
val rawXML = spark.read
    .option("rowTag", "MedlineCitation")
    .xml("hdfs://namenode:8020/medline/medsamp2016a.xml")

In [2]:
// Print the inferred schema of the citation records as a tree.
println(rawXML.schema.treeString)

In [3]:
import spark.implicits._
// Project only the MeshHeading field nested under MeshHeadingList.
val meshHeadlingList = rawXML.select($"MeshHeadingList.MeshHeading")


In [4]:
// Print the schema of the projected MeshHeading column.
println(meshHeadlingList.schema.treeString)

In [5]:
// Preview the first 20 rows with long cell values truncated (the defaults of show()).
meshHeadlingList.show(20, truncate = true)

In [6]:
// Same schema dump as the earlier cell; meshHeadlingList is not modified in between.
println(meshHeadlingList.schema.treeString)

In [7]:
// Explode the MeshHeading collection: one output row per element, in a single column "data".
val MeshHeadlingElems = meshHeadlingList.select(explode($"MeshHeading").as("data"))

In [8]:
// Print the schema of the exploded elements.
println(MeshHeadlingElems.schema.treeString)

In [9]:
// Keep only the DescriptorName field of each exploded element, then print its schema.
val descriptorName = MeshHeadlingElems.select($"data.DescriptorName")
println(descriptorName.schema.treeString)

In [10]:
// Flatten DescriptorName into four top-level columns. With spark-xml's default
// options, XML attributes are prefixed with "_" and element text is stored in "_VALUE".
val parsedDF = descriptorName.select(
    "DescriptorName._MajorTopicYN",
    "DescriptorName._Type",
    "DescriptorName._UI",
    "DescriptorName._VALUE")

In [11]:
// Preview the flattened descriptor columns (first 20 rows, truncated cells).
parsedDF.show(20, truncate = true)

In [12]:
// Keep only descriptors whose _MajorTopicYN attribute equals "Y".
val majorTopic = parsedDF.where($"_MajorTopicYN" === "Y")

In [13]:
// Number of rows per distinct descriptor text.
val toppicDist = majorTopic.groupBy($"_VALUE").count()

In [14]:
// Show the most frequent descriptor texts first.
toppicDist.sort($"count".desc).show()

In [15]:
// Preview the filtered descriptors without truncating cell contents.
majorTopic.show(20, truncate = false)

In [16]:
// Keep just the descriptor text, renamed to "topic", and mark it for caching
// because several later cells read from it.
val majorTopicVal = majorTopic.select($"_VALUE".alias("topic")).persist()

In [17]:
// Preview the topic strings without truncation.
majorTopicVal.show(20, truncate = false)

In [18]:
// Split each topic string on commas into a list of terms.
// - Tokens are trimmed: a bare split(",") turns "A, B" into "A" and " B", and the
//   leading space would make " B" a different term (and a different hash) from "B".
// - Empty tokens are dropped, and a null topic yields an empty list instead of
//   throwing a NullPointerException.
val topics = majorTopicVal.select("topic").rdd.map { row =>
    Option(row.getString(0)).toList
        .flatMap(_.split(","))
        .map(_.trim)
        .filter(_.nonEmpty)
}

In [19]:
// Flatten the per-row term lists into a single-column DataFrame, one term per row.
val onlyTopics = topics.flatMap(identity).toDF("topic")

In [20]:
// Build every 2-element combination of the terms in each list. Sorting first gives
// each pair a canonical order, so (a, b) and (b, a) are counted as the same pair.
val topicPairs = topics.flatMap(_.sorted.combinations(2)).toDF("pairs")
topicPairs.createOrReplaceTempView("topic_pairs")

// Count how many times each pair occurs.
// The result is named `cooccurs` (it was misspelled `coccurs`), which is the
// name the following cells use when registering the view and building graph edges.
val cooccurs = spark.sql("""
    SELECT pairs, COUNT(*) cnt
    FROM topic_pairs
    GROUP BY pairs""")


In [21]:
// Register the pair counts as a SQL view and print the ten most frequent pairs.
cooccurs.createOrReplaceTempView("cooccurs")
for (row <- spark.sql("""
    SELECT pairs, cnt
    FROM cooccurs
    ORDER BY cnt DESC
    LIMIT 10""").collect()) println(row)

In [22]:
import java.nio.charset.StandardCharsets
import java.security.MessageDigest

/** Derives a 64-bit id from a string.
  *
  * The string is encoded as UTF-8 and hashed with MD5; the first eight bytes of
  * the digest are then assembled into a Long, with byte 0 as the least
  * significant byte.
  *
  * @param str text to hash
  * @return the Long built from the first eight digest bytes
  */
def hashID(str: String): Long = {
    val digest = MessageDigest.getInstance("MD5").digest(str.getBytes(StandardCharsets.UTF_8))
    // Byte i contributes bits [8*i, 8*i + 7]; masking with 0xFFL removes sign extension.
    (0 until 8).foldLeft(0L) { (acc, i) =>
        acc | ((digest(i) & 0xFFL) << (8 * i))
    }
}

In [23]:
import org.apache.spark.sql.Row

// Build the vertex table: one (hash, topic) row per distinct term.
// onlyTopics holds one row per occurrence of a term, so it is de-duplicated first;
// otherwise the same vertex id would be emitted once per occurrence.
val vertices = onlyTopics.distinct()
    .map { case Row(topic: String) => (hashID(topic), topic) }
    .toDF("hash", "topic")
vertices.show(false)

In [24]:
import org.apache.spark.graphx._

// Turn every (pair, count) row into a GraphX edge: the two terms are hashed to
// vertex ids, the ids are sorted, and the count becomes the edge attribute.
val edges = cooccurs.map {
    case Row(pairs: Seq[_], cnt: Long) =>
        val endpoints = pairs.map(term => hashID(term.toString)).sorted
        Edge(srcId = endpoints(0), dstId = endpoints(1), attr = cnt)
}

In [25]:
// Convert the vertex DataFrame (columns: hash, topic) into (id, attribute) tuples,
// assemble the graph from vertices and edges, and cache it.
val vertexRDD = vertices.rdd.map(row => (row.getLong(0), row.getString(1)))
val topicGraph = Graph(vertexRDD, edges.rdd)
topicGraph.cache()

In [26]:
// Run GraphX connected components on the topic graph.
val connectedComponentGraph: Graph[VertexId, Long] = topicGraph.connectedComponents()

In [27]:
// Expose the (vertex id, component id) pairs as a DataFrame and preview them untruncated.
val componentDF = connectedComponentGraph.vertices.toDF("vid", "cid")
componentDF.show(20, truncate = false)

In [28]:
// Count the vertices in each component, then count how many components there are.
val componentCounts = componentDF.groupBy($"cid").count()
componentCounts.count()

In [29]:
// Render the component sizes through the notebook's `z` helper, largest first.
z.show(componentCounts.sort($"count".desc))

In [30]:
import org.apache.spark.sql.functions.split

// Split each topic string on commas into an array column "topic_split".
// The val is named `topicSplit` (it was declared as `topicSplittopicSplit`),
// which is the name the next statement and the following cell refer to.
val topicSplit = majorTopicVal.withColumn("topic_split", split(col("topic"), ","))

// Among topics that split into exactly two parts, show the most frequent ones.
topicSplit
    .withColumn("size", size(col("topic_split")))
    .groupBy("topic_split", "size")
    .count()
    .filter(col("size") === 2)
    .orderBy(desc("count"))
    .show(false)

In [31]:
// Add column "topic1" holding one element of topic_split per output row.
val topicSplitExp = topicSplit.withColumn("topic1", explode($"topic_split"))

In [32]:
// Overwrite topic1 with its trimmed value to remove the spaces left by the comma split.
val topicTrimmed = topicSplitExp.withColumn("topic1", trim($"topic1"))

In [33]:
// Preview the trimmed rows (first 20 rows, truncated cells).
topicTrimmed.show()

In [34]:
// Keep the original topic string next to each trimmed term and preview untruncated.
// NOTE(review): this rebinds `topics`, which an earlier cell defined as an RDD of
// term lists; re-running those earlier cells after this one would see this DataFrame.
val topics = topicTrimmed.select($"topic", $"topic1")
topics.show(20, truncate = false)