In [None]:
/scalable-data-science/000_0-sds-3-x-projects/student-project-01_group-TheTwoCultures/01_load_data

In [None]:
import org.apache.spark.sql.functions.{col,concat_ws, udf, flatten, explode, collect_list, collect_set, lit}
import org.apache.spark.sql.types.{ ArrayType, StructType, StructField, StringType, IntegerType }
import com.databricks.spark.xml._ // Add the DataFrame.read.xml() method
import org.apache.spark.sql.functions._

/**
 * Reads a forum XML file with spark-xml, using "forum" as the row tag.
 *
 * The schema is given explicitly rather than inferred. Within a sentence only
 * the `w` children are kept, each read as a plain string.
 *
 * NOTE(review): `text` is declared here as an array of structs, while a schema
 * printed later in the notebook shows it as a single struct — that output was
 * presumably produced by a different version of this function; confirm.
 *
 * @param file_name path of the XML file, handed unchanged to the reader
 * @return a DataFrame with the columns `_id`, `_title`, `_url` and `thread`
 */
def read_xml(file_name: String): org.apache.spark.sql.DataFrame = {
  // Innermost level: a sentence is a list of <w> values.
  val sentence_schema = StructType(Array(
    StructField("w", ArrayType(StringType, true), nullable=true)
  ))
  val text_schema = StructType(Array(
    StructField("sentence", ArrayType(sentence_schema), nullable=false)
  ))

  val thread_schema = StructType(Array(
    StructField("_id", StringType, nullable = false),
    StructField("_title", StringType, nullable = false),
    StructField("_url", StringType, nullable = false),
    StructField("text", ArrayType(text_schema), nullable=false)
  ))

  // Row-level schema: forum attributes plus the threads nested inside it.
  val forum_schema = StructType(Array(
    StructField("_id", StringType, nullable = false),
    StructField("_title", StringType, nullable = false),
    StructField("_url", StringType, nullable = false),
    StructField("thread", ArrayType(thread_schema), nullable=false)
  ))

  spark.read
    .option("rowTag", "forum")
    .schema(forum_schema)
    .xml(file_name)
}


/**
 * Builds a per-thread dataset from a forum XML file.
 *
 * The file is read with `read_xml`, the words of every thread are exploded and
 * re-collected per `thread_id`, and the text columns are rendered as
 * comma-separated strings. Two constant columns are derived from the path,
 * which is expected to end in `<platform>/<corpus file>`.
 *
 * The joining is done with the built-in `concat_ws` instead of a
 * `Seq[String]` UDF: `forum_id` and `forum_title` are plain strings under the
 * schema `read_xml` declares, which a `Seq[String]` UDF cannot take, whereas
 * `concat_ws` accepts both string and array<string> columns. Unlike
 * `mkString`, `concat_ws` skips nulls instead of rendering them as "null".
 *
 * @param file_name path of the XML file; needs at least two "/"-separated parts
 * @return a DataFrame with the columns `thread_id`, `thread_title`, `w`,
 *         `forum_id`, `forum_title`, `platform` and `corpus_id`
 */
def get_dataset(file_name: String) : org.apache.spark.sql.DataFrame = {
  val xml_df = read_xml(file_name)

  // Last path component is the corpus file, the one before it the platform.
  val splitted_name = file_name.split("/")
  val forum = splitted_name(splitted_name.size-2)
  val corpus = splitted_name(splitted_name.size-1)

  xml_df.select(col("_id") as "forum_id",
                col("_title") as "forum_title",
                col("thread._id") as "thread_id",
                col("thread._title") as "thread_title",
                flatten(col("thread.text.sentence.w")) as "w")
        // One word per row, so that words can be re-collected per thread.
        .withColumn("w", explode(col("w")))
        .groupBy("thread_id")
        .agg(first("thread_title") as "thread_title",
             collect_list("w") as "w",
             first("forum_id") as "forum_id",
             first("forum_title") as "forum_title")
        .withColumn("w", concat_ws(",", col("w")))
        .withColumn("thread_title", concat_ws(",", col("thread_title")))
        .withColumn("forum_id", concat_ws(",", col("forum_id")))
        .withColumn("forum_title", concat_ws(",", col("forum_title")))
        .withColumn("platform", lit(forum))
        .withColumn("corpus_id", lit(corpus))
}

  

>     import org.apache.spark.sql.functions.{col, concat_ws, udf, flatten, explode, collect_list, collect_set, lit}
>     import org.apache.spark.sql.types.{ArrayType, StructType, StructField, StringType, IntegerType}
>     import com.databricks.spark.xml._
>     import org.apache.spark.sql.functions._
>     read_xml: (file_name: String)org.apache.spark.sql.DataFrame
>     get_dataset: (file_name: String)org.apache.spark.sql.DataFrame

In [None]:
// Load one forum file through read_xml, cache the result for the exploratory
// cells that follow, and print the schema of the loaded frame.
val file_name = "dbfs:/datasets/student-project-01/familjeliv/familjeliv-adoption.xml"
val df = read_xml(file_name).cache()
df.printSchema()

  

>     root
>      |-- _id: string (nullable = false)
>      |-- _title: string (nullable = false)
>      |-- _url: string (nullable = false)
>      |-- thread: array (nullable = false)
>      |    |-- element: struct (containsNull = true)
>      |    |    |-- _id: string (nullable = false)
>      |    |    |-- _title: string (nullable = false)
>      |    |    |-- _url: string (nullable = false)
>      |    |    |-- text: struct (nullable = false)
>      |    |    |    |-- sentence: array (nullable = false)
>      |    |    |    |    |-- element: struct (containsNull = true)
>      |    |    |    |    |    |-- w: array (nullable = true)
>      |    |    |    |    |    |    |-- element: string (containsNull = true)
>
>     file_name: String = dbfs:/datasets/student-project-01/familjeliv/familjeliv-adoption.xml
>     df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [_id: string, _title: string ... 2 more fields]

In [None]:
// Counts the rows of the projected frame. A select does not change the number
// of rows, so this is df's row count rather than a count of distinct thread ids.
df.select("thread._id").count()

  

>     res122: Long = 44

In [None]:
display(df.select("thread._id"))

In [None]:
// NOTE(review): `tmp` is not defined in any cell visible here — presumably left
// over from an earlier notebook state; confirm before re-running.
tmp.show(1)

  

>     +--------------------+
>     |                   w|
>     +--------------------+
>     |[[Kostnader, finn...|
>     +--------------------+
>     only showing top 1 row

In [None]:
// NOTE(review): no output is recorded for this cell. flatten expects an array of
// arrays, while the schema printed earlier shows `thread` as an array of
// structs — this presumably fails with a type error; confirm.
df.select(flatten(col("thread"))).count()

In [None]:
df.select(flatten(col("thread.text.sentence.w")) as "w").count()

  

>     res102: Long = 44

In [None]:
/**
 * Reads a forum XML file with spark-xml, using "text" as the row tag.
 *
 * The schema is given explicitly rather than inferred: each row carries the
 * `_url` attribute of the text and its sentences, and within a sentence only
 * the `w` children are kept, each read as a plain string.
 *
 * @param file_name path of the XML file, handed unchanged to the reader
 * @return a DataFrame with the columns `_url` and `sentence`
 */
def read_xml(file_name: String): org.apache.spark.sql.DataFrame = {
  // Innermost level: a sentence is a list of <w> values.
  val sentence_schema = StructType(Array(
    StructField("w", ArrayType(StringType, true), nullable=true)
  ))

  // Row-level schema for a single <text> element.
  val text_schema = StructType(Array(
    StructField("_url", StringType, nullable = false),
    StructField("sentence", ArrayType(sentence_schema), nullable=false)
  ))

  spark.read
    .option("rowTag", "text")
    .schema(text_schema)
    .xml(file_name)
}

  

>     read_xml: (file_name: String)org.apache.spark.sql.DataFrame

In [None]:
val file_name = "dbfs:/datasets/student-project-01/familjeliv/familjeliv-adoption.xml"
val df = read_xml(file_name).cache()

  

>     file_name: String = dbfs:/datasets/student-project-01/familjeliv/familjeliv-adoption.xml
>     df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [_url: string, sentence: array<struct<w:array<string>>>]

In [None]:
df.printSchema()

  

>     root
>      |-- _url: string (nullable = false)
>      |-- sentence: array (nullable = false)
>      |    |-- element: struct (containsNull = true)
>      |    |    |-- w: array (nullable = true)
>      |    |    |    |-- element: string (containsNull = true)

  

>     import org.apache.spark.sql.functions.{col, concat_ws, udf, flatten, explode, collect_list, collect_set, lit}
>     import org.apache.spark.sql.types.{ArrayType, StructType, StructField, StringType, IntegerType}
>     import com.databricks.spark.xml._
>     import org.apache.spark.sql.functions._
>     read_xml: (file_name: String)org.apache.spark.sql.DataFrame
>     get_dataset: (file_name: String)org.apache.spark.sql.DataFrame

In [None]:
import org.apache.spark.sql.functions._
val df = read_xml(file_name)

  

>     import org.apache.spark.sql.functions._
>     df: org.apache.spark.sql.DataFrame = [_url: string, sentence: array<struct<w:array<string>>>]

In [None]:
df.printSchema()

  

>     root
>      |-- _url: string (nullable = false)
>      |-- sentence: array (nullable = false)
>      |    |-- element: struct (containsNull = true)
>      |    |    |-- w: array (nullable = true)
>      |    |    |    |-- element: string (containsNull = true)

In [None]:
// Derive the platform and corpus labels from the path, which is expected to
// end in <platform>/<corpus file>.
val splitted_name = file_name.split("/")
val forum = splitted_name(splitted_name.size-2)
val corpus = splitted_name(splitted_name.size-1)
// Still defined because other cells may refer to `value`; not used below.
val value = udf((arr: Seq[String]) => arr.mkString(","))

// One row per text url with all of its words joined by commas.
// `url` is the grouping key, so it is already in the result: no extra
// first("url") aggregate (which only added a stray "first(url)" column), and no
// Seq[String] UDF on it, since it is a plain string. The words are joined with
// the built-in concat_ws, which skips nulls instead of rendering them as "null".
val tmp_df = df.select(col("_url") as "url",
                       flatten(col("sentence.w")) as "w")
               .withColumn("w", explode(col("w")))
               .groupBy("url")
               .agg(collect_list("w") as "w")
               .withColumn("w", concat_ws(",", col("w")))
               .withColumn("platform", lit(forum))
               .withColumn("corpus_id", lit(corpus))

  

>     splitted_name: Array[String] = Array(dbfs:, datasets, student-project-01, familjeliv, familjeliv-adoption.xml)
>     forum: String = familjeliv
>     corpus: String = familjeliv-adoption.xml
>     value: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$8471/607595883@17f79ae6,StringType,List(Some(class[value[0]: array<string>])),None,true,true)
>     tmp_df: org.apache.spark.sql.DataFrame = [url: string, first(url): string ... 3 more fields]

In [None]:
tmp_df.show()

In [None]:
df.count()

  

>     res72: Long = 701364

In [None]:
  // Exploratory attempt at projecting forum-level columns plus the thread array.
  // NOTE(review): xml_df is read here, but the select below runs on `df`, i.e.
  // whatever an earlier cell bound that name to — presumably xml_df was
  // intended; confirm.
  val xml_df = read_xml(file_name)
  val splitted_name = file_name.split("/")
  val forum = splitted_name(splitted_name.size-2)
  val corpus = splitted_name(splitted_name.size-1)
  val value = udf((arr: Seq[String]) => arr.mkString(","))
  val tmp_df = df.select(col("_id") as "forum_id",
                         col("_title") as "forum_title",
                         flatten(col("thread")) as "thread")
                //.withColumn("thread_id", explode($"thread_id"))

In [None]:
xml_df.where($"thread._id" === "68790623").show()

  

  

>     file_name: String = dbfs:/datasets/student-project-01/familjeliv/familjeliv-adoption.xml
>     df: org.apache.spark.sql.DataFrame = [thread_id: string, thread_title: string ... 5 more fields]

In [None]:
df.printSchema()

  

>     root
>      |-- _id: string (nullable = false)
>      |-- _title: string (nullable = false)
>      |-- _url: string (nullable = false)
>      |-- thread: struct (nullable = false)
>      |    |-- _id: string (nullable = false)
>      |    |-- _title: string (nullable = false)
>      |    |-- _url: string (nullable = false)
>      |    |-- text: struct (nullable = false)
>      |    |    |-- sentence: array (nullable = false)
>      |    |    |    |-- element: struct (containsNull = true)
>      |    |    |    |    |-- w: array (nullable = true)
>      |    |    |    |    |    |-- element: string (containsNull = true)

In [None]:
// Exploratory variant of the get_dataset projection that stops after the
// explode step (the groupBy is commented out). Unlike get_dataset, flatten is
// also applied to thread._id and thread._title here.
val splitted_name = file_name.split("/")
 val forum = splitted_name(splitted_name.size-2)
 val corpus = splitted_name(splitted_name.size-1)
 val value = udf((arr: Seq[String]) => arr.mkString(","))
 val tmp_df = df.select(//col("_id") as "corpus_id",
                     col("_id") as "forum_id",
                     col("_title") as "forum_title",
                     flatten(col("thread._id")) as "thread_id",
                     flatten(col("thread._title")) as "thread_title",
                     flatten(col("thread.text.sentence.w")) as "w")
                // One element of `w` per output row.
                .withColumn("w", explode($"w"))
               //.groupBy("thread_id")

In [None]:
// Collects every `w` value of tmp_df into one array, with no grouping.
// NOTE(review): the recorded output lists a thread_id column, which an
// ungrouped agg does not produce — the output presumably comes from a run that
// still had groupBy("thread_id"); confirm.
val tmp2_df = tmp_df.agg(collect_list("w") as "w")

  

>     tmp2_df: org.apache.spark.sql.DataFrame = [thread_id: string, w: array<string>]

In [None]:
import org.apache.spark.sql.functions.{col,concat_ws, udf, flatten, explode, collect_list, collect_set, lit}
import org.apache.spark.sql.types.{ ArrayType, StructType, StructField, StringType, IntegerType }
import com.databricks.spark.xml._ // Add the DataFrame.read.xml() method

// Reads the file with "thread" as the row tag and no explicit schema, so the
// schema is inferred by the reader.
// NOTE(review): spark-xml documents rootTag as an option for writing files; it
// presumably has no effect on this read — confirm.
val df = spark.read
    .option("rootTag", "forum")
    .option("rowTag", "thread")
    .xml(file_name)//.cache()

In [None]:
val tmpdf = df.select("thread.*")

  

>     tmpdf: org.apache.spark.sql.DataFrame = [_id: string, _title: string ... 2 more fields]

In [None]:
tmpdf.count()

  

>     res31: Long = 44

In [None]:
// NOTE(review): seven column names are passed here, while tmpdf is reported
// above with four columns; toDF needs one name per existing column, so this
// presumably throws. The names also look unrelated to this dataset — likely
// pasted from an example; confirm or remove.
val df2Flatten = tmpdf.toDF("fname","mename","lname","currAddState",
    "currAddCity","prevAddState","prevAddCity")

In [None]:
val tmp_df = df.select("thread._id")

  

>     tmp_df: org.apache.spark.sql.DataFrame = [_id: string]

In [None]:
df.select(explode($"thread").as("exploded")).select("exploded.*").show

In [None]:
import org.apache.spark.sql.Column
/**
 * Produces one column expression per non-struct field of `schema`, descending
 * recursively into struct fields.
 *
 * Each column is addressed by its dotted path and aliased to that path with
 * the dots replaced by underscores. Arrays are not descended into, even when
 * their elements are structs.
 *
 * @param schema the struct type to walk
 * @param prefix dotted path of the enclosing struct, or null at the top level
 * @return the column expressions, in schema field order
 */
def flattenStructSchema(schema: StructType, prefix: String = null) : Array[Column] = {
  val parentPath = Option(prefix)
  schema.fields.flatMap { field =>
    val path = parentPath.fold(field.name)(parent => parent + "." + field.name)
    field.dataType match {
      case nested: StructType => flattenStructSchema(nested, path)
      case _ => Array(col(path).as(path.replace(".","_")))
    }
  }
}

// Lift the fields of df's struct columns to top-level columns, then print the
// resulting schema and show the rows without truncating cell contents.
val df3 = df.select(flattenStructSchema(df.schema):_*)
df3.printSchema()
df3.show(false)

  

>     root
>      |-- _id: string (nullable = false)
>      |-- _title: string (nullable = false)
>      |-- _url: string (nullable = false)
>      |-- thread__id: string (nullable = false)
>      |-- thread__title: string (nullable = false)
>      |-- thread__url: string (nullable = false)
>      |-- thread_text_sentence: array (nullable = false)
>      |    |-- element: struct (containsNull = true)
>      |    |    |-- w: array (nullable = true)
>      |    |    |    |-- element: string (containsNull = true)
>
>     +------+----------------------------+-------------------------------------+----------+----------------------------------------------------------+-------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
>     |_id   |_title                      |_url                                 |thread__id|thread__title                                             |thread__url                                                                                            |thread_text_sentence                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
>     +------+----------------------------+-------------------------------------+----------+----------------------------------------------------------+-------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
>     |13-242|Adoption > Intresserad      |http://www.familjeliv.se/forum/13/242|50052993  |Vad kostar med adoption?                                  |http://www.familjeliv.se/forum/thread/50052993-vad-kostar-med-adoption                                 |[[[Kostnader, finns, det, oavsett, hur, man, får, barnen, .]]]                                                                                                                                                                                                                                                                                                                                                                                                                              |
>     |13-242|Adoption > Intresserad      |http://www.familjeliv.se/forum/13/242|44708590  |Några frågor om adoption                                  |http://www.familjeliv.se/forum/thread/44708590-nagra-fragor-om-adoption                                |[[[Man, får, själv, välja, vilka, diagnoser, man, kan, tänka, sig, .]]]                                                                                                                                                                                                                                                                                                                                                                                                                     |
>     |13-242|Adoption > Intresserad      |http://www.familjeliv.se/forum/13/242|45589761  |extramamma/kontaktperson/familj                           |http://www.familjeliv.se/forum/thread/45589761-extramammakontaktpersonfamilj                           |[[[Jag, känner, ofta, likadant, .]]]                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
>     |13-242|Adoption > Intresserad      |http://www.familjeliv.se/forum/13/242|23672341  |Får man adoptera även om man kan få barn på naturlig väg??|http://www.familjeliv.se/forum/thread/23672341-far-man-adoptera-aven-om-man-kan-fa-barn-pa-naturlig-vag|[[[Men, är, bara, en, liten, del, av, dem, tillgängliga, för, adoption, ...]]]                                                                                                                                                                                                                                                                                                                                                                                                              |
>     |13-242|Adoption > Intresserad      |http://www.familjeliv.se/forum/13/242|11606660  |Fler i skåne som ska/vill adoptera?                       |http://www.familjeliv.se/forum/thread/11606660-fler-i-skane-som-skavill-adoptera                       |[[[Tulipanen, ,, ,, kurs, ,, medlem, i, AC, ,, inget, barn]]]                                                                                                                                                                                                                                                                                                                                                                                                                               |
>     |13-242|Adoption > Intresserad      |http://www.familjeliv.se/forum/13/242|44344822  |Nya kring adoption II                                     |http://www.familjeliv.se/forum/thread/44344822-nya-kring-adoption-ii                                   |[[[Mäh, ,, namnet, är, ju, inte, alls, svenskt, från, början, utan, förmodligen, spanskt, !]]]                                                                                                                                                                                                                                                                                                                                                                                              |
>     |13-243|Adoption > Under utredning  |http://www.familjeliv.se/forum/13/243|15886760  |Att det ska ta sån tid                                    |http://www.familjeliv.se/forum/thread/15886760-att-det-ska-ta-san-tid                                  |[[[Att, adoptera, det, är, ", jobbigt, ", ..., utredning, som, kan, ta, ,, vänta, på, medgivande, ,, fixa, alla, handlingar, ,, vänta, på, BB, ,, längre, handläggningtid, ,, oroligheter, i, landet, ,, ändrade, krav, osv, ..., det, är, inte, lätt, ..., jag, vet, !]]]                                                                                                                                                                                                                  |
>     |13-243|Adoption > Under utredning  |http://www.familjeliv.se/forum/13/243|55784777  |Hemutredning nov 2010                                     |http://www.familjeliv.se/forum/thread/55784777-hemutredning-nov-2010                                   |[[[Att, vi, är, trygga, ,, kärleksfulla, och, att, de, anser, att, vi, kan, ta, emot, ett, adoptivbarn, och, ge, detta, barn, en, bra, uppväxt, osv, .]]]                                                                                                                                                                                                                                                                                                                                   |
>     |13-243|Adoption > Under utredning  |http://www.familjeliv.se/forum/13/243|20561740  |Utredningen startar ordentligt!                           |http://www.familjeliv.se/forum/thread/20561740-utredningen-startar-ordentligt                          |[[[Hur, stort, huset, är, och, vad, det, är, värderat, till, ungefär, ,, och, hur, stor, tomten, är, .]]]                                                                                                                                                                                                                                                                                                                                                                                   |
>     |13-244|Adoption > Väntar barnbesked|http://www.familjeliv.se/forum/13/244|29117832  |Vaccinationer                                             |http://www.familjeliv.se/forum/thread/29117832-vaccinationer                                           |[[[Var, i, kina, och, vaccinerade, mej, endast, med, havrix, .]]]                                                                                                                                                                                                                                                                                                                                                                                                                           |
>     |13-244|Adoption > Väntar barnbesked|http://www.familjeliv.se/forum/13/244|32743367  |Hur många har behållt namnet som tilltalsnam...           |http://www.familjeliv.se/forum/thread/32743367-hur-manga-har-behallt-namnet-som-tilltalsnam            |[[[När, vi, adopterade, vår, son, pratade, vi, om, detta, med, namn, men, både, jag, och, maken, (, som, också, är, adopterad, och, är, nöjd, med, sitt, svenska, namn, ), kände, att, vi, ville, ge, sonen, det, namn, som, kändes, bra, och, fint, .]]]                                                                                                                                                                                                                                   |
>     |13-244|Adoption > Väntar barnbesked|http://www.familjeliv.se/forum/13/244|10884256  |Jaha.....                                                 |http://www.familjeliv.se/forum/thread/10884256-jaha                                                    |[[[hihihi, ,, jamen, vi, räknar, med, också, ,, för, av, erfarenhet, så, tar, det, ofta, allting, än, vad, man, tror, ....., Inte, ska, du, vara, avis, gumman, !]]]                                                                                                                                                                                                                                                                                                                        |
>     |13-245|Adoption > Land             |http://www.familjeliv.se/forum/13/245|21153858  |Etiopien sa nej...                                        |http://www.familjeliv.se/forum/thread/21153858-etiopien-sa-nej                                         |[[[Ta, hand, om, er, .]]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
>     |13-245|Adoption > Land             |http://www.familjeliv.se/forum/13/245|45977432  |Vill veta allt om Etiopien!                               |http://www.familjeliv.se/forum/thread/45977432-vill-veta-allt-om-etiopien                              |[[[Men, det, finns, en, lejonpark, som, barn, ska, gilla, .]]]                                                                                                                                                                                                                                                                                                                                                                                                                              |
>     |13-245|Adoption > Land             |http://www.familjeliv.se/forum/13/245|61448378  |Adoptera från Polen.                                      |http://www.familjeliv.se/forum/thread/61448378-adoptera-fran-polen                                     |[[[Ett, litet, vilt, troll, Du, kan, läsa, lite, i, min, blogg, från, tiden, i, .]]]                                                                                                                                                                                                                                                                                                                                                                                                        |
>     |13-246|Adoption > Hemma igen       |http://www.familjeliv.se/forum/13/246|27055359  |Vem utav er kommer,,,????                                 |http://www.familjeliv.se/forum/thread/27055359-vem-utav-er-kommer                                      |[[[Jag, tycker, att, det, är, viktigt, att, inte, generalisera, barnen, t.ex., genom, att, säga, att, inga, barn, under, 3, har, det, bättre, på, förskolan, än, hemma, .]]]                                                                                                                                                                                                                                                                                                                |
>     |13-246|Adoption > Hemma igen       |http://www.familjeliv.se/forum/13/246|56357664  |Svärmor förstår inte                                      |http://www.familjeliv.se/forum/thread/56357664-svarmor-forstar-inte                                    |[[[Men, det, är, väldigt, skönt, som, nybliven, förälder, att, ha, lite, stöd, också, från, äldre, släktingar, som, kan, svänga, ihop, en, köttgryta, när, orken, hos, en, tryter, eller, få, hjälp, med, dammsugningen, i, köket, ,, eller, få, några, kakor, nybakt, bröd, och, en, spann, med, äpplen, ,, eller, någon, som, kunde, passa, och, hålla, när, man, själv, behövde, gå, på, toa, eller, bara, vara, ensam, ,, när, maken, satt, på, möten, eller, var, på, tjänsteresa, .]]]|
>     |13-247|Adoption > Förälder         |http://www.familjeliv.se/forum/13/247|28577180  |Hur lång föräldraledighet?                                |http://www.familjeliv.se/forum/thread/28577180-hur-lang-foraldraledighet                               |[[[Kanske, öppna, förskolan, ,, kyrkis, ,, kladdis, eller, ngt, vore, något, ?]]]                                                                                                                                                                                                                                                                                                                                                                                                           |
>     |13-247|Adoption > Förälder         |http://www.familjeliv.se/forum/13/247|26816808  |Testamente                                                |http://www.familjeliv.se/forum/thread/26816808-testamente                                              |[[[Det, finns, ändå, inga, garantier, för, att, det, skulle, bli, så, vad, vi, förstått, ,, Nej, ,, för, i, kan, man, inte, testamentera, bort, barn, ,, däremot, kan, man, tillkännage, i, nätverket, och, även, förvara, en, handling, i, barnets, akt, hos, där, det, framgår, hur, man, vill, att, det, skall, ordnas, för, barnet, om, det, händer, onågot, .]]]                                                                                                                       |
>     |13-247|Adoption > Förälder         |http://www.familjeliv.se/forum/13/247|53847781  |Anknytning                                                |http://www.familjeliv.se/forum/thread/53847781-anknytning                                              |[[[Då, har, det, gått, sedan, vi, fick, dem, .]]]                                                                                                                                                                                                                                                                                                                                                                                                                                           |
>     +------+----------------------------+-------------------------------------+----------+----------------------------------------------------------+-------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
>     only showing top 20 rows
>
>     import org.apache.spark.sql.Column
>     flattenStructSchema: (schema: org.apache.spark.sql.types.StructType, prefix: String)Array[org.apache.spark.sql.Column]
>     df3: org.apache.spark.sql.DataFrame = [_id: string, _title: string ... 5 more fields]

In [None]:
df3.select("thread_text_sentence").show()

  

>     +--------------------+
>     |thread_text_sentence|
>     +--------------------+
>     |[[[Kostnader, fin...|
>     |[[[Man, får, själ...|
>     |[[[Jag, känner, o...|
>     |[[[Men, är, bara,...|
>     |[[[Tulipanen, ,, ...|
>     |[[[Mäh, ,, namnet...|
>     |[[[Att, adoptera,...|
>     |[[[Att, vi, är, t...|
>     |[[[Hur, stort, hu...|
>     |[[[Var, i, kina, ...|
>     |[[[När, vi, adopt...|
>     |[[[hihihi, ,, jam...|
>     |[[[Ta, hand, om, ...|
>     |[[[Men, det, finn...|
>     |[[[Ett, litet, vi...|
>     |[[[Jag, tycker, a...|
>     |[[[Men, det, är, ...|
>     |[[[Kanske, öppna,...|
>     |[[[Det, finns, än...|
>     |[[[Då, har, det, ...|
>     +--------------------+
>     only showing top 20 rows

In [None]:
tmp_df.count()

  

>     res47: Long = 44

In [None]:
tmp_df.show()

  

>     +--------+--------------------+--------------------+--------------------+
>     |     _id|              _title|                _url|                text|
>     +--------+--------------------+--------------------+--------------------+
>     |50052993|Vad kostar med ad...|http://www.familj...|[[[[Kostnader, fi...|
>     |44708590|Några frågor om a...|http://www.familj...|[[[[Man, får, sjä...|
>     |45589761|extramamma/kontak...|http://www.familj...|[[[[Jag, känner, ...|
>     |23672341|Får man adoptera ...|http://www.familj...|[[[[Men, är, bara...|
>     |11606660|Fler i skåne som ...|http://www.familj...|[[[[Tulipanen, ,,...|
>     |44344822|Nya kring adoptio...|http://www.familj...|[[[[Mäh, ,, namne...|
>     |15886760|Att det ska ta så...|http://www.familj...|[[[[Att, adoptera...|
>     |55784777|Hemutredning nov ...|http://www.familj...|[[[[Att, vi, är, ...|
>     |20561740|Utredningen start...|http://www.familj...|[[[[Hur, stort, h...|
>     |29117832|       Vaccinationer|http://www.familj...|[[[[Var, i, kina,...|
>     |32743367|Hur många har beh...|http://www.familj...|[[[[När, vi, adop...|
>     |10884256|           Jaha.....|http://www.familj...|[[[[hihihi, ,, ja...|
>     |21153858|  Etiopien sa nej...|http://www.familj...|[[[[Ta, hand, om,...|
>     |45977432|Vill veta allt om...|http://www.familj...|[[[[Men, det, fin...|
>     |61448378|Adoptera från Polen.|http://www.familj...|[[[[Ett, litet, v...|
>     |27055359|Vem utav er komme...|http://www.familj...|[[[[Jag, tycker, ...|
>     |56357664|Svärmor förstår inte|http://www.familj...|[[[[Men, det, är,...|
>     |28577180|Hur lång föräldra...|http://www.familj...|[[[[Kanske, öppna...|
>     |26816808|          Testamente|http://www.familj...|[[[[Det, finns, ä...|
>     |53847781|          Anknytning|http://www.familj...|[[[[Då, har, det,...|
>     +--------+--------------------+--------------------+--------------------+
>     only showing top 20 rows

In [None]:
import org.apache.spark.sql.functions.{col,concat_ws, udf, flatten, explode, collect_list, collect_set, lit}
import org.apache.spark.sql.types.{ ArrayType, StructType, StructField, StringType, IntegerType }
// Group the rows by _id and gather each group's `thread` values into one list.
val temp_df = df
  .groupBy(col("_id"))
  .agg(collect_list(col("thread")))

  

>     import org.apache.spark.sql.functions.{col, concat_ws, udf, flatten, explode, collect_list, collect_set, lit}
>     import org.apache.spark.sql.types.{ArrayType, StructType, StructField, StringType, IntegerType}
>     temp_df: org.apache.spark.sql.DataFrame = [_id: string, collect_list(thread): array<struct<_id:string,_title:string,_url:string,text:struct<sentence:array<struct<w:array<string>>>>>>]

In [None]:
// Render the grouped result with the notebook's display helper.
display(temp_df)

In [None]:
// NOTE(review): this cell is not valid Scala as written: `thread -> ...` builds a
// tuple rather than a lambda (Scala lambdas use `=>`), and the trailing `thread.`
// is an unfinished member access. It looks like an abandoned draft - finish it or
// remove the cell.
df.map(thread -> RowFactory.create(thread.getId(), RowFactory.create(thread.)))

In [None]:
// Show the top-level _id next to the nested thread._id, without truncating values.
df.select(col("_id"), col("thread._id"))
  .show(truncate = false)

  

>     +------+--------+
>     |_id   |_id     |
>     +------+--------+
>     |13-242|50052993|
>     |13-242|44708590|
>     |13-242|45589761|
>     |13-242|23672341|
>     |13-242|11606660|
>     |13-242|44344822|
>     |13-243|15886760|
>     |13-243|55784777|
>     |13-243|20561740|
>     |13-244|29117832|
>     |13-244|32743367|
>     |13-244|10884256|
>     |13-245|21153858|
>     |13-245|45977432|
>     |13-245|61448378|
>     |13-246|27055359|
>     |13-246|56357664|
>     |13-247|28577180|
>     |13-247|26816808|
>     |13-247|53847781|
>     +------+--------+
>     only showing top 20 rows

In [None]:

import org.apache.spark.sql.types.{ ArrayType, StructType, StructField, StringType }
import com.databricks.spark.xml._ // Add the DataFrame.read.xml() method

/**
 * Reads a forum XML file into a DataFrame using an explicit schema.
 *
 * The reader is configured with `rowTag = "forum"` and the forum schema below:
 * `_id`, `_title`, `_url` and a nested `thread` struct, which in turn holds
 * `_id`, `_title`, `_url` and `text.sentence[].w[]` (the words of each sentence
 * as strings).
 *
 * NOTE(review): the `_`-prefixed fields presumably map to XML attributes via
 * spark-xml's default attribute prefix - confirm against the data files.
 *
 * @param file_name path of the XML file to read
 * @return the parsed DataFrame (not cached)
 */
def read_xml(file_name: String): org.apache.spark.sql.DataFrame = {
  // A sentence is kept as the array of its `w` values.
  val sentence_schema = StructType(Array(
    StructField("w", ArrayType(StringType, containsNull = true), nullable = true)
  ))

  val text_schema = StructType(Array(
    StructField("sentence", ArrayType(sentence_schema), nullable = false)
  ))

  val thread_schema = StructType(Array(
    StructField("_id", StringType, nullable = false),
    StructField("_title", StringType, nullable = false),
    StructField("_url", StringType, nullable = false),
    StructField("text", text_schema, nullable = false)
  ))

  val forum_schema = StructType(Array(
    StructField("_id", StringType, nullable = false),
    StructField("_title", StringType, nullable = false),
    StructField("_url", StringType, nullable = false),
    StructField("thread", thread_schema, nullable = false)
  ))

  spark.read
    .option("rowTag", "forum")
    .schema(forum_schema)
    .xml(file_name)
}

  

>     import org.apache.spark.sql.types.{ArrayType, StructType, StructField, StringType}
>     import com.databricks.spark.xml._
>     read_xml: (file_name: String)org.apache.spark.sql.DataFrame

  

>     res86: Long = 44

  

>     df_temp: org.apache.spark.sql.DataFrame = [_id: string, _title: string ... 2 more fields]

In [None]:

/**
 * Builds a one-row-per-thread DataFrame from a forum XML file.
 *
 * The file is read with `read_xml`. Each thread's words are flattened across its
 * sentences, exploded to one row per word, and re-collected per `thread_id`.
 * Every collected array is then rendered as a single comma-separated string.
 *
 * NOTE: no explicit ordering is applied before `collect_list`, so the order of
 * the words inside `w` is whatever the aggregation yields.
 *
 * @param file_name path of the XML file; the second-to-last "/"-separated segment
 *                  becomes `platform` and the last segment becomes `corpus_id`,
 *                  so the path must contain at least two segments
 * @return DataFrame with columns thread_id, thread_title, w, forum_id,
 *         forum_title, platform, corpus_id
 */
def get_dataset(file_name: String) : org.apache.spark.sql.DataFrame = {
  val xml_df = read_xml(file_name)

  // Path layout is ".../<platform>/<corpus file>".
  val path_parts = file_name.split("/")
  val forum = path_parts(path_parts.size - 2)
  val corpus = path_parts(path_parts.size - 1)

  // Built-in concat_ws joins an array<string> column with "," and replaces the
  // former mkString UDF, so the join stays inside Spark's own expressions.
  def comma_joined(name: String) = concat_ws(",", col(name))

  xml_df
    .select(
      col("_id") as "forum_id",
      col("_title") as "forum_title",
      col("thread._id") as "thread_id",
      col("thread._title") as "thread_title",
      flatten(col("thread.text.sentence.w")) as "w")
    // One row per word, so the words can be re-aggregated per thread below.
    .withColumn("w", explode(col("w")))
    .groupBy("thread_id")
    .agg(
      collect_set("thread_title") as "thread_title",
      collect_list("w") as "w",
      collect_set("forum_id") as "forum_id",
      collect_set("forum_title") as "forum_title")
    .withColumn("w", comma_joined("w"))
    .withColumn("thread_title", comma_joined("thread_title"))
    .withColumn("forum_id", comma_joined("forum_id"))
    .withColumn("forum_title", comma_joined("forum_title"))
    .withColumn("platform", lit(forum))
    .withColumn("corpus_id", lit(corpus))
}

  

>     get_dataset: (file_name: String)org.apache.spark.sql.DataFrame

  

>     root
>      |-- thread_id: string (nullable = false)
>      |-- thread_title: string (nullable = true)
>      |-- w: string (nullable = true)
>      |-- forum_id: string (nullable = true)
>      |-- forum_title: string (nullable = true)
>      |-- platform: string (nullable = false)
>      |-- corpus_id: string (nullable = false)

In [None]:
// Load one corpus from each platform. get_dataset takes the platform name from
// the parent directory of the path and the corpus id from the file name.
val file_name = "dbfs:/datasets/student-project-01/familjeliv/familjeliv-adoption.xml"
val df = get_dataset(file_name)
val file_name2 = "dbfs:/datasets/student-project-01/flashback/flashback-droger.xml"
val df2 = get_dataset(file_name2)

  

>     file_name: String = dbfs:/datasets/student-project-01/familjeliv/familjeliv-adoption.xml
>     df: org.apache.spark.sql.DataFrame = [thread_id: string, thread_title: string ... 5 more fields]
>     file_name2: String = dbfs:/datasets/student-project-01/flashback/flashback-droger.xml
>     df2: org.apache.spark.sql.DataFrame = [thread_id: string, thread_title: string ... 5 more fields]

In [None]:
// Stack both datasets (columns are matched by position) and cache the result.
val df_merged = df
  .union(df2)
  .cache()

  

>     df_merged: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [thread_id: string, thread_title: string ... 5 more fields]

In [None]:
// Print the column names, types and nullability of the merged dataset.
df_merged.printSchema()

  

>     root
>      |-- thread_id: string (nullable = false)
>      |-- thread_title: string (nullable = true)
>      |-- w: string (nullable = true)
>      |-- forum_id: string (nullable = true)
>      |-- forum_title: string (nullable = true)
>      |-- platform: string (nullable = false)
>      |-- corpus_id: string (nullable = false)

In [None]:
// Peek at a single row of the merged dataset.
df_merged.show(numRows = 1)

  

>     +---------+--------------------+--------------------+--------+--------------------+----------+--------------------+
>     |thread_id|        thread_title|                   w|forum_id|         forum_title|  platform|           corpus_id|
>     +---------+--------------------+--------------------+--------+--------------------+----------+--------------------+
>     | 74178773|I tankar om adopt...|Så,här,i,efterhan...|  13-392|Adoption > Hjälp/Råd|familjeliv|familjeliv-adopti...|
>     +---------+--------------------+--------------------+--------+--------------------+----------+--------------------+
>     only showing top 1 row

In [None]:
// Render the merged dataset with the notebook's display helper.
display(df_merged)

  

[TABLE]

Truncated to 30 rows

In [None]:
// Select the words of the flashback threads whose forum_id equals 13.
// NOTE(review): forum_id is a string column, but it is compared with the integer
// literal 13, so the match relies on an implicit cast. Confirm whether the string
// "13" (or a different id format) was intended.
df_merged.where(col("forum_id") === 13 && col("platform") === "flashback").select("w")

  

>     res53: org.apache.spark.sql.DataFrame = [w: string]

In [None]:
// Print the column names, types and nullability of df2.
df2.printSchema()

  

>     root
>      |-- thread_id: string (nullable = false)
>      |-- thread_title: string (nullable = true)
>      |-- w: string (nullable = true)
>      |-- forum_id: string (nullable = true)
>      |-- forum_title: string (nullable = true)
>      |-- platform: string (nullable = false)
>      |-- corpus_id: string (nullable = false)

In [None]:
// Last "/"-separated segment of the path, i.e. the file name.
file_name.split("/").last

  

>     res26: String = familjeliv-adoption.xml

In [None]:
// Evaluate df so the notebook echoes its type and column summary.
df

  

>     res27: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [_id: string, forum: struct<_id: string, _title: string ... 2 more fields>]

In [None]:
// Render df with the notebook's display helper.
display(df)

  

[TABLE]

Truncated to 30 rows

In [None]:
import org.apache.spark.sql.functions.{col,concat_ws, udf, flatten, explode, collect_list, collect_set, lit}

// TODO: the collected words are not explicitly ordered. If ordering matters, also
// read the date and time attributes (datefrom, timefrom) from the XML, merge them
// into a timestamp, and sort on it before collecting.

// Renders an array of strings as one comma-separated string.
val value = udf((arr: Seq[String]) => arr.mkString(","))

val df_2 = {
  // Stage 1: pull out the needed fields and emit one row per word.
  val per_word = df
    .select(
      col("_id").as("corpus_id"),
      col("forum._id").as("forum_id"),
      col("forum._title").as("forum_title"),
      col("forum.thread._id").as("thread_id"),
      col("forum.thread._title").as("thread_title"),
      flatten(col("forum.thread.text.sentence.w")).as("sentence"))
    .withColumn("sentence", explode(col("sentence")))

  // Stage 2: gather everything back per thread.
  val per_thread = per_word
    .groupBy(col("thread_id"))
    .agg(
      collect_set(col("thread_title")).as("thread_title"),
      collect_list(col("sentence")).as("sentences"),
      collect_set(col("forum_id")).as("forum_id"),
      collect_set(col("forum_title")).as("forum_title"),
      collect_set(col("corpus_id")).as("corpus_id"))

  // Stage 3: turn each collected array into a comma-separated string, in place.
  val joined = Seq("sentences", "thread_title", "forum_id", "forum_title")
    .foldLeft(per_thread) { (acc, name) => acc.withColumn(name, value(col(name))) }

  // Stage 4: tag the platform, join corpus_id as well, and cache the result.
  joined
    .withColumn("platform", lit("familjeliv"))
    .withColumn("corpus_id", value(col("corpus_id")))
    .cache()
}

  

>     import org.apache.spark.sql.functions.{col, concat_ws, udf, flatten, explode, collect_list, collect_set, lit}
>     value: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$9249/359357668@231c7caf,StringType,List(Some(class[value[0]: array<string>])),None,true,true)
>     df_2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [thread_id: string, thread_title: string ... 5 more fields]

In [None]:
// Inspect a single thread in full, without truncating the cell values.
df_2
  .filter("thread_id = 29117832")
  .show(truncate = false)

In [None]:
// Print the column names, types and nullability of df_2.
df_2.printSchema()

  

>     root
>      |-- thread_id: string (nullable = false)
>      |-- thread_title: string (nullable = true)
>      |-- sentences: string (nullable = true)
>      |-- forum_id: string (nullable = true)
>      |-- forum_title: string (nullable = true)
>      |-- corpus_id: string (nullable = true)
>      |-- platform: string (nullable = false)

In [None]:
// Peek at a single row of df_2.
df_2.show(numRows = 1)

In [None]:
// Render df_2 with the notebook's display helper.
display(df_2)