The two cultures - Classifying threads with logistic regression
===============================================================

In [None]:
//Imports
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

  

>     import org.apache.spark.ml.feature.StopWordsRemover
>     import org.apache.spark.ml.feature.RegexTokenizer
>     import org.apache.spark.ml.classification.LogisticRegression
>     import org.apache.spark.ml.feature.CountVectorizer
>     import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

  

1. Load the data
----------------

In [None]:
/scalable-data-science/000_0-sds-3-x-projects/student-project-01_group-TheTwoCultures/01_load_data

  

  

>     import org.apache.spark.sql.functions.{col, concat_ws, udf, flatten, explode, collect_list, collect_set, lit}
>     import org.apache.spark.sql.types.{ArrayType, StructType, StructField, StringType, IntegerType}
>     import com.databricks.spark.xml._
>     import org.apache.spark.sql.functions._
>     read_xml: (file_name: String)org.apache.spark.sql.DataFrame
>     get_dataset: (file_name: String)org.apache.spark.sql.DataFrame

  

>     save_df: (df: org.apache.spark.sql.DataFrame, filePath: String)Unit
>     load_df: (filePath: String)org.apache.spark.sql.DataFrame
>     no_forums: (df: org.apache.spark.sql.DataFrame)Long

  

>     dbfs:/datasets/student-project-01/flashback/familjeliv-allmanna-ekonomi_df
>     familjeliv-allmanna-ekonomi_df already exists!
>     dbfs:/datasets/student-project-01/flashback/familjeliv-sexsamlevnad_df
>     familjeliv-sexsamlevnad_df already exists!
>     dbfs:/datasets/student-project-01/flashback/flashback-ekonomi_df
>     flashback-ekonomi_df already exists!
>     dbfs:/datasets/student-project-01/flashback/flashback-sex_df
>     flashback-sex_df already exists!
>     fl_root: String = dbfs:/datasets/student-project-01/familjeliv/
>     fb_root: String = dbfs:/datasets/student-project-01/flashback/
>     fl_data: Array[String] = Array(familjeliv-allmanna-ekonomi, familjeliv-sexsamlevnad)
>     fb_data: Array[String] = Array(flashback-ekonomi, flashback-sex)

In [None]:
// Load the stored thread DataFrames for the two forums via the load_df helper.

// Familjeliv threads
val file_path_familjeliv = "dbfs:/datasets/student-project-01/familjeliv/familjeliv-sexsamlevnad_df"
val df_familjeliv = load_df(file_path_familjeliv)

// Flashback threads
val file_path_flashback = "dbfs:/datasets/student-project-01/flashback/flashback-sex_df"
val df_flashback = load_df(file_path_flashback)

  

>     file_path_familjeliv: String = dbfs:/datasets/student-project-01/familjeliv/familjeliv-sexsamlevnad_df
>     file_path_flashback: String = dbfs:/datasets/student-project-01/flashback/flashback-sex_df
>     df_familjeliv: org.apache.spark.sql.DataFrame = [thread_id: string, thread_title: string ... 5 more fields]
>     df_flashback: org.apache.spark.sql.DataFrame = [thread_id: string, thread_title: string ... 5 more fields]

In [None]:
// Keep only the text column "w" of each forum's DataFrame.
val (df_text_flashback, df_text_familjeliv) =
  (df_flashback.select("w"), df_familjeliv.select("w"))


  

>     df_text_flashback: org.apache.spark.sql.DataFrame = [w: string]
>     df_text_familjeliv: org.apache.spark.sql.DataFrame = [w: string]

  

2. Add labels
-------------

In [None]:
// Add label columns: c = 0 for Flashback threads, c = 1 for Familjeliv threads.
val df_text_flashback_c = df_text_flashback.withColumn("c", lit(0))

// Count the Flashback threads once and reuse the value, instead of launching
// a separate Spark job each time the number is needed.
val n_flashback = df_text_flashback_c.count()

// Balance the classes: keep at most as many Familjeliv threads as there are
// Flashback threads, chosen by a random shuffle.
// The shuffle is seeded and the sample is cached. A DataFrame built from an
// unseeded orderBy(rand()) is re-evaluated by every action, so each count()
// and every downstream stage could otherwise see a different random sample.
val df_text_familjeliv_c = df_text_familjeliv
  .orderBy(rand(1337L))
  .limit(n_flashback.toInt)
  .withColumn("c", lit(1))
  .cache()

// Stack both labelled sets into one DataFrame with columns [w, c].
val df_text_full = df_text_flashback_c.union(df_text_familjeliv_c)

// Sanity check: per-class sizes and the total.
println(n_flashback)
println(df_text_familjeliv_c.count())
println(df_text_full.count())

  

>     56621
>     56621
>     113242
>     df_text_flashback_c: org.apache.spark.sql.DataFrame = [w: string, c: int]
>     df_text_familjeliv_c: org.apache.spark.sql.DataFrame = [w: string, c: int]
>     df_text_full: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [w: string, c: int]

  

3. Extract single words
-----------------------

In [None]:
// Turn the text column "w" of every thread into an array of tokens.
// The pattern matches a comma and the tokenizer runs with gaps=true (see the
// printed tokenizer settings), so the text is split on commas, not whitespace.
val tokenizer = new RegexTokenizer()
  .setInputCol("w") // column holding the raw text
  .setOutputCol("text") // column receiving the token array
  .setPattern("(?U),") // separator: a comma
  .setMinTokenLength(5) // drop tokens shorter than 5 characters

// Keep only the label and the tokens.
val tokenized_df = tokenizer
  .transform(df_text_full)
  .select("c", "text")

// Show the first three rows without truncating the token arrays.
tokenized_df.show(3, false)

  

>     +---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------+
>     |c  |text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
                                                              |
>     +---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------+
>     |0  |[denna, verklighet, överlag, jävligt, sunkig, flesta, regioner, umgås, präktiga, radikalfeminister, svart, något, problem, tvåan, aldrig, stött, begåvad, autofokus, sunkigt, amatörmässigt, perfabild, visar, pratar, självklart, hittar, aldrig, varit, hemma, sådär, nästan, kameran, många, amatörbilder, också, studenter, jävla, flesta, studenterna, tyvärr, brudarna, undantag, medges, whats, kvistar, klagar, färgen, behöver, sunkig, eller, stenad, tvåan, själv, tvekat, sekund, trycka, foton, tagna, någon, slags, semiprofessionell, miljö, plockar, undan, distrahera, inget, konstigt, balkanbrud, ligger, kulle, naken, bakgrunden, hennes, sönderknullade, gjorde, sunkigare, desto, större, chans, vettu, borta, troligt, större, chans, hittade, inget, sunkigt, sunkiga, lägenheter, u-landsatmosfärer, själv, alltid, psykiskt, dåligt, varje, liknande, bilder, någon, anledning, ....., gjorde, uppstötningar, tjejer, brains, kinky, geomsnittet, märkt, duktigt, rumpan, hoppa, sunkigt, skara, dagens, amatörfynd, hitta, baltikum-brudar, tänkt, merparten, amatörbilder, tagna, miljöer, påminner, knarkarkvartar, håller, myntade, uttrycket, intelligens, sexigt, visste, pratade, sunkigt, kaklet, runkade, trodde, knarkarkvart, ligger, ryggsäck, golvet, eller, sängen, obäddad, pornografi, skickar, kunde, varit, tagna, studentkorridor, allihop, vattnet, verkar, sunkigt, liite, galning, kolla, färgen, vattnet, inget, bilden, liksom, möjligtvis, håriga, ölmagen, stödjer, annars, enkelt, porrskadade, tycker, dåligt, kolla, denna, hellre, sådana, bilder, amerikansk, proffsporr, beror, förklaring, sinnessvaga, individer, lägger, bilder, själv, nätet, dessa, saknar, utbildning, pengar, möjlighet, annat, sunkiga, lägenheter, någonsin, knaarkarkvart, ingen, fylla, kaklet, stenad, minsann, fenomenet, kallas, verklighet, sorts, badolja, sådär, brukar, äldre, bostads, eller, hyresrätt, exempel]                                                                                                      
                                                              |
>     |0  |[också, letat, efter, bilder, lilla, stycket, verkar, finnas, mycket, hoppas, komma, medlem, någon, hitta, bilder, vackra, varelsen, längst, sidan, skäggbiff, iallafall, boobs]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
                                                              |
>     |0  |[finns, troll, skogen, kommer, hamtar, stygga, ljuger, hatar, övriiiigt, lille, raggot, pratar, south, huvet, detta, operationsköerna, långa, möjligt, pressa, hamster, röven, sönder, hamster, tännka, gerbil, stoppat, hamster, prutten, redan, sagts, southpark, bögar, nämligen, fullständigt, analfixerade, killen, röret, däggdjur, arslet, råkade, vilket, genast, ledde, gerbilen, rostad, tillsammans, andre, killens, ansikte, bekvämt, tarmen, stackars, korvryttaren, diskutera, djursex, andra, trådar, exalt, bögar, operera, fastnat, alltså, ökenråttor, hamstar, bögar, stoppar, hamstrar, röven, finns, någon, stoppa, gerbiler, analen, skönt, indikerar, detta, rätta, klart, annan, sexuell, läggning, homosexualitet, historierna, detta, längre, tillbaka, starten, mongo, tycker, låter, sjukt, faktist, klättrade, visst, vägrade, komma, veterligen, mycket, runka, bulle, fastna, kuken, dammsugarröret, gällde, stimulans, finns, hundratals, bättre, finns, säkert, flertalet, inspiration, faktiskt, hamster, prutten, varför, detta, skulle, indikera, homosexualitet, undgår, operera, röven, vanligt, höftledsbyten, snart, börjar, operera, begagnade, höftben, röven, pederastläkarna, också, verkar, ganska, huvudet, finns, filmklipp, bilder, myter, herregud, andra, fjollan, tände, tändsticka, framför, mynningen, röret, lågan, skulle, locka, lille, rackaren, aldrig, talas, bögar, hörde, historia, bögar, roade, trycka, diverse, storlekar, varandras, bajsbottnar, förmodan, enstaka, idiot, gjort, detta, knappast, något, bögar, grupp, regelbundet, sedan, kläckte, alldeles, särdeles, smart, hämtade, deras, gerbil, klättra, röret, medans, stack, sketan, andre, intressanta, speciellt, veganbögarna, storkonsumenter, analhamster, dessutom, faktiskt, tända, andra, samma, avsnitt, favoritkaraktär, catatafish, fungerade, dåligt, äckligt, rekomenderas, detta, kanske, detta, stoppade, råtta, fittan, toppen, svansen, stack, stoppar, frysen, först, hamnar, dvala, trycker, röven, kviknar, 
hamstern, bögen, orgasm, kunna, pressa, hamster, trongt, litet, rövhål]|
>     +---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------+
>     only showing top 3 rows
>
>     tokenizer: org.apache.spark.ml.feature.RegexTokenizer = RegexTokenizer: uid=regexTok_a57eaee3101f, minTokenLength=5, gaps=true, pattern=(?U),, toLowercase=true
>     tokenized_df: org.apache.spark.sql.DataFrame = [c: int, text: array<string>]

  

4. Remove stopwords
-------------------

In [None]:
// Stop-word remover (similar to the LDA notebook).

// Project stop-word file, one entry per line.
val stoppord = sc.textFile("dbfs:/datasets/student-project-01/stoppord.csv").collect()

// Combined stop-word list: hand-picked words, the project file, and Spark's
// built-in Swedish list.
//  - The first line of stoppord.csv appears to be the CSV header "word"
//    (it precedes the alphabetically sorted entries in the printed array),
//    so it is dropped here. NOTE(review): confirm against the file.
//  - The three sources overlap heavily and Array concatenation keeps
//    duplicates, so the result is de-duplicated (first occurrence kept).
val stopwordList = (
  Array("bara","lite","finns","vill","samt","inga","även","finns","ganska","också","igen","just","that","with","http","jpg",  "kanske","tycker","gillar","bra","000","måste","tjej","tjejer","tjejen","tjejerna","kvinna","kvinnor","kille","killar","killen","män","rätt","män","com","and","html","många","aldrig","www","mpg","avi","wmv","riktigt","känner","väldigt","font","size","mms","2008","2009", "flashback", "familjeliv")
    ++ stoppord.filterNot(_ == "word")
    ++ StopWordsRemover.loadDefaultStopWords("swedish")
).distinct

// Removes the stop words from the token column "text" and writes the
// remaining tokens to "filtered".
val remover = new StopWordsRemover()
  .setStopWords(stopwordList)
  .setInputCol("text")
  .setOutputCol("filtered")


  

>     stoppord: Array[String] = Array(word, all, alla, allas, allt, alltså, andra, andras, annan, annat, artonde, artonn, att, av, bakom, bara, bland, blev, bli, blir, blivit, båda, bådas, både, dag, dagar, dagarna, dagen, de, del, delen, dem, den, denna, deras, dess, dessa, det, detta, dig, din, dina, dit, ditt, dock, dom, du, där, därför, då, efter, eftersom, ej, elfte, eller, elva, en, er, era, ert, ett, fall, fanns, fast, fem, femte, femtio, femtionde, femton, femtonde, fick, fin, finnas, finns, fjorton, fjortonde, fjärde, fler, flera, flesta, fram, framför, från, fyra, fyrtio, fyrtionde, få, får, fått, för, före, förra, första, ge, genom, ger, gick, gjorde, gjort, gälla, gäller, gällt, gärna, gå, gång, går, gått, gör, göra, ha, hade, haft, han, hans, har, hela, heller, hellre, helst, helt, henne, hennes, heter, hit, hjälp, hon, honom, hundra, hundraen, hundraett, hur, här, i, ibland, icke, igen, in, inför, inga, ingen, inget, innan, inne, inom, inte, inuti, ja, jag, ju, jämfört, kan, kom, komma, kommer, kommit, kunde, kunna, kunnat, kvar, kör, legat, ligga, ligger, lägga, man, med, mellan, men, menar, mer, mera, mig, min, mina, mitt, mittemot, mot, ned, nederst, nedersta, nedre, nej, ner, ni, nio, nionde, nittio, nittionde, nitton, nittonde, nog, noll, nr, nu, nummer, nya, när, nästa, någon, någonting, något, några, nån, nåt, och, också, om, oss, på, rakt, redan, sade, sagt, samt, sedan, sen, ser, sex, sextio, sextionde, sexton, sextonde, sig, sin, sina, sist, sista, siste, sitt, sitta, sju, sjunde, sjuttio, sjuttionde, sjutton, sjuttonde, själv, sjätte, ska, skall, skriver, skulle, slutligen, snart, som, står, säga, säger, sätt, så, sådan, sådana, sådant, ta, tack, tar, till, tills, tio, tionde, tjugo, tjugoen, tjugoett, tjugonde, tjugotre, tjugotvå, tjungo, tog, tolfte, tolv, tre, tredje, trettio, trettionde, tretton, trettonde, tro, tror, två, tvåhundra, under, upp, ur, ut, utan, utanför, ute, vad, var, vara, varför, varifrån, varit, varje, varken, vars, 
varsågod, vart, vem, vems, verkligen, vet, vi, vid, vidare, vilka, vilkas, vilken, vilket, vill, visst, väl, värre, vår, våra, vårt, än, ändå, ännu, är, även, året, åt, åtminstone, åtta, åttio, åttionde, åttonde, över, övre)
>     stopwordList: Array[String] = Array(bara, lite, finns, vill, samt, inga, även, finns, ganska, också, igen, just, that, with, http, jpg, kanske, tycker, gillar, bra, 000, måste, tjej, tjejer, tjejen, tjejerna, kvinna, kvinnor, kille, killar, killen, män, rätt, män, com, and, html, många, aldrig, www, mpg, avi, wmv, riktigt, känner, väldigt, font, size, mms, 2008, 2009, flashback, familjeliv, word, all, alla, allas, allt, alltså, andra, andras, annan, annat, artonde, artonn, att, av, bakom, bara, bland, blev, bli, blir, blivit, båda, bådas, både, dag, dagar, dagarna, dagen, de, del, delen, dem, den, denna, deras, dess, dessa, det, detta, dig, din, dina, dit, ditt, dock, dom, du, där, därför, då, efter, eftersom, ej, elfte, eller, elva, en, er, era, ert, ett, fall, fanns, fast, fem, femte, femtio, femtionde, femton, femtonde, fick, fin, finnas, finns, fjorton, fjortonde, fjärde, fler, flera, flesta, fram, framför, från, fyra, fyrtio, fyrtionde, få, får, fått, för, före, förra, första, ge, genom, ger, gick, gjorde, gjort, gälla, gäller, gällt, gärna, gå, gång, går, gått, gör, göra, ha, hade, haft, han, hans, har, hela, heller, hellre, helst, helt, henne, hennes, heter, hit, hjälp, hon, honom, hundra, hundraen, hundraett, hur, här, i, ibland, icke, igen, in, inför, inga, ingen, inget, innan, inne, inom, inte, inuti, ja, jag, ju, jämfört, kan, kom, komma, kommer, kommit, kunde, kunna, kunnat, kvar, kör, legat, ligga, ligger, lägga, man, med, mellan, men, menar, mer, mera, mig, min, mina, mitt, mittemot, mot, ned, nederst, nedersta, nedre, nej, ner, ni, nio, nionde, nittio, nittionde, nitton, nittonde, nog, noll, nr, nu, nummer, nya, när, nästa, någon, någonting, något, några, nån, nåt, och, också, om, oss, på, rakt, redan, sade, sagt, samt, sedan, sen, ser, sex, sextio, sextionde, sexton, sextonde, sig, sin, sina, sist, sista, siste, sitt, sitta, sju, sjunde, sjuttio, sjuttionde, sjutton, sjuttonde, själv, sjätte, ska, skall, skriver, skulle, slutligen, snart, som, 
står, säga, säger, sätt, så, sådan, sådana, sådant, ta, tack, tar, till, tills, tio, tionde, tjugo, tjugoen, tjugoett, tjugonde, tjugotre, tjugotvå, tjungo, tog, tolfte, tolv, tre, tredje, trettio, trettionde, tretton, trettonde, tro, tror, två, tvåhundra, under, upp, ur, ut, utan, utanför, ute, vad, var, vara, varför, varifrån, varit, varje, varken, vars, varsågod, vart, vem, vems, verkligen, vet, vi, vid, vidare, vilka, vilkas, vilken, vilket, vill, visst, väl, värre, vår, våra, vårt, än, ändå, ännu, är, även, året, åt, åtminstone, åtta, åttio, åttionde, åttonde, över, övre, och, det, att, i, en, jag, hon, som, han, på, den, med, var, sig, för, så, till, är, men, ett, om, hade, de, av, icke, mig, du, henne, då, sin, nu, har, inte, hans, honom, skulle, hennes, där, min, man, ej, vid, kunde, något, från, ut, när, efter, upp, vi, dem, vara, vad, över, än, dig, kan, sina, här, ha, mot, alla, under, någon, eller, allt, mycket, sedan, ju, denna, själv, detta, åt, utan, varit, hur, ingen, mitt, ni, bli, blev, oss, din, dessa, några, deras, blir, mina, samma, vilken, er, sådan, vår, blivit, dess, inom, mellan, sådant, varför, varje, vilka, ditt, vem, vilket, sitta, sådana, vart, dina, vars, vårt, våra, ert, era, vilkas)
>     remover: org.apache.spark.ml.feature.StopWordsRemover = StopWordsRemover: uid=stopWords_ed5b70a6f000, numStopWords=497, locale=en, caseSensitive=false

In [None]:
// Apply the stop-word filter and keep only the label and the filtered tokens.
val removed_df = remover
  .transform(tokenized_df)
  .select("c", "filtered")

  

>     removed_df: org.apache.spark.sql.DataFrame = [c: int, filtered: array<string>]

  

5. Count words and create vocabulary vector
-------------------------------------------

In [None]:
// Vocabulary fitted without an explicit size limit: every word that occurs in
// at least 5 threads is a candidate.
val vectorizerall = new CountVectorizer()
   .setMinDF(5) // a word must appear in at least 5 threads
   .setInputCol("filtered")
   .setOutputCol("features")
   .fit(removed_df) // returns a CountVectorizerModel

// Same settings, but the vocabulary is capped at 1000 words. This is the model
// used to build the feature vectors.
val vectorizer = new CountVectorizer()
   .setMinDF(5) // a word must appear in at least 5 threads
   .setVocabSize(1000) // size of the dictionary
   .setInputCol("filtered")
   .setOutputCol("features")
   .fit(removed_df) // returns a CountVectorizerModel


  

>     vectorizerall: org.apache.spark.ml.feature.CountVectorizerModel = CountVectorizerModel: uid=cntVec_e9a01f8ad0fe, vocabularySize=129204
>     vectorizer: org.apache.spark.ml.feature.CountVectorizerModel = CountVectorizerModel: uid=cntVec_9d51ff227f19, vocabularySize=1000

In [None]:
// Inspect the words kept in the capped vocabulary; position i in this array is
// feature index i in the vectors produced by `vectorizer`.
vectorizer.vocabulary

  

>     res9: Array[String] = Array(alltid, känns, bättre, faktiskt, verkar, problem, behöver, skrev, olika, precis, svårt, fråga, håller, låter, saker, förhållande, brukar, förstår, tänker, varandra, älskar, borde, skönt, partner, tillsammans, tänka, vissa, länge, vilja, ville, klart, känna, sambo, enkelt, annars, egentligen, människor, prata, hålla, tråden, istället, mindre, direkt, hoppas, knulla, längre, anonym, säkert, jävla, följande, liten, dessutom, tänder, handlar, nästan, hitta, börjar, själva, absolut, stora, orgasm, liksom, däremot, bästa, större, sitter, sängen, tyvärr, fortfarande, funkar, person, tillbaka, otrogen, dåligt, tidigare, händer, börja, massa, livet, vanligt, inlägg, bröst, kolla, hemma, mannen, först, långt, berätta, undrar, försöka, kropp, beror, testa, började, oftast, kuken, bilder, frågan, försöker, trots, speciellt, känslor, självklart, sluta, liknande, enligt, konstigt, gravid, tycka, sidan, tyckte, relation, ifrån, början, tänkte, problemet, gånger, spelar, kvinnan, sexet, trodde, lycka, förstå, ....., gången, analsex, otrohet, pratar, jobbigt, äldre, anser, sjukt, snygg, använda, kondom, vänner, oavsett, sätta, sexuella, pengar, fitta, flickvän, visar, sexuellt, igång, skriva, skillnad, lever, träffa, kroppen, intresserad, kände, sexliv, stort, intressant, använder, ungefär, fattar, titta, prova, kärlek, erfarenhet, hittar, kompis, fungerar, normalt, väljer, munnen, runka, snarare, personen, svara, extra, jävligt, otroligt, träffat, enbart, skull, slicka, tagit, tanken, svenska, oralsex, efteråt, särskilt, människa, pojkvän, viktigt, ingenting, exakt, samlag, yngre, personer, knullar, knappast, tanke, betyder, säker, anledning, övrigt, välja, lättare, medan, sugen, barnen, tänkt, endast, såklart, lägger, vågar, visste, tydligen, ålder, tittar, saken, vänta, ollon, mamma, snabbt, fortsätta, möjligt, behov, kontakt, föredrar, behöva, minst, frågar, dålig, ensam, gammal, jobbet, äckligt, lämna, givetvis, filmer, fittan, träffar, 
igenom, lilla, snygga, hjälpa, skaffa, vanliga, huvudet, försök, sällan, njuta, antar, orkar, kollar, förklara, stämmer, personligen, sätter, suger, världen, söker, ifall, krogen, förhållandet, skrivit, förmodligen, situation, ställa, roligt, exempel, jobbar, alldeles, antagligen, trevligt, träffade, nätet, oskuld, njutning, antingen, brudar, tankar, sexuell, gamla, barnet, nyfiken, testat, frågade, tråkigt, knappt, njuter, extremt, läser, ansiktet, runkar, riktig, kändes, underbart, trött, klarar, fantasier, upphetsande, börjat, sexigt, räcker, senare, hände, förhållanden, totalt, iallafall, ordentligt, ställer, fullt, förut, vägen, ledsen, pratat, tiden, känslan, passar, sättet, kalla, vanlig, förutom, minns, hända, tvärtom, försökt, känsla, innebär, någonsin, plötsligt, pappa, tillräckligt, önskar, naturligt, filmen, röven, aning, sperma, chans, benen, sådär, annorlunda, risken, jobba, penis, värsta, hahaha, härligt, kring, sexualitet, p-piller, föräldrar, mesta, spännande, frågor, hittade, resten, massor, naturligtvis, magen, kompisar, bilden, trådar, saknar, handen, ärligt, provat, trevlig, luktar, betala, erfarenheter, hjälper, helvete, situationen, sexlust, dildo, ansvar, läste, naken, intresse, hittat, acceptera, köper, sambon, utseende, fundera, snacka, lusten, glidmedel, seriöst, medans, normal, slipper, gilla, förstås, litet, stånd, märker, största, onanerar, delar, kukar, familj, perfekt, betydligt, brösten, lyckas, långa, slutade, velat, sexig, kännas, snälla, slags, fantasi, beroende, kläder, ovanligt, såhär, internet, åsikter, princip, funderar, jätte, definitivt, forum, självförtroende, bakifrån, slutar, t.ex., försökte, sidor, hänger, öppet, polare, menade, upphetsad, rumpan, svarar, beteende, partners, killarna, männen, onanera, svaret, plats, talar, åsikt, verkligheten, klitoris, ärlig, tända, borta, antal, lugnt, umgås, prostitution, väntar, avtändande, berättar, nuvarande, vuxna, ämnet, träffas, respekt, svårare, riktiga, bilderna, rumpa, 
älska, släppa, alternativ, dotter, relationer, klara, fingrar, överhuvudtaget, pinsamt, gifta, ......, diskutera, orolig, trekant, minsta, berättade, grund, närhet, funka, öppna, fallet, pratade, anledningen, svensk, skiter, satte, spruta, osäker, någonstans, orgasmer, fortsätter, förspel, kvinnorna, spela, grejen, omöjligt, skicka, uppenbarligen, finner, sexlivet, öppen, porrfilm, slickar, visade, stället, prostituerade, allra, ställe, sover, sälja, generellt, inser, smeka, stoppa, omkring, kallar, grymt, slappna, skillnaden, oerhört, våldtäkt, upplever, rejält, talat, ryggen, skönare, vacker, homosexuella, håret, ändra, partnern, samhället, tackar, snackar, likadant, tvungen, lagom, initiativ, åldern, ställen, storlek, gillade, avsugning, chansen, stark, hatar, dåliga, krävs, sämre, sexiga, huvud, tydligt, gratis, därmed, fysiskt, knullat, onani, beskriver, förresten, farligt, trist, snyggt, bestämma, lyckats, gränsen, relationen, duschen, ringa, uppskattar, faktum, allvar, underbar, hållet, vatten, ansikte, slippa, trycka, månad, snäll, räknas, världens, rakar, himla, manliga, vuxen, mysigt, bögar, leder, ögonen, träna, ordet, hemskt, närheten, datorn, kvinnliga, förslag, kräver, rätta, horor, träffades, förstått, säljer, sanningen, nämligen, fungera, mensen, otrogna, fruktansvärt, älskare, högre, skada, utlösning, porren, trosor, klipp, behövs, desto, lider, normala, trivs, funderat, fantiserar, ragga, märkt, oftare, svarade, främst, allmänt, vackra, lycklig, erkänna, stanna, läkare, soffan, fatta, ursäkt, andre, undra, kallas, råkar, storleken, piller, längden, ångrar, överens, varann, dominant, tillägga, finger, svart, smakar, bestämmer, frågat, värld, senaste, bestämt, månader, analt, bruden, ställningar, försvinner, betalar, faller, medel, taget, rolig, hamnar, jäkla, löser, ljuger, kostar, flickor, upplevt, tacksam, räkna, nakna, bekräftelse, starkt, slutat, attraktiv, lämnar, herregud, uppleva, skolan, dagens, skäms, påstå, dörren, sorts, kollade, 
positivt, lyssna, forumet, oskulden, stackars, undan, flytta, använt, reagerar, ollonet, knull, överallt, typen, medveten, enormt, troligen, alkohol, jättebra, inlägget, kunder, halsen, angående, fantastiskt, påstår, visserligen, djupt, körde, smeker, personlighet, konstig, undvika, ställning, obehagligt, samvete, läggning, relativt, passa, hormoner, smärta, garanterat, kroppar, älskade, bredvid, massage, berättat, knullade, mening, allvarligt, satsen, äcklig, utifrån, tvätta, missionären, letar, ångest, beslut, fullständigt, snopp, tidigt, hårdare, historia, uppmärksamhet, exempelvis, diverse, seriös, följa, sprutar, negativt, flicka, könet, grattis, tvinga, betalt, länder, skiten, övergrepp, använd, sorry, valde, förbi, rummet, märkligt, upplevelse, sjuka, hinner, akten, fingrarna, lovar, diskussion, kommentarer, gångerna, besviken, förtjänar, tunga, attraherad, påverkar, tillfälle, därifrån, fantisera, smaken, olagligt, argument, kollat, bryta, läget, hänga, förhoppningsvis, pungen, betydelse, möjlighet, trycker, smala, underlivet, graviditet, förstod, förvånad, detsamma, viktigaste, tillfället, glömma, tyckt, fötter, funkade, hantera, nervös, inställning, skämt, skylla, orgasmen, ursäkta, uppfattning, svarta, tjock, attraktion, svartsjuk, misstag, uppenbart, meningen, psykiskt, beter, självkänsla, verkade, duger, framförallt, jämföra, tungan, människan, smisk, starka, majoriteten, intresserade, isåfall, roliga, samband, spiral, ringer, sönder, idiot, lösning, ljuga, lagen, vafan, allting, svamp, naturliga, viagra, muskler, länken, låtsas, närmare, kondomer, pengarna, köpte, försiktigt, dricka, troligtvis, förrän, smart, stolt, förlåt, slapp, samhälle, finna, stycken, händerna, homosexualitet, korta, slutet, slidan, förlossningen, skratta, hårda, manlig, omkrets, skydd, polisen, talas, analen, skiljer, handla, fittor, litar, privat, vidrigt, resultat, skapa, lyckades, anses, besök, gissa, damer, numera, varenda, tecken, utmärkt, äckliga, uppskatta, följer, 
högst, styra, visat, vecka, anledningar, kärleken, kläderna, behövde, rädda, härlig, poäng, vaknar, länkar, porrfilmer, skrattar, tillhör, viktig, syster, gränser, sexlusten, skriv, sårad, undantag, starta, former, sköna, skämmas, kvinnlig, denne, trevliga, tillstånd, hörde, därefter, snubbe, kvinnans, ytterligare, maken, underkläder, dumpa, äktenskap, tappar, snoppen, bjuda, snyggare, kvinnors, ringde, vänder, tittade, svälja, möjligen, galen, situationer, förlora, underbara, slickad, kopparspiral, beredd, pröva, vända, tjänster, äntligen, bevis, tråkig, vägrar, kramar, vårat, övriga, rekommendera, bodde, lagligt, känslig, handlade, märkte, mannens, perioder, längtar, topic, punkt, längd, släpper, trång, stannar, allmänhet, hjärnan, prostituerad, bilen)

In [None]:
// Turn the "filtered" column of every row into a "features" vector with the fitted
// vectorizer, keeping the label column "c". The result is cached because later cells reuse it.
val tf = vectorizer
  .transform(removed_df.select("c", "filtered"))
  .select("c", "features")
  .cache()

// Show a single (label, feature vector) row as an example of the representation
tf.take(1).foreach(row => println(row))


  

>     [0,(1000,[0,4,5,6,12,16,33,34,48,53,56,60,64,66,68,73,83,84,91,100,101,105,107,119,123,125,127,141,143,163,171,201,205,210,212,261,273,302,325,338,341,348,361,367,383,414,424,453,454,491,571,621,632,635,667,684,693,701,829,849,933,981],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])]
>     tf: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [c: int, features: vector]

  

6. Split data into training and test data
-----------------------------------------

In [None]:
//Train test split
// Shuffle with a fixed seed. An unseeded rand() picks a new seed each time the cell is
// evaluated, so the 80/20 split below would differ between runs even though randomSplit
// itself is seeded.
val random_order = tf.orderBy(rand(1337))
// 80% of the rows go to training, 20% to test
val splits = random_order.randomSplit(Array(0.8, 0.2), seed = 1337)
val training = splits(0)
val test = splits(1)
// Report the size of each split
println(training.count())
println(test.count())

  

>     90818
>     22424
>     random_order: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [c: int, features: vector]
>     splits: Array[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = Array([c: int, features: vector], [c: int, features: vector])
>     training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [c: int, features: vector]
>     test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [c: int, features: vector]

In [None]:
// Render the training split in the notebook. `display` is not defined in this file —
// presumably the Databricks notebook built-in; confirm.
display(training)

  

7. Logistic Regression model
----------------------------

$$P(y = 1) = \frac{1}{1+\exp(-\beta X^T)}$$

$$ X = [1, x_1, \dots, x_m], \quad \beta = [\beta_0, \beta_1, \dots, \beta_m] $$

where $x_i$ is the number of occurrences of word $i$ in a thread and $m = 1000$ is the vocabulary size.

In [None]:
// Logistic regression classifier: the label is read from column "c", at most 100
// iterations, regularisation parameter 0.0001 and elastic-net parameter 0.5.
val lr = new LogisticRegression()
  .setElasticNetParam(0.5)
  .setRegParam(0.0001)
  .setMaxIter(100)
  .setLabelCol("c")

// Train on the training split
val lrModel = lr.fit(training)

// Report the learned coefficient vector together with the intercept
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")



  

>     Coefficients: [0.01626027783272468,0.06411014289793752,-0.010413301058514585,0.004165405713075699,-0.07048964448025209,-0.015901350185208666,0.06813746654503842,0.26650961492009356,0.15504856857684013,0.07739134105185001,0.06351035757395464,0.011902445003324573,0.030120031962926164,-0.01865779447165648,-0.027030015999884603,-0.027137298959300872,5.562429679097139E-4,0.15874836723661231,0.051873296733211824,0.02348726764012828,0.1383957235351224,-0.09893886139104018,0.09136980465606942,0.062430029022868254,0.016807300294024868,-0.06100652986528558,-0.026273305678219398,0.029908800975990756,0.19676278236503597,0.07517788683873178,-0.004501661507256157,0.013160360392576233,0.8432880748567715,-0.20288468036409685,-0.07956037125704593,-0.015999796643844093,0.0769894597431673,0.11714076637995219,-0.05371190427432788,-0.12276501892007864,-0.07826495374901783,-0.037215652350536046,-0.05935877598983228,0.15026548376424254,-0.28347282937751045,0.0387769774130537,1.8647655096872564,0.03876333483746266,-0.7928610765429637,0.36555336550915946,-0.03133449519145743,-0.08019984170059336,-0.00896455072556092,-0.005879669080580511,0.005179474402830033,0.036996248540727736,-0.06955278113756864,-0.01079646656376082,0.3621771915730479,-2.743177359364513E-4,0.007851035585035351,0.3193388597673908,-3.291254802312856E-4,-0.10651162881393614,0.021750061792086894,0.010583293962789278,0.008927052071881512,0.13962669309306233,-0.10365205599551441,0.01477326092304517,-0.0037852582431252874,0.06987632727304785,0.15567565800907351,0.07522761177588182,-0.022929448465753417,-0.01758505810163322,-0.08491254579646909,0.010724517052044951,0.055545424076854165,0.010554595187646043,-0.2172502866680673,-0.029686601973574676,-0.025807131269348735,0.1819984360128851,0.2614267309174949,0.0276356795602747,0.0642674473327141,0.08203040586815029,0.14650933277665054,-0.05267330648524461,-0.052651125083219695,-0.01107591023230105,-2.618956771705393E-4,0.03262492131083826,0.016382346771861914,-0.25919680726
510386,-0.1667155526051686,0.03426721681180728,-0.0020506708441145475,0.03060705429730592,-0.21057552655388986,-0.06358314615917321,-0.043805909669548795,-0.00439044334193323,-0.2604662502002124,-0.127929366392986,0.07024846452551202,0.448910057762284,0.20757941177534006,-0.225274275458969,0.011309932753357373,0.17332355177317338,-0.09780962507285408,0.002088679919267377,0.014196102817321761,-0.09846052924420393,-0.02555889133393159,0.0012542415118905303,-0.06220949182101123,0.013841489083358437,0.2329974674398397,0.29776584771137327,0.013191527731083234,0.353566105644652,-0.009541096201994104,0.09145370263401817,0.0850074279002022,0.09386540460070794,0.07193130478426633,0.017239314532745756,0.03167316796389211,-0.08900204772926781,-0.044217502873343056,0.07388769971672034,-0.017874458812278483,0.1474274518845184,0.38110858027201433,-0.10484494758013338,-0.02889226983655386,-0.04633017778560804,-0.008347394058690854,-0.5661025855725924,0.09747289479196403,-0.05599508041222996,0.023834353855159224,0.2890846993488378,0.1068786035436726,0.24833977408008145,0.02729167145228841,0.054490417111656426,0.0638635398648012,0.07441263232047084,0.2647662253787961,-0.016860807747511676,-0.10546327561506977,0.15980837913670826,-0.0617498407517082,0.3118663798792662,0.12607763840021757,0.033981520474951316,-0.04462105777583803,-0.11825210210268543,-0.11485049051795505,0.034870744720146146,-0.07716123908734228,0.0420465237423634,0.32425409335347455,0.12206344590280016,-0.20122657114610004,-0.0638300884839588,7.066510881861271E-4,0.017417311201314175,-0.02560443503828563,-1.1857533047141897,0.009027057224591689,0.007373128657035218,0.03574464855666366,-0.04271250639822131,-0.009471908794280628,0.10768080266900427,-0.09232084099617777,-0.14014855431920417,0.10283787512480665,0.10266499923983503,-0.20257856779125089,0.058348900985323846,-0.014318606078456779,0.0019124481705906494,0.020144093783032457,0.006963523977684105,0.10127369897249715,0.07156477612470849,-0.03327636960580424,-0.2
505061562978773,-0.19367351956061682,-0.08311696029197341,-0.0103638716936667,-0.033441691701607566,-0.20862508567538787,-0.16545281262838324,0.22207008689065885,-0.006606798955840327,-0.06861536590380733,0.2516364746624743,0.3294375359524523,0.07509054728011562,-0.08456501917722906,0.020723638285420613,0.1296422392580621,0.1897488468917356,0.12815113599867287,-0.07857516799662263,0.05108106611177463,0.20191122327108474,-0.27452357077967043,0.15697332568458577,-0.3971962412963754,0.005683245361457271,0.05954645307325444,0.06037483077203268,-0.18944665217289836,0.03664525339429512,0.06163685750182523,0.07313183729670711,0.08509788127612908,-0.12026424557145121,0.027184131728304903,0.04419741587654979,0.07054328672261392,-0.04523191964282312,0.11649538999923156,0.10510845222193706,0.28781440729452046,-0.33074858742374735,-0.40989865143315946,-0.2547442052358463,-0.08564799037772251,-0.07181458517938012,-0.3008120036489906,-0.0258775202511998,0.03596608228635508,-0.14416812031585227,0.009974464309230519,-0.029149976893346932,-0.20762514046743513,0.20569618249174648,-0.021854025433777523,-0.2010218887558967,-0.07096196617940395,-0.08760958033196024,-0.10497152624129336,-0.20784956734534982,-0.39071982342239575,-0.034695559988571156,-0.01452642894968162,0.023759945023413123,-0.22108312802806576,-0.26618350735050067,0.05108440375312121,-0.15660669456572457,0.16859527245198352,-0.4841275900094421,-0.13534278121798274,0.05518891150470441,0.0074911986711242,-0.3266448358810675,0.032070180246184715,0.08711626071712603,-0.4111218802437864,-0.21776006306774043,0.23306349412442473,0.19672300787087452,-0.07097458798371453,0.052503324220379066,-0.19998304993041524,-1.3427561053936439,0.02960963472872426,-0.06773245479013018,-0.25917757346671444,-0.07901452200352274,0.44669662556976863,0.08730819492577213,0.06085696264831825,0.01713215183228135,0.16707751019820927,0.2202790448529521,-0.19644507393912883,0.1466470590382656,0.004904645335240732,-0.005294342614943322,-0.10804575558624
05,0.04888806516036325,0.24654226507619015,0.027761315385834227,-0.044640344934878395,-0.04688363041011623,-0.006546489152242936,0.2245133903528869,0.06256944668067713,0.12039805761683714,-0.2626014634696963,0.08507353842666471,-0.28761435800723445,0.03299970080162241,-0.10710650090842208,0.004929671519806099,-0.03646070849084495,-0.029087847466097258,-0.051444989589863405,0.025152220263224112,0.18184730511852032,0.2609022167169844,-0.06763226613129944,-0.08639165018927122,0.14716413941133324,0.08444112699146555,0.16005294809718815,0.07663942582327671,-0.08401621996248662,-0.2350302683213149,0.026122729091715886,0.01073138878040266,0.22527635242517582,-0.14467979968729247,-0.05768198659415408,-0.007617710455676858,0.015804074772199368,0.04450305328612271,-0.11458584207613741,0.22832465182018308,-0.009603560878693256,-0.3329662967139442,-0.452785377018214,0.08340314865868215,-0.1051905168379967,0.037119252578636484,0.02353459997099378,0.07608197435497747,-0.010835054334689134,-0.07880340550029144,-0.007817062736145625,-0.025875951216524314,-0.015189054784652831,0.32527430453356754,0.4130858638224248,0.02516784320418428,-0.22155060823987394,-0.05886165029397526,-0.15259399459134695,-0.13202508909636823,0.24916357461892896,0.0825567851120216,-0.2377791668172787,-0.2003189810546926,0.433433495474347,-0.2882548007429193,0.2612662890699596,-0.04609871814394415,-0.1852757044333741,0.3498671954930226,0.05616513498861448,-0.021057971893756824,0.37092995334481793,0.1914206917559248,-0.1451403887974848,-0.01615273704047327,-0.12979536744937395,-0.14424171108379122,0.015092936365950318,-0.7287591390235201,-0.18966285028194627,0.11022330941196191,0.04152394761802813,-0.16350796592750208,-0.20411348141367747,0.09385116553317911,-0.03879349854302088,-0.11704230618618944,-0.0413691616957561,0.09910471754457474,0.9577079595234373,0.10220877744470136,0.03367965538895365,-0.406098552463753,0.2058458503761234,0.06622135621845113,-0.18894341744677082,-0.353547398055727,-0.07843722320325
73,-0.17258913059383763,0.033251768736353736,0.13396408852178643,-0.11914179416090748,0.03327950158685594,-0.1140615989499897,-0.11074798272242392,0.02773208474092115,0.051323091395562175,0.07235269932491424,0.19850580980302973,0.022517975011044612,0.11645732987413676,-0.02941907015368262,-0.11694771478833975,-0.016139292995879833,0.24828983201032484,0.21708116814220701,0.008151825036316795,-0.17284931646921345,0.24196810724032392,-0.1893927529331041,0.16833245102329122,0.030054161748673622,0.06538915131048328,-0.10535416696348965,0.067396028964701,-0.2606621147368441,0.250787924451406,-0.05577835626808882,0.3150929609608179,0.6166255173571334,-0.06388559119138172,-0.3250054228741545,-0.00727824728901045,0.19348156968637856,0.011919267023992456,-0.4405046548838666,-0.024039389185371983,0.0072358254204653435,0.13358905360032006,0.21644789167535253,-1.0714540138464559,0.00818510699342998,0.06362349018243649,0.19045446671634458,0.08717923012595478,0.06551579784738885,0.2567778985125458,0.28452990318330673,0.5083321376661271,0.011793797652858979,0.03621903826771654,-0.08199325156977319,-0.2860951713864768,0.05514569008003896,0.03408481793003461,0.03736627404693363,0.11233980848493903,0.04107141423258808,0.1927428022235319,-0.14712997067126687,-0.1849953862045844,-0.09549284481062217,0.029058393169696287,0.13091775607047884,0.10706201673849536,0.07687651414271678,0.014194386199028047,0.08339891869746925,-0.5286035000360507,9.880782682965001E-4,-0.02053026217698337,0.0032374815631698367,-0.16547433539194803,-0.21040423666030775,0.08151294719078254,0.1213479155327189,0.0783917106126085,0.03501959513685232,0.024848861942613508,-9.204261592701381E-4,-0.07077791934028478,-0.025115144550277486,-0.07307021266421812,0.23512819984426372,0.02607157265902752,0.16864538517325145,-0.0017369509882711452,0.49468870735611137,-0.004825724957370446,0.28291956384714034,0.013901593955552296,-0.37634510995291925,0.20421410109401356,-0.12499960554479736,0.08029123605129211,-0.3857454020100926
6,0.16658492439807562,-0.26682054698374463,-0.14444368400820867,-0.1308647744002187,0.2879368048163743,0.061352979866106515,-0.06264669548769744,-0.07377329304775036,-0.012682627346975893,0.10243790869654712,-0.10504576941374896,0.07414710055766538,-0.22581074365925793,-0.16897897904037587,0.030879913277120336,0.009333037675270293,0.027345327480608788,-0.09211738801636786,0.054809094754116154,-0.054466472389137585,0.03894994685393105,0.00546518495953683,0.07992069710284125,-0.03300277578911943,5.370752218161322E-4,0.1083058042463735,-0.13010947367985248,0.07465391783780356,0.14040704455987965,0.029213694020558907,-0.03798015818105866,0.09466297116031977,-0.035800206587256946,-0.22198714340500236,-0.004735730075937432,-0.33549415495578017,-0.006901522743664981,0.13393533233518767,-0.016939076221602435,0.04547308077641902,0.21174049253101218,-0.2658515845575579,-0.26443543244812984,0.009891336345792506,-0.08517147187721516,-0.24316817570168667,-0.042356058909703904,-0.03510560099705523,-0.08654739038495823,-0.17364429554030172,0.12076456339359248,-0.9895045917645949,-0.519518818265075,0.13750900356898346,-0.1055772847957687,-9.173063779566928E-5,-0.1566397643187746,-0.1668381000012139,-0.10698549375723201,0.13463135104807553,-0.15158232207955286,-0.22395580718964703,-0.2645045769626922,-0.04623201492475496,0.014957150354913372,-0.1404305180246306,-0.08743704455399176,-0.03600304898013599,0.04315181672504319,0.02471657426751092,0.16420810460519566,-0.03782829940351055,-7.768980193197739E-4,-0.2717420514220203,-0.25305230477880847,-0.07745342006611168,0.3395004045857028,-0.15063339127642134,0.22673729815571317,0.24094451247350326,0.17687259223048754,0.14632462366906038,-0.17132532066973927,-0.05910411228030726,0.03611912584203413,0.0,0.11529077116793655,0.03955472355939436,-0.410170377880901,0.2419507903216949,0.44198676654844876,-0.08295790535434129,-0.09176609478472303,-0.16824994380648936,0.22016613217926578,-0.1124346000928356,0.21002701764063642,0.02970330979944329
,0.08329298786124238,0.17638519516313866,0.1718164427681746,0.7931910303660904,-0.08131537013792639,0.391328200165776,0.007840024513509677,-0.47864133451196794,-0.2611701655111106,-0.1375445368813282,0.07672631718147167,0.2041558475961414,0.27609254051336773,-0.11842310470099732,0.1386785734236796,-0.20479225277666085,0.0,-0.126681519577465,-0.1370626122240839,-0.5246128577652606,0.23520354490570858,-0.017645401657871384,0.12420097272630619,0.18903117489713298,0.014321266927565352,-0.15959372172042013,0.43593274786801484,0.13602543528754604,0.3130214289007992,0.3763142645363248,0.03801011935214172,0.028610954339507506,0.041845206162713325,-0.025400101471028294,-0.007897212231553678,-0.8544694005588663,-0.16302190453209575,-0.1882471143015495,-0.1510592173307673,0.01820917553644239,0.13339547143080294,0.018292042663325677,0.01939741396009201,0.05654446261886452,-0.08310905855425256,0.19681347456953915,0.21758964299659883,-0.14780057260942991,-0.13040717326985365,-0.01661110203445071,0.05985878290180758,-0.11716416312908473,0.16258640054222046,0.22923844431742688,0.2197002047288547,-0.17620162418597746,1.1192561909457055E-4,-0.029177560982253697,0.16293574248267662,-0.2815681669662887,-0.10401085128487667,0.15739602120553495,0.12825924171821446,-0.03964256586483665,0.10930930707508467,0.2522494948713079,0.3544174676635065,-0.07902490664209676,0.39989304110145374,0.014087608297287658,-0.1567771308692553,7.656370990815704E-4,0.10395235369115498,0.23818409918165687,0.08395795397719999,-0.24793775835292176,0.08778068458752059,0.3140356216689588,0.00959185742989623,-1.8291269454715258,0.07896939621488253,0.1014286348433463,-0.1455714220435705,0.21632131847580136,0.3059668207286459,0.0925389468572391,-0.035206956440603676,0.23092119770411218,0.20267137444426442,-0.1896411479734542,0.03846717746513375,-0.04984968180986215,-0.5051274588965653,0.02867229947288298,0.10182543164141343,0.022983402943444687,0.18687936591092924,0.07781059343073812,-0.2941148761344489,0.187208599872
5009,0.38352784097765086,0.043845248225804025,0.24168205063809564,0.017389646955686597,-0.06395385879854114,-0.4016230961717689,0.17348480528953789,-0.021602739272348206,0.05164748205628561,-0.3607477985618229,0.10529495958041608,-0.00553840734910695,-0.2018241326200321,-0.6084388282165603,0.12325179468778669,0.05847172373529479,0.03310761803383419,-0.30108858476580524,0.20480793290910998,0.3155078705057796,0.018995573933939018,-0.32953255976460694,0.11054662125895365,0.12940056301106728,0.03240222864022777,-0.14847625987182544,0.19228494980335875,-0.1641759927098036,0.5551883233579423,-0.21298821965512144,-0.5232929941437822,-0.012160141159794491,-0.7385762654288249,0.3660686584765155,0.06790929005999954,-0.2244477828752629,-0.005086206280581079,-0.17952266525121383,0.08203743561973337,0.2819538296066217,0.0763997441355458,-0.027598881684423968,0.0168706520135222,0.007081886517398293,0.03874857814953121,0.09250864336748217,-0.2634359569647653,-0.1156967128149735,0.054172097480721194,2.0072293394022658E-4,-0.21348633115478088,-0.09065858732052932,0.013268141134179571,-0.04695258510027009,-0.017352228286960862,0.2115771376787579,-0.5980884676627706,-0.15857235964288274,-0.17508957216403043,-0.05714561823240262,0.0029589596851761005,0.5269706175147072,0.0034050464412268785,-0.1293986580835143,-0.22067665706866693,-0.12098806650492343,-0.22607306253855253,-0.15758326291880415,0.45208764827665604,0.034489433978540755,-0.0804369983184701,-0.0,-0.05044167445992998,-0.19864432987390007,-0.49211261915427706,-0.16199835254262596,0.10297415136062806,0.17770235225876363,0.2766642103650341,-0.20836008342701637,-0.1324842429101092,-0.11403196038196807,0.04569173074058599,0.011398352025316609,-0.04411769989879439,-0.5982457952806334,-0.18271563761008455,-0.15228656041345406,-0.02658803439923618,0.49562501488151756,-0.10437777557828627,-0.253599448418207,0.12148395107427329,-0.21322111568033997,-0.5487464718826985,0.1349765192351896,-0.4190327520385519,0.0031420195198647648,-0.055
968194013343804,0.029299023830259827,0.08834167411774027,-0.10999203904493904,0.031429104505601796,0.20587980536529094,-0.0011778795784038506,0.12288821861699821,0.05219738595839677,-0.2279007485595229,-0.1272196780929097,-0.008598541137427652,-0.0,0.17140160420676193,0.011861304512092004,3.7484569429296937E-4,-0.18372823472772726,-0.08331634223122465,-0.09349972848327828,-0.3453576250404699,-0.05414389301466276,0.2652363834658268,-0.00423268437123013,-0.20833743566850987,0.004901354483133467,0.7041037341812644,0.2108870793122912,-0.05231821295817823,-0.0369809371220614,0.11139167548648976,-0.1639263565584903,-0.26277923445208917,-0.03803915726897747,0.11106237636087109,-0.08137252908361933,0.12497812170862353,0.15262026941809245,-0.07195639041130303,0.13845420228795366,-0.009722398990451183,0.2507213820634409,-0.0871722125029937,0.1973340731090472,-0.02084817515749821,0.07490571921347046,0.22219690270960538,0.24249724541351667,0.10403973515101052,0.16467126301610086,-0.14954921153938264,0.09448414799115852,-0.2100199066782171,0.16806955053535438,0.26185587891138123,-0.3199069867248025,0.08826044778116077,0.11222808303229505,0.03541236178545801,-0.04914618801763794,-0.1946453767880623,0.06122124506961385,0.006240301143697939,0.05329196450835062,0.2694043489506588,-0.12992260668906266,-0.01566725465750848,0.004587871904372941,0.5722250763763124,-0.24758334556341566,0.12054899320144236,-0.3257711219294549,0.13985540633134852,-0.08348099178078934,-0.1904714736296216,-1.4232104863197035,-0.3337751192600923,0.31221283897988616,0.09087278755765381,-0.17090297456617673,0.05594181308801297,-0.7429289060838672,-0.11280579716498423,-0.2754138848429566,-0.11313726815328225,-0.18664057875496332,0.25653395517042266,-0.05780781117958425,-0.018791290003298492,-0.2724717713744001,0.14815188558306194,-0.011849632671947616,-0.02007597262866777,0.03316004418113677,0.003935869893310358,0.22916953480675453,-0.2220399770100427,0.006405521495059389,-0.014874965188453949,-0.036059177054606
865,0.05007052725184661,-0.08186147366320587,0.347262079311529,1.275281418044729,0.021795262010366235,0.03435522746759299,0.04210372796775846,0.19653661040417755,0.3097341443142993,-0.16292706002462032,-0.18548907863071126,0.19546278228015956,-0.07676036450618841,0.04286036962985945,0.011879063711564856,-0.19114921364473306,0.056083176449972134,0.04936724801039006,-0.06938329238389163,-0.05601898860802597,-0.1378277853162839,0.011773560067224695,-0.5960229527324864,-0.2109799779955942,-0.8167341006506996,-0.21351149813984532,-0.14164092036268042,-0.01591862758120136,-0.3257481815453475,-0.27591345474378975,-0.19716536791463565,-0.0697670680679241,0.019974840783243932,-8.782521427105911E-4,0.2103098096699036,0.09385496464262802,6.995700948371129E-4,0.005919845118537739,-0.1525910601069795,0.10716295155250885,-0.10675786019692061,0.16269566075072908,0.09155822308390553,-3.75509359689971E-4,-0.9461026300482264,-0.0737856625656687,0.3079972487778821,0.12640560320213676,0.04545041785421722,0.11399856847887005,-0.23490664878537343,-0.26069725299395535,0.11917296863097948,0.3260794544991129,-0.021136582206428164,0.15166626542540748,-0.07208595540396531,-0.06940709934520907,0.02650597739706247,-0.07264936867607506,-0.2390080082957752,0.06576922863880631,-0.14495252753995028,0.18897166292650502,-0.15858439733101515,-0.8867740041086201,-0.4332321585772667,-0.055947405977332144,1.224376631679196,0.1390958441494023,-0.06473221497584718,0.03688162253074823,-0.028313451155932874,0.1671375984494249,-0.003701933332627877,0.017035182578662465,0.2860018938617784,-0.016437341422765293,-0.007033499332930463,0.16894088079814143,-0.010006786382191742,0.13167001684893168,0.1068284484745365,-0.14763045337900976,-0.22534353513580618,0.11906102689028278,0.12288506871853765,0.7858203987840622,-0.10505002747273692,-0.38013142545998185,0.06965788336215309,-0.027333827034073666,0.1130730980349264,0.014001081091824307,-0.08173561918095691,-0.059179123603287,0.2690931235049634,5.557920516461447E-4
,-0.19236665176226142,-0.1441168923415119,0.026065488670488104,-0.1266030928319528,0.1057312477537649,0.002460387559144451,0.13625288185741194,0.027392011276688222,-0.20210141932522757,0.7037962259102557,-2.2314223680319945,0.08892591599992003,0.07903599336334557,0.058148234706372426,-0.05754563316540553,-0.2781368147030316,-0.1990627974003377,-0.19436419609762823,0.14092391759707718,0.05733973323990253] Intercept: -0.17585146047540784
>     lr: org.apache.spark.ml.classification.LogisticRegression = logreg_9fd90ec81535
>     lrModel: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid=logreg_9fd90ec81535, numClasses=2, numFeatures=1000

In [None]:
// Print the objective value recorded for each training iteration, one per line
for (loss <- lrModel.binarySummary.objectiveHistory) println(loss)

  

>     0.6931469164930032
>     0.6701329696175995
>     0.6069374384259247
>     0.5701182833435684
>     0.5442198132755232
>     0.5140857297233389
>     0.4856793988301197
>     0.46180537612389333
>     0.44480834485108484
>     0.4299379159589516
>     0.4186550933897052
>     0.41120769851478695
>     0.40507699682936316
>     0.3991728487176776
>     0.39489199083594595
>     0.39080491425469044
>     0.3869931347850945
>     0.38267360020601177
>     0.3801986290955363
>     0.3778293016421879
>     0.3756786021988705
>     0.3734079636686235
>     0.37012371744268885
>     0.3690326806138427
>     0.3677775541697602
>     0.3671254743371148
>     0.3665147351852915
>     0.36598766543382155
>     0.36552148713988714
>     0.36489674170117575
>     0.3642680588834924
>     0.3639094968425095
>     0.36356352360761807
>     0.36327750935764375
>     0.3629089490102315
>     0.36260859404530466
>     0.36229226989103425
>     0.3621272472404833
>     0.3618046028311709
>     0.3615942816156314
>     0.36121023670287755
>     0.3610369566545829
>     0.3608245320973927
>     0.36069542925054654
>     0.3605556136822391
>     0.3603799837387267
>     0.3602237172630807
>     0.36008234980954673
>     0.3599768248534264
>     0.3598706844036592
>     0.3597657926536273
>     0.35960685891902666
>     0.35950913074580076
>     0.3593783438514593
>     0.35932661057373777
>     0.35920615046518095
>     0.3591667981234418
>     0.3590252166136159
>     0.35897456562434876
>     0.3588855878916592
>     0.3588502207526231
>     0.35876953010750107
>     0.35872562461494656
>     0.35865890528569055
>     0.35860386570105623
>     0.3585524560445921
>     0.358505701620312
>     0.3584665738039789
>     0.358419419323597
>     0.358376374660276
>     0.35833428647513815
>     0.3582967610521753
>     0.3582587829803578
>     0.3582321100588377
>     0.358192907277209
>     0.358168689702564
>     0.3581298353252894
>     0.35810689571309373
>     0.3580739302151592
>     0.35805122464791833
>     0.35802296415487456
>     0.3579981219892494
>     0.3579701972082382
>     0.3579466634666583
>     0.3579193738118825
>     0.35790146260258143
>     0.35787976143107586
>     0.3578580462343845
>     0.3578362293364006
>     0.35781582119315325
>     0.3577959325988474
>     0.35777739193295144
>     0.3577622018592272
>     0.3577442821559652
>     0.35772954415792707
>     0.3577081107541279
>     0.3576922858210291
>     0.3576765410249369
>     0.3576659782767189
>     0.35765085840121164
>     0.3576379742016104

In [None]:
//Ugly code to lookup maximum and minimum values
// Running table of the five largest values offered so far. Each row is
// Array(index, value); rows start as (0.0, 0.0) and are kept sorted by value, largest first.
var maxarray = Array.ofDim[Double](5,2)

/** Offers the pair (idx, value) to the top-5 table in `maxarray`.
  *
  * The pair replaces the current smallest row when `value` is strictly larger than it,
  * after which the table is re-sorted in descending order of value.
  *
  * @param idx   index the value belongs to (stored as a Double in column 0)
  * @param value candidate value (stored in column 1)
  */
def findmax(idx: Int, value: Double): Unit = {
  // The table is global and never reset, so scanning the same data twice used to
  // push duplicates of the largest entries into it. Skip pairs that are already recorded.
  val alreadyRecorded = maxarray.exists(row => row(0) == idx.toDouble && row(1) == value)
  if (!alreadyRecorded && value > maxarray(4)(1)){
    // Overwrite the smallest row (last after sorting), then restore the ordering
    maxarray(4)(0) = idx
    maxarray(4)(1) = value
    maxarray = maxarray.sortBy(- _(1))
  }
}
// Running table of the five smallest values offered so far. Each row is
// Array(index, value); rows start as (0.0, 0.0) and are kept sorted by value, smallest first.
var minarray = Array.ofDim[Double](5,2)

/** Offers the pair (idx, value) to the bottom-5 table in `minarray`.
  *
  * The pair replaces the current largest row when `value` is strictly smaller than it,
  * after which the table is re-sorted in ascending order of value.
  *
  * @param idx   index the value belongs to (stored as a Double in column 0)
  * @param value candidate value (stored in column 1)
  */
def findmin(idx: Int, value: Double): Unit = {
  // The table is global and never reset, so scanning the same data twice used to
  // push duplicates of the smallest entries into it. Skip pairs that are already recorded.
  val alreadyRecorded = minarray.exists(row => row(0) == idx.toDouble && row(1) == value)
  if (!alreadyRecorded && value < minarray(4)(1)){
    // Overwrite the largest row (last after sorting), then restore the ordering
    minarray(4)(0) = idx
    minarray(4)(1) = value
    minarray = minarray.sortBy(_(1))
  }
}

  

>     maxarray: Array[Array[Double]] = Array(Array(0.0, 0.0), Array(0.0, 0.0), Array(0.0, 0.0), Array(0.0, 0.0), Array(0.0, 0.0))
>     findmax: (idx: Int, value: Double)Unit
>     minarray: Array[Array[Double]] = Array(Array(0.0, 0.0), Array(0.0, 0.0), Array(0.0, 0.0), Array(0.0, 0.0), Array(0.0, 0.0))
>     findmin: (idx: Int, value: Double)Unit

In [None]:
// Scan every coefficient and keep the five largest in maxarray
lrModel.coefficients.foreachActive((idx, value) => findmax(idx, value))
// Print each (index, coefficient) row. The rows are formatted directly: wrapping the
// foreach in println printed a stray "()", and Array.deep no longer exists in Scala 2.13.
maxarray.foreach(row => println(row.mkString("Array(", ", ", ")")))
// Print the vocabulary word behind each of those indices, in the same order
maxarray.foreach(row => println(vectorizer.vocabulary(row(0).toInt)))

// Same again for the five smallest coefficients, collected in minarray
lrModel.coefficients.foreachActive((idx, value) => findmin(idx, value))
minarray.foreach(row => println(row.mkString("Array(", ", ", ")")))
minarray.foreach(row => println(vectorizer.vocabulary(row(0).toInt)))

  

>     Array(46.0, 1.8647655096872564)
>     Array(885.0, 1.275281418044729)
>     Array(950.0, 1.224376631679196)
>     Array(380.0, 0.9577079595234373)
>     Array(32.0, 0.8432880748567715)
>     ()
>     anonym
>     förlossningen
>     maken
>     sambon
>     sambo
>     Array(990.0, -2.2314223680319945)
>     Array(664.0, -1.8291269454715258)
>     Array(857.0, -1.4232104863197035)
>     Array(275.0, -1.3427561053936439)
>     Array(173.0, -1.1857533047141897)
>     ()
>     topic
>     bruden
>     vafan
>     brudar
>     jävligt

  

8. Predict on test data
-----------------------

In [None]:
// Apply the fitted model to the test split
val predictions = lrModel.transform(test)

// Show 30 rows in random order without truncating the cell contents:
// the label "c", the predicted class and the probability vector
predictions
  .orderBy(rand())
  .select("c", "prediction", "probability")
  .show(numRows = 30, truncate = false)


  

>     +---+----------+-------------------------------------------+
>     |c  |prediction|probability                                |
>     +---+----------+-------------------------------------------+
>     |0  |0.0       |[0.9721401017870042,0.027859898212995764]  |
>     |0  |0.0       |[0.975623998737009,0.02437600126299096]    |
>     |1  |1.0       |[4.6730789993111417E-7,0.9999995326921002] |
>     |0  |0.0       |[0.933249707175278,0.066750292824722]      |
>     |0  |0.0       |[0.9902085789245901,0.009791421075409966]  |
>     |0  |0.0       |[0.5279677569376853,0.47203224306231467]   |
>     |0  |0.0       |[0.9932461412304279,0.00675385876957205]   |
>     |1  |1.0       |[2.43269453308815E-5,0.9999756730546691]   |
>     |1  |1.0       |[8.266454051870882E-10,0.9999999991733546] |
>     |0  |0.0       |[0.9997151003194746,2.8489968052548283E-4] |
>     |1  |0.0       |[0.5514931570249911,0.44850684297500887]   |
>     |0  |0.0       |[0.5858664716477586,0.41413352835224143]   |
>     |1  |1.0       |[0.002100566198113697,0.9978994338018863]  |
>     |1  |1.0       |[0.07917634407205193,0.920823655927948]    |
>     |0  |0.0       |[0.9970675008521647,0.0029324991478353007] |
>     |0  |0.0       |[0.9999595461915014,4.045380849869759E-5]  |
>     |0  |1.0       |[0.33337692071405434,0.6666230792859457]   |
>     |1  |1.0       |[0.36761800025826114,0.6323819997417389]   |
>     |1  |1.0       |[0.3245585295503879,0.6754414704496121]    |
>     |1  |1.0       |[0.2355899833856519,0.7644100166143482]    |
>     |0  |0.0       |[0.9999999999997755,2.2452150253864004E-13]|
>     |1  |1.0       |[0.18608690603389255,0.8139130939661074]   |
>     |0  |0.0       |[0.740890026139782,0.25910997386021795]    |
>     |0  |0.0       |[0.963586227883629,0.036413772116371014]   |
>     |1  |1.0       |[0.0021508873399861557,0.9978491126600139] |
>     |1  |1.0       |[0.3858439417926455,0.6141560582073544]    |
>     |1  |1.0       |[0.4517753939274335,0.5482246060725665]    |
>     |1  |1.0       |[0.02573645474447229,0.9742635452555276]   |
>     |0  |0.0       |[0.8022052550237544,0.19779474497624555]   |
>     |1  |1.0       |[0.042382658471976975,0.9576173415280229]  |
>     +---+----------+-------------------------------------------+
>     only showing top 30 rows
>
>     predictions: org.apache.spark.sql.DataFrame = [c: int, features: vector ... 3 more fields]

In [None]:
// Binary-classification evaluator that reads the true label from column "c".
// The recorded cell output reports metricName=areaUnderROC for this evaluator.
val evaluator = new BinaryClassificationEvaluator().setLabelCol("c")
// Last expression of the cell, so the notebook echoes the metric for the test predictions
evaluator.evaluate(predictions)

  

>     evaluator: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = BinaryClassificationEvaluator: uid=binEval_0812f13ed2be, metricName=areaUnderROC, numBins=1000
>     res19: Double = 0.928445521562674