In [2]:
// Load every file under the training directory as a (file path, file contents) pair.
val path = "20news-bydate-train/*"
val rdd = sc.wholeTextFiles(path)
// Keep only the contents; the path is not needed to count documents.
val text = rdd.map(_._2)
print(text.count())

11314

In [3]:
// The newsgroup is taken from the second-to-last "/"-separated component of the file path.
val newsgroups = rdd.map { case (file, _) => file.split("/").takeRight(2).head }
// Count documents per newsgroup and render the groups from largest to smallest, one per line.
val countByGroup = {
    val perGroup = newsgroups.map(group => (group, 1)).reduceByKey(_ + _).collect
    perGroup.sortBy { case (_, count) => -count }.mkString("\n")
}
println(countByGroup)

(rec.sport.hockey,600)
(soc.religion.christian,599)
(rec.motorcycles,598)
(rec.sport.baseball,597)
(sci.crypt,595)
(rec.autos,594)
(sci.med,594)
(comp.windows.x,593)
(sci.space,593)
(sci.electronics,591)
(comp.os.ms-windows.misc,591)
(comp.sys.ibm.pc.hardware,590)
(misc.forsale,585)
(comp.graphics,584)
(comp.sys.mac.hardware,578)
(talk.politics.mideast,564)
(talk.politics.guns,546)
(alt.atheism,480)
(talk.politics.misc,465)
(talk.religion.misc,377)


In [6]:
// Naive tokenization: split each document on single spaces and lower-case every piece.
val text = rdd.map(_._2)
val whiteSpaceSplit = text.flatMap { doc =>
    for (piece <- doc.split(" ")) yield piece.toLowerCase
}
// Number of distinct tokens produced by the whitespace split.
println(whiteSpaceSplit.distinct.count())

402978


In [7]:
// Peek at up to 100 whitespace tokens, sampled with replacement using a fixed seed.
println(
    whiteSpaceSplit
        .sample(withReplacement = true, fraction = 0.3, seed = 42)
        .take(100)
        .mkString(",")
)

addresses,,--,atheism,,cambridge.,<19930301143317@mantis.co.uk>
lines:,290

archive-name:,december,1.0

,,,,,,,,,,,of,,,,,,,foundation

darwin,assorted,atheist,are
available,to:,(608),the,"darwin,fish".,it's,it's,symbol,,symbol,,like,like,stick,on,cars,,but,but,and,the,is,in,the,the,the,#4,,hollywood,
,,san,darwin,gold,for,net,who,lynn,directly,,the
price,the
price,fish.

american,books,critiques,critiques,the,bible,,of
biblical,contradictions,,contradictions,,,and,american,0-910309-26-4,,edition,,atrocities,,contains,foote:,bible
contradicts,aap.,king,american,,(512),,books

sell,haught's,horrors",(see,,york,york,newer,is:
prometheus,is:
prometheus,14228-2197.

african-americans


In [8]:
// Split on runs of non-word characters (regex \W+) instead of single spaces, then lower-case.
val nonWordSplit = text.flatMap { doc =>
    for (piece <- doc.split("""\W+""")) yield piece.toLowerCase
}
// Number of distinct tokens produced by the non-word split.
println(nonWordSplit.distinct.count())

130126


In [9]:
// Peek at up to 100 of the distinct tokens. Sampling is with replacement,
// so the same token can still show up more than once in the printed list.
println(
    nonWordSplit.distinct
        .sample(withReplacement = true, fraction = 0.3, seed = 42)
        .take(100)
        .mkString(",")
)

wuair,schwabam,42b,125215,6j,he3,1pqd9hinnbmi,neurologists,jxicaijp,dwi,749,steaminess,dangers,qsins,instantaneous,391k,typeset,typeset,bippy,hollombe,mswin,diccon,4h0kj76,borg,g85,spe,kocharian,6097,1tbs,xs9,3zur,unaskable,9mf,cj1v,bowdoin,bowdoin,inre,inre,deadweight,deadweight,deterministic,createwindow,rockefeller,kjznkh,kjznkh,classifieds,ray_bourque,anachronistic,cherylm,005117,005117,005117,interfere,makewindow,mtearle,barking,ww2,vcrs,widmann,monger,projector,jdecarlo,warms,triangulate,triangulate,recieves,g45,rint69,rint69,herod,1496,libpackagexcl,6w8rg,6w8rg,00ecgillespie,phoniest,funded,canonical,ehs,birds,dxb132,xtappcontext,0iy4bn,lamers,023843,inconsitancies,isdres,trn,xa_rgb_default_map,dm9,rchzd2_8d,mtagm,walters,r1865,9gtf,9gtf,lfu1i9b,tyrell,tyrell,rvik


In [11]:
// A token is kept only when the whole token consists of non-digit characters,
// i.e. it contains no digit at all (the empty token also matches).
val regex = """[^0-9]*""".r
val filterNumbers = nonWordSplit.filter { candidate =>
    regex.pattern.matcher(candidate).matches
}
// Number of distinct digit-free tokens.
println(filterNumbers.distinct.count())

84912


In [12]:
// Peek at up to 100 of the distinct digit-free tokens. Sampling is with replacement,
// so the same token can still show up more than once in the printed list.
println(
    filterNumbers.distinct
        .sample(withReplacement = true, fraction = 0.3, seed = 42)
        .take(100)
        .mkString(",")
)

ntuvax,dpsi,singen,leymarie,_congressional,fowl,rlhzrlhz,afterward,ignore,hcq,beleive,goofed,arax,dfuller,nondiscriminatory,steaminess,urtfi,urtfi,za_,tiems,bellevue,typeset,armegedon,gunning,croissant,yearsley,dolphin,tic,worshippers,theoreticians,siumv,arresed,borg,sunprops,sask,sask,subcircuits,subcircuits,uninjured,uninjured,internship,pws,keysym,vfj,vfj,connecters,spe,octopi,bhjn,winsor,winsor,winsor,yan,astonished,miserable,eng,subtleties,createwindow,silvers,explorers,antisemites,classifieds,ray_bourque,inviting,inviting,apply,cfsmo,holdren,holdren,mishandles,feszcm,rootx,scramblers,scramblers,nkm,hfd,makewindow,formac,exhausting,responsbible,paradijs,fuenfzig,hindenburg,trial,tact,fahrenheit,projector,jdecarlo,ndallen,recoend,ffbv,bracing,wy,herod,sonunda,sonunda,depicted,iauc,iauc,floor


In [13]:
// Occurrence count of every digit-free token across the corpus.
val tokenCounts = filterNumbers.map(token => token -> 1).reduceByKey((left, right) => left + right)
// Orders (token, count) pairs by their count, so `top` picks the most frequent tokens.
val orderingDesc = Ordering.by[(String, Int), Int] { case (_, count) => count }
println(tokenCounts.top(20)(orderingDesc).mkString("\n"))

(the,146532)
(to,75064)
(of,69034)
(a,64195)
(ax,62406)
(and,57957)
(i,53036)
(in,49402)
(is,43480)
(that,39264)
(it,33638)
(for,28600)
(you,26682)
(from,22670)
(s,22337)
(edu,21321)
(on,20493)
(this,20121)
(be,19285)
(t,18728)


In [31]:
// Hand-picked stop words to exclude from the vocabulary.
val stopwords = Set(
    "the", "a", "an", "of", "or", "in", "for", "by", "on", "but",
    "is", "not", "with", "as", "was", "if", "they", "are", "this", "and",
    "it", "have", "from", "at", "my", "be", "that", "to"
)
// Token counts with the stop words removed.
val tokenCountsFilteredStopwords = tokenCounts.filter { case (token, _) => !stopwords(token) }
println(tokenCountsFilteredStopwords.top(20)(orderingDesc).mkString("\n"))

(ax,62406)
(i,53036)
(you,26682)
(s,22337)
(edu,21321)
(t,18728)
(m,12756)
(subject,12264)
(com,12133)
(lines,11835)
(can,11355)
(organization,11233)
(re,10534)
(what,9861)
(there,9689)
(x,9332)
(all,9310)
(will,9279)
(we,9227)
(one,9008)


In [23]:
// Drop tokens that are shorter than two characters.
val tokenCountsFilteredSize = tokenCountsFilteredStopwords.filter { case (token, _) => token.length > 1 }
println(tokenCountsFilteredSize.top(20)(orderingDesc).mkString("\n"))

(ax,62406)
(you,26682)
(edu,21321)
(or,14686)
(subject,12264)
(com,12133)
(lines,11835)
(can,11355)
(organization,11233)
(re,10534)
(what,9861)
(there,9689)
(all,9310)
(will,9279)
(we,9227)
(one,9008)
(would,8905)
(do,8674)
(he,8441)
(about,8336)


In [24]:
// Ordering on the negated count: under it, `top` yields the least frequent tokens.
val orderingAsc = Ordering.by[(String, Int), Int] { case (_, count) => -count }
println(tokenCountsFilteredSize.top(20)(orderingAsc).mkString("\n"))

(hcq,1)
(rohde,1)
(_slightly_,1)
(wuair,1)
(mowtu,1)
(bruns,1)
(luminous,1)
(beckmans,1)
(arax,1)
(fowl,1)
(jxicaijp,1)
(rlhzrlhz,1)
(aces,1)
(steaminess,1)
(wargame,1)
(qsins,1)
(schwabam,1)
(urtfi,1)
(_congressional,1)
(costner,1)


In [25]:
// Tokens whose count in tokenCounts is below 2, collected into a local set.
val rareTokens = tokenCounts.filter(_._2 < 2).map(_._1).collect().toSet
// Remove those rare tokens from the already stopword- and length-filtered counts.
val tokenCountsFilteredAll = tokenCountsFilteredSize.filter { case (token, _) => !rareTokens(token) }
println(tokenCountsFilteredAll.top(20)(orderingAsc).mkString("\n"))

(sina,2)
(akachhy,2)
(mvd,2)
(sarkis,2)
(wendel_clark,2)
(relieves,2)
(purposeful,2)
(hizbolah,2)
(wout,2)
(uneven,2)
(senna,2)
(subdivided,2)
(bushy,2)
(feagans,2)
(coretest,2)
(oww,2)
(historicity,2)
(mmg,2)
(margitan,2)
(defiance,2)


In [27]:
// Number of tokens that survive all of the filters above.
println(tokenCountsFilteredAll.count())

51802


In [32]:
/**
 * Splits a document into cleaned, lower-cased tokens.
 *
 * The text is split on runs of non-word characters and each piece is lower-cased.
 * A token is kept only if it has at least two characters, matches `regex` in full,
 * and is in neither `stopwords` nor `rareTokens`.
 *
 * @param line the raw text to tokenize
 * @return the surviving tokens, in their original order
 */
def tokenize(line: String): Seq[String] = {
    val lowered = line.split("""\W+""").map(_.toLowerCase)
    // All four conditions are independent, so they are checked in a single pass.
    val kept = lowered.filter { token =>
        token.size >= 2 &&
        regex.pattern.matcher(token).matches &&
        !stopwords.contains(token) &&
        !rareTokens.contains(token)
    }
    kept.toSeq
}

// Distinct tokens left after running tokenize over every document.
println(text.flatMap(tokenize(_)).distinct.count())

51801


In [33]:
// One token sequence per document.
val tokens = text.map(tokenize(_))
// Show the first 20 tokens of the first document.
println(tokens.first().take(20))

WrappedArray(mathew, mathew, mantis, co, uk, subject, alt, atheism, faq, atheist, resources, summary, books, addresses, music, anything, related, atheism, keywords, faq)


In [36]:
import org.apache.spark.mllib.linalg.{ SparseVector => SV }
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF

// Feature dimension for the hashing trick: 2^18 = 262144.
val dim = 1 << 18
val hashingTF = new HashingTF(dim)
// Term-frequency vector for every tokenized document; cached because later cells reuse it.
val tf = hashingTF.transform(tokens)
tf.cache()

MapPartitionsRDD[60] at map at HashingTF.scala:76

In [37]:
// Inspect the first term-frequency vector: its dimension, its number of stored
// entries, and the first ten stored values and indices.
val v = tf.first().asInstanceOf[SV]
Seq[Any](
    v.size,
    v.values.length,
    v.values.take(10).toSeq,
    v.indices.take(10).toSeq
).foreach(println)

262144
706
WrappedArray(1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0)
WrappedArray(313, 713, 871, 1202, 1203, 1209, 1795, 1862, 3115, 3166)


In [38]:
// Fit the IDF model on the training term frequencies, then transform those same vectors.
val idf = new IDF().fit(tf)
val tfidf = idf.transform(tf)
// Inspect the first TF-IDF vector: number of stored entries, then the first ten values and indices.
val v2 = tfidf.first().asInstanceOf[SV]
Seq[Any](
    v2.values.length,
    v2.values.take(10).toSeq,
    v2.indices.take(10).toSeq
).foreach(println)

706
WrappedArray(2.3869085659322193, 4.670445463955571, 6.561295835827856, 4.597686109673142, 8.932700215224111, 5.750365619611528, 2.1871123786150006, 5.520408782213984, 3.4312512246662714, 1.7430324343790569)
WrappedArray(313, 713, 871, 1202, 1203, 1209, 1795, 1862, 3115, 3166)


In [40]:
// Smallest and largest stored TF-IDF weight of each document vector.
// A document whose tokens were all filtered out has an empty `values` array, and
// `min`/`max` on an empty array throw UnsupportedOperationException, so such
// documents are skipped instead of failing the whole job.
val minMaxVals = tfidf
    .map(_.asInstanceOf[SV].values)
    .filter(_.nonEmpty)
    .map(weights => (weights.min, weights.max))
// Combine the per-document pairs into the overall (min, max) across the corpus.
val globalMinMax = minMaxVals.reduce { case ((min1, max1), (min2, max2)) =>
    (math.min(min1, min2), math.max(max1, max2))
}
println(globalMinMax)

(0.0,66155.39470409753)


In [41]:
// TF-IDF weights of three frequent words, scored with the IDF model fitted on the training set.
val common = sc.parallelize(Seq(Seq("you", "do", "we")))
val tfCommon = hashingTF.transform(common)
val tfidfCommon = idf.transform(tfCommon)
val commonVector = tfidfCommon.first().asInstanceOf[SV]
println(commonVector.values.toSeq)

WrappedArray(0.9965359935704624, 1.3348773448236835, 0.5457486182039175)


In [42]:
// TF-IDF weights of three less frequent words, scored with the same fitted IDF model.
val uncommon = sc.parallelize(Seq(Seq("telescope", "legislation", "investment")))
val tfUncommon = hashingTF.transform(uncommon)
val tfidfUncommon = idf.transform(tfUncommon)
val uncommonVector = tfidfUncommon.first().asInstanceOf[SV]
println(uncommonVector.values.toSeq)

WrappedArray(5.3265513728351666, 5.308532867332488, 5.483736956357579)


In [48]:
// Documents whose file path contains "hockey", turned into TF-IDF vectors.
val hockeyText = rdd.filter { case (file, _) => file.contains("hockey") }
val hockeyTF = hockeyText.mapValues(doc => hashingTF.transform(tokenize(doc)))
val hockeyTfIdf = idf.transform(hockeyTF.values)

In [53]:
//Compute cosine similarity of two document vectors that belong to the same category
import breeze.linalg._
// Pick two hockey documents using different sampling seeds and copy each MLlib
// sparse vector into a Breeze SparseVector so that `dot` and `norm` are available.
val hockey1 = hockeyTfIdf.sample(withReplacement = true, fraction = 0.1, seed = 42).first().asInstanceOf[SV]
val breeze1 = new SparseVector(hockey1.indices, hockey1.values, hockey1.size)
val hockey2 = hockeyTfIdf.sample(withReplacement = true, fraction = 0.1, seed = 43).first().asInstanceOf[SV]
val breeze2 = new SparseVector(hockey2.indices, hockey2.values, hockey2.size)
// Cosine similarity = dot product divided by the product of the two norms.
val cosineSim = (breeze1 dot breeze2) / (norm(breeze1) * norm(breeze2))
println(cosineSim)

0.08332038223731995


In [54]:
// Cosine similarity between the first hockey document (breeze1) and a document
// from a different category, comp.graphics.
val graphicsText = rdd.filter { case (file, _) => file.contains("comp.graphics") }
val graphicsTF = graphicsText.mapValues(doc => hashingTF.transform(tokenize(doc)))
val graphicsTfIdf = idf.transform(graphicsTF.values)
val graphics = graphicsTfIdf.sample(withReplacement = true, fraction = 0.1, seed = 42).first().asInstanceOf[SV]
val breezeGraphics = new SparseVector(graphics.indices, graphics.values, graphics.size)
val cosineSim2 = (breeze1 dot breezeGraphics) / (norm(breeze1) * norm(breezeGraphics))
println(cosineSim2)

0.011982956191037503


In [55]:
// Cosine similarity between the first hockey document (breeze1) and a baseball document.
val baseballText = rdd.filter { case (file, _) => file.contains("baseball") }
val baseballTF = baseballText.mapValues(doc => hashingTF.transform(tokenize(doc)))
val baseballTfIdf = idf.transform(baseballTF.values)
val baseball = baseballTfIdf.sample(withReplacement = true, fraction = 0.1, seed = 42).first().asInstanceOf[SV]
val breezeBaseball = new SparseVector(baseball.indices, baseball.values, baseball.size)
val cosineSim3 = (breeze1 dot breezeBaseball) / (norm(breeze1) * norm(breezeBaseball))
println(cosineSim3)

0.013522460083035466


In [57]:
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics

// Assign each distinct newsgroup name an integer index to use as its class label.
val newsGroupsMap = newsgroups.distinct().collect().zipWithIndex.toMap
// Pair every document's newsgroup with its TF-IDF vector; `zip` matches elements by position.
val zipped = newsgroups.zip(tfidf)
val train = zipped.map { case (group, features) =>
    LabeledPoint(newsGroupsMap(group), features)
}
train.cache()

MapPartitionsRDD[107] at map at <console>:88

In [58]:
// Train a Naive Bayes classifier on the labelled TF-IDF training points,
// passing 0.1 as MLlib's `lambda` parameter.
val model = NaiveBayes.train(train, lambda = 0.1)

In [60]:
// Load the test documents and derive each one's class label from its file path,
// using the same second-to-last path component and the label map built for training.
val testPath = "20news-bydate-test/*"
val testRDD = sc.wholeTextFiles(testPath)
val testLabels = testRDD.map { case (file, _) =>
    newsGroupsMap(file.split("/").takeRight(2).head)
}

In [61]:
// Turn the test documents into TF-IDF vectors with the tokenizer, hashing
// transform and IDF model used for training, then attach the labels.
val testTf = testRDD.map { case (_, body) => hashingTF.transform(tokenize(body)) }
val testTfIdf = idf.transform(testTf)
val zippedTest = testLabels.zip(testTfIdf)
val test = zippedTest.map { case (label, features) => LabeledPoint(label, features) }

In [62]:
// Evaluate the model on the test set: pair each prediction with the true label.
val predictionAndLabel = test.map(point => (model.predict(point.features), point.label))
// Accuracy = fraction of test points whose predicted label equals the true label.
val accuracy = predictionAndLabel.filter { case (predicted, actual) =>
    predicted == actual
}.count().toDouble / test.count()
val metrics = new MulticlassMetrics(predictionAndLabel)
println(accuracy)
println(metrics.weightedFMeasure)

0.7915560276155071
0.7810675969031116


In [66]:
// Baseline on raw features: documents are only split on single spaces and hashed
// to term frequencies (no tokenize step, no IDF), then trained and scored as before.
val rawTokens = rdd.map { case (_, body) => body.split(" ") }
val rawTF = rawTokens.map(pieces => hashingTF.transform(pieces))
val rawTrain = newsgroups.zip(rawTF).map { case (group, features) =>
    LabeledPoint(newsGroupsMap(group), features)
}
val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)
// Same raw transformation for the test documents.
val rawTestTF = testRDD.map { case (_, body) => hashingTF.transform(body.split(" ")) }
val rawZippedTest = testLabels.zip(rawTestTF)
val rawTest = rawZippedTest.map { case (label, features) => LabeledPoint(label, features) }
val rawPredictionAndLabel = rawTest.map(point => (rawModel.predict(point.features), point.label))
// Fraction of test points whose predicted label equals the true label.
val rawAccuracy = rawPredictionAndLabel.filter { case (predicted, actual) =>
    predicted == actual
}.count().toDouble / rawTest.count()
println(rawAccuracy)
val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
println(rawMetrics.weightedFMeasure)

0.7648698884758365
0.7653320418573546


In [67]:
//Word2Vec
import org.apache.spark.mllib.feature.Word2Vec
// Configure Word2Vec with a fixed seed and fit it on the tokenized training documents.
val word2vec = new Word2Vec().setSeed(42)
val word2vecModel = word2vec.fit(tokens)

In [68]:
// Print the 20 entries returned by findSynonyms for "hockey", one (word, score) pair per line.
for (synonym <- word2vecModel.findSynonyms("hockey", 20)) println(synonym)

(glens,1.3159770807119657)
(ecac,1.3063226990531056)
(woofers,1.2244201648489768)
(sport,1.192043594806107)
(ahl,1.1814434726624985)
(roster,1.1619491231343353)
(hispanic,1.15425795039727)
(commissioner,1.1538891320841023)
(golf,1.1537870644739696)
(homeruns,1.1333313901547062)
(assistant,1.129485586835786)
(tournament,1.1223773487423643)
(playoff,1.1143282937310677)
(ncaa,1.1106336082384496)
(champs,1.101362200963179)
(rec,1.0981824063192052)
(captains,1.0975511191200764)
(surprises,1.094262378108031)
(motorcycles,1.0774580362567963)
(boxscores,1.0735494981958151)


In [70]:
// Print the 20 entries returned by findSynonyms for "legislation", one (word, score) pair per line.
for (synonym <- word2vecModel.findSynonyms("legislation", 20)) println(synonym)

(accommodates,0.9109466519258614)
(briefed,0.8575139231706556)
(amended,0.8537385855799084)
(agency,0.8405774651559488)
(advocate,0.8399973310548887)
(papers,0.8260221616946232)
(policies,0.8233629198783513)
(aclu,0.8207162878765116)
(procurement,0.8200093598026995)
(journals,0.8186656379374622)
(rkba,0.8181851162363095)
(officials,0.8090259489409931)
(cooperation,0.8046469700111981)
(director,0.8035926081300374)
(senate,0.8034863889546195)
(amendments,0.7998425571269364)
(layman,0.7994458023255351)
(privacy,0.7990481700027887)
(nren,0.7989127478009489)
(enact,0.7976532149877671)
