# ScalaSpark - Unstructured Text

### Read in text file

In [2]:
val bookText = sc.textFile("hdfs://localhost:54310/user/andrew/books/pg20417.txt")
bookText.take(5)

Array("The Project Gutenberg EBook of The Outline of Science, Vol. 1 (of 4), by ", J. Arthur Thomson, "", This eBook is for the use of anyone anywhere at no cost and with, almost no restrictions whatsoever.  You may copy it, give it away or)

### Generate summary statistics

In [8]:
//count number of lines
println(bookText.count(), "lines in book")

//count number of words
val wordCounts = bookText.flatMap(line=>line.split(" ")).map(word => (word,1)).reduceByKey((a,b)=>a+b)
println(wordCounts.map(_._2).sum())

//Longest line
println(bookText.map(line => line.split(" ").size).reduce((a,b)=> if (a>b) a else b))

(12760,lines in book)
124481.0
66


### Identify most popular words

In [10]:
val freqWords = wordCounts.sortBy(_._2, ascending=false) //rdd.sortBy(pair => pair._2)
freqWords.take(30)

Array(("",14637), (the,7906), (of,5425), (and,2759), (a,2422), (to,2168), (is,2068), (in,2048), (that,1273), (are,921), (it,912), (The,876), (which,793), (as,757), (be,663), (by,635), (on,631), (or,630), (we,601), (with,584), (from,571), (for,522), (have,472), (was,441), (there,434), (has,427), (an,409), (not,406), (It,402), (its,402))

### Other common word statistics

In [13]:
//Number of words that appear >1000 times
println(freqWords.filter(pair => pair._2>1000).count())

//Avg number of times a word appears
wordCounts.map(_._2).sum()/wordCounts.count()

9


6.900277161862528