# Apache Spark with Scala

More details and magic commands here: https://github.com/apache/incubator-toree/blob/master/etc/examples/notebooks/magic-tutorial.ipynb


## Word Count problem

Defining a object for counting frequence of words from 2 files and printing the common words that appear more than 100 times on both files

In [3]:
import org.apache.spark._
import org.apache.spark.streaming._


class Analiser {

  // Args = path/to/text0.txt path/to/text1.txt
  def main(args: Array[String]) {

    // create Spark context with Spark configuration
    //não precisa, já que está em modo shell
    //val sc = new SparkContext(new SparkConf().setAppName("Contagem de Palavra"))
    //val ssc = new StreamingContext(sc, Seconds(1))

    val startTime = System.nanoTime()    
      
    println("TEXT1")

    // read first text file and split into lines
    val txt1 = sc.textFile(args(0))

    // contar palavras do texto 1 e imprimir as 5 palavras com as maiores ocorrencias (ordem DECRESCENTE)

    val wordsTxt1 = txt1.flatMap(line => line.split(" "))

    val wordCountTxt1 = wordsTxt1.map(word => (word.toLowerCase.replaceAll("[,.!?:;]",""), 1))
            .reduceByKey(_ + _)
            .filter(item => item._1.length > 3) // apenas palavras com mais de tres caracteres
            .map(item => item.swap) // esse trecho é necessário pois não há função sortByValue() para values, então inverteremos
            .sortByKey(false)
            .map(item => item.swap)

    val output1 = wordCountTxt1.take(5) // apenas as 5 mais frequentes

    output1.foreach(item => println(item._1 + "=" + item._2.toString))

    println("TEXT2")

    // read second text file and split each document into words
    val txt2 = sc.textFile(args(1))

    // transformações do texto 2 análogas ao texto 1

    val wordsTxt2 = txt2.flatMap(line => line.split(" "))

    val wordCountTxt2 = wordsTxt2.map(word => (word.toLowerCase.replaceAll("[,.!?:;]",""),1))
            .reduceByKey(_ + _) // list1(k1, v1) list2(k2,v2) -> if(k1.value==k2.value): merge k1 with k2 in (k3, v1+v2)
            .filter(item => item._1.length > 3) // apenas palavras com mais de tres caracteres
            .map(item => item.swap) // esse trecho é necessário pois não há função sortByValue() para values, então inverteremos
            .sortByKey(false)
            .map(item => item.swap)

    val output2 = wordCountTxt2.take(5)

    output2.foreach(item => println(item._1 + "=" + item._2.toString))

    // comparar resultado e imprimir na ordem ALFABETICA todas as palavras que aparecem MAIS que 100 vezes nos 2 textos

    println("COMMON")

    val filt1 = wordCountTxt1.filter(_._2 > 100).sortByKey().keys // palavras do texto 1
    val filt2 = wordCountTxt2.filter(_._2 > 100).sortByKey().keys // palavras do texto 2
      
    val common =  filt1.intersection(filt2) // intersecção

    common.collect().toList.sorted.foreach{ x => println(x)}
      
     val endTime = System.nanoTime()
     println("Time elapsed: " + (endTime-startTime)/100000000 + " seconds")

  }
}

## Now let's run the object ...

In [5]:
val files = Array("./resources/data/input1.txt", "./resources/data/input2.txt")

val myAnaliser = new Analiser

myAnaliser.main(files)

TEXT1
said=456
alice=377
that=234
with=172
very=139
TEXT2
vibrating=1
young=10
stumbled=8
intimately=1
someone=1
COMMON
little
said
that
they
this
with
Time elapsed: 8 seconds


## Desired Output:  
TEXT1  
said=456  
alice=377  
that=234  
with=172  
very=139  
TEXT2  
that=759  
with=448  
were=365  
from=326  
they=302  
COMMON  
little  
said  
that  
they  
this  
with  
