# Chapter 25 - Preprocessing - Text Transformation

In [1]:
%ShowTypes on

Types will be printed.


# Setup

In [13]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.Vector

import spark.implicits._

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

In [3]:
%%html
<!-- To left align the HTML components in Markdown -->
<style>
table {float:left}
</style>

### Spark parition control based on core availability

In [4]:
val NUM_CORES = 4
val NUM_PARTITIONS = 4

lazy val spark: SparkSession = SparkSession.builder()
    .appName("mllib-cross-validation")
    .getOrCreate()

spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://masa:7077")
*/

NUM_CORES: Int = 4
NUM_PARTITIONS: Int = 4
spark: org.apache.spark.sql.SparkSession = <lazy>


spark: org.apache.spark.sql.SparkSession = <lazy>


In [5]:
val configMap = spark.conf.getAll.foreach(println)

(spark.serializer,org.apache.spark.serializer.KryoSerializer)
(spark.driver.host,10.186.87.0)
(spark.eventLog.enabled,true)
(spark.driver.port,45915)
(spark.hadoop.validateOutputSpecs,True)
(spark.repl.class.uri,spark://10.186.87.0:45915/classes)
(spark.jars,file:/home/oonisim/.local/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar)
(spark.repl.class.outputDir,/tmp/spark-16dc876b-3e5b-4038-97a6-36eff616d985/repl-48be94f1-8be2-408f-b45b-117a9f280ab6)
(spark.app.name,mllib-cross-validation)
(spark.driver.memory,3g)
(spark.executor.instances,2)
(spark.history.fs.logdirectory,hdfs://oonisim:8020/logs_spark)
(spark.default.parallelism,16)
(spark.executor.id,driver)
(spark.submit.deployMode,client)
(spark.master,yarn)
(spark.ui.filters,org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter)
(spark.executor.memory,4g)
(spark.eventLog.dir,hdfs://oonisim:8020/logs_spark)
(spark.executor.cores,4)
(spark.driver.appUIAddress,http://10.186.87.0:4040)
(spark.org.a

configMap: Unit = ()


## Constants

In [6]:
val PROTOCOL="file://"
val DATA_DIR="/home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe/data"
val RESULT_DIR="."

PROTOCOL: String = file://
DATA_DIR: String = /home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe/data
RESULT_DIR: String = .


RESULT_DIR: String = .


# Dataframe

In [7]:
val sales = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load(PROTOCOL + DATA_DIR + "/retail-data/by-day/*.csv")
  .coalesce(5)
  .where("Description IS NOT NULL")
val fakeIntDF = spark.read.parquet(PROTOCOL + DATA_DIR + "/simple-ml-integers")
var simpleDF = spark.read.json(PROTOCOL + DATA_DIR + "/simple-ml")
val scaleDF = spark.read.parquet(PROTOCOL + DATA_DIR + "/simple-ml-scaling")

sales: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [InvoiceNo: string, StockCode: string ... 6 more fields]
fakeIntDF: org.apache.spark.sql.DataFrame = [int1: int, int2: int ... 1 more field]
simpleDF: org.apache.spark.sql.DataFrame = [color: string, lab: string ... 2 more fields]
scaleDF: org.apache.spark.sql.DataFrame = [id: int, features: vector]


scaleDF: org.apache.spark.sql.DataFrame = [id: int, features: vector]


In [8]:
sales.cache()
sales.printSchema
sales.show()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538| 

# Text Transformer

In [17]:
val tokenizer = new Tokenizer()
    .setInputCol("Description")
    .setOutputCol("Tokens")
val tokenDF = tokenizer.transform(sales)
    .select("Description", "Tokens")

tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_b443228f5ce9
tokenDF: org.apache.spark.sql.DataFrame = [Description: string, Tokens: array<string>]


tokenDF: org.apache.spark.sql.DataFrame = [Description: string, Tokens: array<string>]


In [19]:
tokenDF.show(5, false)

+-------------------------------+-------------------------------------+
|Description                    |Tokens                               |
+-------------------------------+-------------------------------------+
|RABBIT NIGHT LIGHT             |[rabbit, night, light]               |
|DOUGHNUT LIP GLOSS             |[doughnut, lip, gloss]               |
|12 MESSAGE CARDS WITH ENVELOPES|[12, message, cards, with, envelopes]|
|BLUE HARMONICA IN BOX          |[blue, harmonica, in, box]           |
|GUMBALL COAT RACK              |[gumball, coat, rack]                |
+-------------------------------+-------------------------------------+
only showing top 5 rows



# NGram

In [23]:
val trigram = new NGram().setInputCol("Tokens").setN(3)
trigram.transform(tokenDF.select("Tokens")).show(false)

+------------------------------------------+---------------------------------------------------------------------------------------+
|Tokens                                    |ngram_9dfb0a4f628d__output                                                             |
+------------------------------------------+---------------------------------------------------------------------------------------+
|[rabbit, night, light]                    |[rabbit night light]                                                                   |
|[doughnut, lip, gloss]                    |[doughnut lip gloss]                                                                   |
|[12, message, cards, with, envelopes]     |[12 message cards, message cards with, cards with envelopes]                           |
|[blue, harmonica, in, box]                |[blue harmonica in, harmonica in box]                                                  |
|[gumball, coat, rack]                     |[gumball coat rack]      

trigram: org.apache.spark.ml.feature.NGram = ngram_9dfb0a4f628d


trigram: org.apache.spark.ml.feature.NGram = ngram_9dfb0a4f628d


# Frequency of word appearance
* Term: Word
* Document: in which term appears
* Vocabrary: collection of terms

## Term Freququency (TF)
Number of times a term appeared in all documents

## Document Frequencey (DF)

Number of documents in which a term appeared

In [33]:
val cv = new CountVectorizer()
  .setInputCol("Tokens")
  .setOutputCol("Frequencies")
  .setVocabSize(5000)
  .setMinTF(1)
  .setMinDF(2)
val fittedCV = cv.fit(tokenDF.select("Tokens"))
fittedCV.transform(tokenDF.select("Tokens")).show(false)

+------------------------------------------+----------------------------------------------------------------+
|Tokens                                    |Frequencies                                                     |
+------------------------------------------+----------------------------------------------------------------+
|[rabbit, night, light]                    |(2374,[149,185,212],[1.0,1.0,1.0])                              |
|[doughnut, lip, gloss]                    |(2374,[462,463,491],[1.0,1.0,1.0])                              |
|[12, message, cards, with, envelopes]     |(2374,[35,41,166,782,942],[1.0,1.0,1.0,1.0,1.0])                |
|[blue, harmonica, in, box]                |(2374,[10,16,36,352],[1.0,1.0,1.0,1.0])                         |
|[gumball, coat, rack]                     |(2374,[228,280,407],[1.0,1.0,1.0])                              |
|[skulls, , water, transfer, tattoos]      |(2374,[11,40,133,1169,1170],[1.0,1.0,1.0,1.0,1.0])              |
|[feltcraf

cv: org.apache.spark.ml.feature.CountVectorizer = cntVec_94969b77d336
fittedCV: org.apache.spark.ml.feature.CountVectorizerModel = cntVec_94969b77d336


fittedCV: org.apache.spark.ml.feature.CountVectorizerModel = cntVec_94969b77d336
