# Examen 2

_ 175904 - Jorge III Altamirano Astorga_


## Carga de los Datos

In [None]:
import pyspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *
import time

In [None]:
# https://spark.apache.org/docs/latest/configuration.html
conf = SparkConf()
conf.set("spark.worker.cleanup.appDataTtl", 24*60*60)
conf.set("spark.worker.cleanup.enabled", True)
conf.set("spark.driver.memory", "60g")
conf.set("spark.driver.cores", 14)
conf.set("spark.driver.memoryOverhead", 0.9)
conf.set("spark.executor.memory", "60g")
conf.set("spark.executor.cores", 14)
conf.set("spark.jars", "file:/usr/local/spark-2.3.0-bin-hadoop2.7/jars/spark-nlp_2.11-1.5.0.jar")
conf.set("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.5.3")
sc = SparkContext(master = "spark://jupyter.corp.penoles.mx:7077", 
                  sparkHome="/usr/local/spark/",
                  appName="examen-ma-2", conf=conf)
spark = SQLContext(sc)
sc.version

In [None]:
sc.stop()

In [3]:
%%time 
schema_ingredientes = schema=StructType().\
    add("id", data_type=StringType(), nullable=False, metadata=None).\
    add("cuisine", data_type=StringType(), nullable=False, metadata=None).\
    add("ingredients", data_type=ArrayType(StringType()), nullable=True, metadata=None)
train = spark.read.json("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/train.json", 
                        schema=schema_ingredientes,
                        allowUnquotedFieldNames=True,
                        multiLine=True)
print("Schema:")
train.printSchema()
print("Show:")
train.show(5)

Schema:
root
 |-- id: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)

Show:
+-----+-----------+--------------------+
|   id|    cuisine|         ingredients|
+-----+-----------+--------------------+
|10259|      greek|[romaine lettuce,...|
|25693|southern_us|[plain flour, gro...|
|20130|   filipino|[eggs, pepper, sa...|
|22213|     indian|[water, vegetable...|
|13162|     indian|[black pepper, sh...|
+-----+-----------+--------------------+
only showing top 5 rows

CPU times: user 8.9 ms, sys: 2.46 ms, total: 11.4 ms
Wall time: 8.17 s


### Conteo de Registros

In [14]:
train.count()

39774

### Manipulación de la Columna

Quitamos los arrays, para operar mejor el machine learning.

In [5]:
train2 = train\
    .withColumn("ingreds", 
                col("ingredients").cast(StringType()))\
    .withColumn("ingredientes",
               regexp_replace(col("ingreds"), pattern="[\[\]]", replacement=""))\
    .select("id", "cuisine", col("ingredientes").alias("ingredients"))
train2.write.parquet("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/train2.parquet", mode="overwrite")

In [6]:
train2 = spark.read.parquet("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/train2.parquet")
train2.show()

+-----+-----------+--------------------+
|   id|    cuisine|         ingredients|
+-----+-----------+--------------------+
|10259|      greek|romaine lettuce, ...|
|25693|southern_us|plain flour, grou...|
|20130|   filipino|eggs, pepper, sal...|
|22213|     indian|water, vegetable ...|
|13162|     indian|black pepper, sha...|
| 6602|   jamaican|plain flour, suga...|
|42779|    spanish|olive oil, salt, ...|
| 3735|    italian|sugar, pistachio ...|
|16903|    mexican|olive oil, purple...|
|12734|    italian|chopped tomatoes,...|
| 5875|    italian|pimentos, sweet p...|
|45887|    chinese|low sodium soy sa...|
| 2698|    italian|Italian parsley l...|
|41995|    mexican|ground cinnamon, ...|
|31908|    italian|fresh parmesan ch...|
|24717|     indian|tumeric, vegetabl...|
|34466|    british|greek yogurt, lem...|
| 1420|    italian|italian seasoning...|
| 2941|       thai|sugar, hot chili,...|
| 8152| vietnamese|soy sauce, vegeta...|
+-----+-----------+--------------------+
only showing top

## Procesamiento de Lenguaje Natural

### Carga de librerías Spark NLP 

* <https://github.com/JohnSnowLabs/spark-nlp/issues/106>
* <https://stackoverflow.com/questions/34302314/no-module-name-pyspark-error>

In [7]:
## setup sparknlp source
## 
## https://github.com/JohnSnowLabs/spark-nlp/issues/106
## https://stackoverflow.com/questions/34302314/no-module-name-pyspark-error
import os, glob, sys
sys.path.extend(glob.glob("/usr/local/spark-2.3.0-bin-hadoop2.7/jars/spark-nlp_2.11-1.5.0.jar"))
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.ml import Pipeline
import os, glob, sys
from pyspark.sql.functions import *
from pyspark.ml.fpm import FPGrowth

### Pipeline

In [8]:
docAssemblr = DocumentAssembler()\
  .setInputCol("ingredients")\
  .setOutputCol("document")

tokenizr = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("tokens")#    .addInfixPattern("(\p{L}+)(n't\b)") \
    
normalizr = Normalizer() \
    .setInputCols(["tokens"]) \
    .setOutputCol("normalized") \
    .setPattern("[^A-Za-z,]")
    
stemmr = Stemmer() \
  .setInputCols(["normalized"]) \
  .setOutputCol("stems")
    
finishr = Finisher() \
    .setInputCols(["stems"]) \
    .setOutputCols(["ingredients"]) \
    .setIncludeKeys(False)

pipeline = Pipeline(stages = [
    docAssemblr,
    tokenizr, 
    normalizr,
    stemmr,
    finishr
])

train.cache()
model = pipeline.fit(train2)
train3 = model.transform(train2)
train3.printSchema()
print("showing results...")
train3.show(2, truncate=False)

root
 |-- id: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- ingredients: string (nullable = true)

showing results...
+-----+-----------+-----------------------------------------------------------------------------------------------------------------------------------+
|id   |cuisine    |ingredients                                                                                                                        |
+-----+-----------+-----------------------------------------------------------------------------------------------------------------------------------+
|10259|greek      |romain@lettuc@,@black@oliv@,@grape@tomato@,@garlic@,@pepper@,@purpl@onion@,@season@,@garbanzo@bean@,@feta@chees@crumbl             |
|25693|southern_us|plain@flour@,@ground@pepper@,@salt@,@tomato@,@ground@black@pepper@,@thym@,@egg@,@green@tomato@,@yellow@corn@meal@,@milk@,@veget@oil|
+-----+-----------+--------------------------------------------------------------------------------

### Canastas

En este primer paso lo que realizamos es volver a pasar los datos a array, para que sean canastas

In [9]:
%%time
udf_ingredients = udf(lambda ingredients: 
                      list(set(ingredients)), 
                      returnType=ArrayType(StringType()))
train4 = train3 \
    .withColumn("ingredients", regexp_replace("ingredients", "@?,@?", ",")) \
    .select("id", "cuisine",
        split("ingredients", "\s*,\s*").alias("ingredients")) \
    .cache() \
    .withColumn("ingredients", udf_ingredients("ingredients"))    
#.select( \
#        "id",
#        "cuisine",
#        regexp_replace("ingredients", "\@", "@ ").alias("ingredients")\
#    ) \
train4.write.parquet("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/tmp-train4.parquet", mode="overwrite")
train4 = spark.read.parquet("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/tmp-train4.parquet")
train4.printSchema()
train4.show()

root
 |-- id: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-----+-----------+--------------------+
|   id|    cuisine|         ingredients|
+-----+-----------+--------------------+
|10259|      greek|[purpl@onion, bla...|
|25693|southern_us|[veget@oil, salt,...|
|20130|   filipino|[chicken@liver, g...|
|22213|     indian|[water, veget@oil...|
|13162|     indian|[water, cayenn@pe...|
| 6602|   jamaican|[ground@ginger, f...|
|42779|    spanish|[flat@leaf@parsle...|
| 3735|    italian|[flour, white@alm...|
|16903|    mexican|[iceberg@lettuc, ...|
|12734|    italian|[flat@leaf@parsle...|
| 5875|    italian|[mushroom, canola...|
|45887|    chinese|[crush@red@pepper...|
| 2698|    italian|[italian@parslei@...|
|41995|    mexican|[avocado, crush@r...|
|31908|    italian|[allpurpos@flour,...|
|24717|     indian|[spinach, sweet@p...|
|34466|    british|[confection@sugar...|
| 1420|    

### Items frecuentes

In [10]:
%%time
fp = FPGrowth(minSupport=0.1, minConfidence=0.2, itemsCol="ingredients")
fpm = fp.fit(train4)

CPU times: user 15.1 ms, sys: 6.84 ms, total: 22 ms
Wall time: 4.14 s


In [11]:
fpm.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

+---------------------+-----+
|items                |freq |
+---------------------+-----+
|[salt]               |18048|
|[onion]              |7972 |
|[oliv@oil]           |7971 |
|[water]              |7457 |
|[garlic]             |7380 |
|[sugar]              |6434 |
|[garlic@clove]       |6236 |
|[butter]             |4847 |
|[ground@black@pepper]|4784 |
|[allpurpos@flour]    |4632 |
|[pepper]             |4438 |
|[onion, salt]        |4392 |
|[veget@oil]          |4385 |
|[oliv@oil, salt]     |4177 |
+---------------------+-----+



### Reglas de Asociación y Predicciones

Al parecer no se observan reglas de asociación. Por ende, no hay predicciones

In [12]:
fpm.associationRules.show()

+----------+----------+-------------------+
|antecedent|consequent|         confidence|
+----------+----------+-------------------+
|    [salt]|[oliv@oil]|0.23143838652482268|
|    [salt]|   [onion]|0.24335106382978725|
|[oliv@oil]|    [salt]| 0.5240245891356166|
|   [onion]|    [salt]| 0.5509282488710486|
+----------+----------+-------------------+



In [13]:
fpm.transform(train4).show(10, truncate=False)

+-----+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|id   |cuisine    |ingredients                                                                                                                                                                                                                               |prediction       |
+-----+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|10259|greek      |[purpl@onion, black@oliv, season, romain@lettuc, pepper, garlic, feta@chees@crumbl, grape@tomato, garbanzo@bean]                                                  

Ejemplos con ingredientes arbitrarios
* **Salt**, Eggs

In [15]:
fpm.transform(spark.createDataFrame([(["salt", "eggs"], )], ["ingredients"])).show()

+------------+-----------------+
| ingredients|       prediction|
+------------+-----------------+
|[salt, eggs]|[oliv@oil, onion]|
+------------+-----------------+



In [16]:
spark.registerDataFrameAsTable(train3, "train3")
spark.registerDataFrameAsTable(train4, "train4")
ingredients_nonunique = spark\
    .sql("SELECT ingredients FROM train4 WHERE id = 1667 OR \
         array_contains(ingredients, 'old@ el@ paso@ mild@ red@ enchilada@ sauc@ ')") \
    .collect()[0].ingredients
ingredients_nonunique

['old@el@paso@mild@red@enchilada@sauc',
 'cook@chicken',
 'mexican@chees@blend',
 'pillsburi@refriger@crescent@dinner@roll',
 'red@enchilada@sauc',
 'refriger@crescent@roll']

* Lettuce, Tomato, **Olive@oil**

In [17]:
fpm.transform(spark.createDataFrame([(["lettuce", "tomato", "oliv@oil"], )], ["ingredients"])).show(truncate=False)

+---------------------------+----------+
|ingredients                |prediction|
+---------------------------+----------+
|[lettuce, tomato, oliv@oil]|[salt]    |
+---------------------------+----------+



In [18]:
%%time
train4.select(explode("ingredients")).distinct().count()

CPU times: user 8.19 ms, sys: 1.03 ms, total: 9.22 ms
Wall time: 2.62 s


6681

In [3]:
%%time
# norvig = NorvigSweetingModel().pretrained(name="spell_fast", language="en")
path_dict = "file:/home/jaa6766/enwiki-latest-all-titles-in-ns0-transform"
# path_dict = "hdfs://jupyter.corp.penoles.mx:9000/spell-dicts/enwiki-latest-all-titles-in-ns0-transform"
norvig = NorvigSweetingApproach()
norvig.setInputCols(["tokens"])
norvig.setOutputCol("ingredients2")
# norvig.setCorpus("hdfs://jupyter.corp.penoles.mx:9000/spell-dicts/enwiki-latest-all-titles-in-ns0-transform")
norvig.setDictionary(path_dict)
pipeline1 = Pipeline(stages = [
    docAssemblr,
    tokenizr, 
    norvig
#     normalizr,
#     stemmr,
#     finishr
])
model1 = pipeline1.fit(train2)
train4 = model1.transform(train2)
train4.show(2)

SyntaxError: invalid syntax (<unknown>, line 9)

# Borrar

Pruebas norvig

In [None]:
sc.stop()

In [4]:
airlines = spark.read.csv("s3a://jorge-altamirano/flights/airlines.csv")
airlines.show()

+---------+--------------------+
|      _c0|                 _c1|
+---------+--------------------+
|IATA_CODE|             AIRLINE|
|       UA|United Air Lines ...|
|       AA|American Airlines...|
|       US|     US Airways Inc.|
|       F9|Frontier Airlines...|
|       B6|     JetBlue Airways|
|       OO|Skywest Airlines ...|
|       AS|Alaska Airlines Inc.|
|       NK|    Spirit Air Lines|
|       WN|Southwest Airline...|
|       DL|Delta Air Lines Inc.|
|       EV|Atlantic Southeas...|
|       HA|Hawaiian Airlines...|
|       MQ|American Eagle Ai...|
|       VX|      Virgin America|
+---------+--------------------+



In [2]:
import pyspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *
import time, os, re, glob, sys
# https://spark.apache.org/docs/latest/configuration.html
conf = SparkConf()
conf.set("spark.worker.cleanup.appDataTtl", 24*60*60)
conf.set("spark.worker.cleanup.enabled", True)
conf.set("spark.driver.memory", "60g")
conf.set("spark.driver.cores", 5)
conf.set("spark.driver.memoryOverhead", 0.9)
conf.set("spark.executor.memory", "60g")
conf.set("spark.executor.cores", 5)
conf.set("spark.jars", "file:/usr/local/spark-2.3.0-bin-hadoop2.7/jars/spark-nlp_2.11-1.5.3.jar," +
         "file:/usr/local/spark-2.3.0-bin-hadoop2.7/jars/config-1.3.0.jar," + #needed nlp
         "local:/usr/local/spark-2.3.0-bin-hadoop2.7/jars/hadoop-common-2.7.3.jar," + #needed by aws
         "local:/usr/local/spark-2.3.0-bin-hadoop2.7/jars/commons-cli-1.2.jar," + #needed by aws
         "file:/usr/local/spark-2.3.0-bin-hadoop2.7/jars/hadoop-aws-2.7.3.jar," + #needed by aws
         "file:/usr/local/spark-2.3.0-bin-hadoop2.7/jars/aws-java-sdk-1.7.4.jar") #needed by aws
conf.set("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.5.3")
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
### get they creds to login to AWS :-)
HOME = os.environ["HOME"]
aws_id, aws_key = (None, None)
with open(HOME+"/.aws/credentials", "r") as f:
    for line in f:
        line = line.strip()
        if "aws_access_key_id" in line:
            aws_id = re.sub("^.*aws_access_key_id\s*=\s*", "", line)
        elif "aws_secret_access_key" in line:
            aws_key = re.sub("^.*aws_secret_access_key\s*=\s*", "", line)
conf.set("spark.hadoop.fs.s3a.access.key", aws_id)
conf.set("spark.hadoop.fs.s3a.secret.key", aws_key)
aws_id, aws_key = (None, None)
### end getting keys
sc = SparkContext(master = "spark://jupyter.corp.penoles.mx:7077", 
                  sparkHome="/usr/local/spark/",
                  appName="examen-ma-2", conf=conf)
spark = SQLContext(sc)
## setup sparknlp source
## 
## https://github.com/JohnSnowLabs/spark-nlp/issues/106
## https://stackoverflow.com/questions/34302314/no-module-name-pyspark-error
sys.path.extend(glob.glob("/usr/local/spark-2.3.0-bin-hadoop2.7/jars/spark-nlp_2.11-1.5.3.jar"))
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.ml.fpm import FPGrowth

train2 = spark.read.parquet("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/train2.parquet").cache()
train2.show(1)

+-----+-------+--------------------+
|   id|cuisine|         ingredients|
+-----+-------+--------------------+
|10259|  greek|romaine lettuce, ...|
+-----+-------+--------------------+
only showing top 1 row



In [6]:
%%time
docAssemblr = DocumentAssembler()\
  .setInputCol("ingredients")\
  .setOutputCol("document")

tokenizr = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("tokens")#    .addInfixPattern("(\p{L}+)(n't\b)") \
    
normalizr = Normalizer() \
    .setInputCols(["tokens"]) \
    .setOutputCol("normalized") \
    .setPattern("[^A-Za-z,]")
    
path_dict = "file:/home/jaa6766/enwiki-latest-all-titles-in-ns0-transform"
# path_dict = "hdfs://jupyter.corp.penoles.mx:9000/spell-dicts/enwiki-latest-all-titles-in-ns0-transform"
norvig = NorvigSweetingApproach() \
    .setInputCols(["normalized"]) \
    .setOutputCol("ingredients2") \
    .setDictionary(path_dict)
# norvig.setCorpus("hdfs://jupyter.corp.penoles.mx:9000/spell-dicts/enwiki-latest-all-titles-in-ns0-transform")
    
stemmr2 = Stemmer() \
  .setInputCols(["ingredients2"]) \
  .setOutputCol("stems")
    
finishr2 = Finisher() \
    .setInputCols(["stems"]) \
    .setOutputCols(["ingredients3"]) \
    .setIncludeKeys(False) \
    .setAnnotationSplitSymbol(" ")

pipeline1 = Pipeline(stages = [
    docAssemblr,
    tokenizr, 
    normalizr,
    norvig,
    stemmr2,
    finishr2
])
model1 = pipeline1.fit(train2)
train4 = model1.transform(train2)
train4.show(2)

+-----+-----------+--------------------+--------------------+
|   id|    cuisine|         ingredients|        ingredients3|
+-----+-----------+--------------------+--------------------+
|10259|      greek|romaine lettuce, ...|romain lettuc , b...|
|25693|southern_us|plain flour, grou...|plain flour , gro...|
+-----+-----------+--------------------+--------------------+
only showing top 2 rows

CPU times: user 80.9 ms, sys: 21.7 ms, total: 103 ms
Wall time: 1min 6s


In [3]:
train4.select("ingredients3", "cuisine").show(4, truncate=False, vertical=True)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------
 ingredients3 | romain lettuc , black oliv , grape tomato , garlic , pepper , purpl onion , season , garbanzo bean , feta chees crumbl                            
 cuisine      | greek                                                                                                                                             
-RECORD 1---------------------------------------------------------------------------------------------------------------------------------------------------------
 ingredients3 | plain flour , ground pepper , salt , tomato , ground black pepper , thym , egg , green tomato , yellow corn meal , milk , veget oil               
 cuisine      | southern_us                                                                                                                                       
-RECORD 2-------------

In [7]:
%%time
train4 = train4 \
    .select("ingredients3", "cuisine") \
    .coalesce(1)

CPU times: user 6.58 ms, sys: 2.25 ms, total: 8.82 ms
Wall time: 57.8 ms


In [7]:
train4.write.csv("s3a://jorge-altamirano/ma2018-examen2/clean-ingredients.csv", mode="overwrite")

## /fin Borrar

In [45]:
train2.select("cuisine").distinct().orderBy("cuisine").show()
train2.select("cuisine").distinct().count()

+------------+
|     cuisine|
+------------+
|   brazilian|
|     british|
|cajun_creole|
|     chinese|
|    filipino|
|      french|
|       greek|
|      indian|
|       irish|
|     italian|
|    jamaican|
|    japanese|
|      korean|
|     mexican|
|    moroccan|
|     russian|
| southern_us|
|     spanish|
|        thai|
|  vietnamese|
+------------+



Py4JJavaError: An error occurred while calling o305.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: ResultStage 27 (count at NativeMethodAccessorImpl.java:0) has failed the maximum allowable number of times: 4. Most recent failure reason: org.apache.spark.shuffle.FetchFailedException: Failed to connect to /10.10.208.212:42474 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:519) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:450) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:61) 	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434) 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32) 	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.agg_doAggregateWithoutKey$(Unknown Source) 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source) 	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614) 	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253) 	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247) 	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830) 	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) 	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) 	at org.apache.spark.scheduler.Task.run(Task.scala:109) 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345) 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 	at java.lang.Thread.run(Thread.java:748) Caused by: java.io.IOException: Failed to connect to /10.10.208.212:42474 	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:245) 	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:187) 	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:113) 	at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:141) 	at org.apache.spark.network.shuffle.RetryingBlockFetcher.lambda$initiateRetry$0(RetryingBlockFetcher.java:169) 	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) 	at java.util.concurrent.FutureTask.run(FutureTask.java:266) 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 	at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:138) 	... 1 more Caused by: io.netty.channel.AbstractChannel$AnnotatedNoRouteToHostException: No route to host: /10.10.208.212:42474 	at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) 	at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717) 	at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:323) 	at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:340) 	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:633) 	at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:580) 	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:497) 	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:459) 	at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:858) 	... 2 more Caused by: java.net.NoRouteToHostException: No route to host 	... 11 more 
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1368)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1817)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:297)
	at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2770)
	at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2769)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3253)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3252)
	at org.apache.spark.sql.Dataset.count(Dataset.scala:2769)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [35]:
!cat wikipedia-title-corpus-download.sh

#!/bin/sh
#english español Deutsch portugués francais tagalog italiano vietnamese
for lang in en es de pt fr lt it vi; 
do
	echo "Downloading $lang"
	wget -c "https://dumps.wikimedia.org/${lang}wiki/latest/${lang}wiki-latest-all-titles-in-ns0.gz"
	echo -n Transforming $lang...
	zcat ${lang}wiki-latest-all-titles-in-ns0.gz | \
		 sed 's!_\+! !g;s![^a-z ]!!ig;s!^\s\+!!;s!\s\+$!!;/^\s*$/d' | \
		 tr '[:upper:]' '[:lower:]' | tr ' ' '\n' | sort -u > "${lang}wiki-latest-all-titles-in-ns0-transform"
	echo " done!"
done


# Fin del Cluster

In [None]:
sc.stop()

## Bibliografía

* [Notas del Curso Métodos Analíticos, Luis Felipe González, ITAM Primavera 2018](https://clever-mestorf-ee3f54.netlify.com)
* <https://github.com/JohnSnowLabs/spark-nlp/blob/master/python/example/model-downloader/ModelDownloaderExample.ipynb>
* <https://nlp.johnsnowlabs.com/components.html>
* <https://nlp.johnsnowlabs.com/notebooks.html>
* <https://github.com/JohnSnowLabs/spark-nlp/blob/1.5.0/python/example/vivekn-sentiment/sentiment.ipynb>
* [Indix - Lessons from Using Spark to Process Large Amounts of Data – Part I. Retrieved 2018-05-14](https://www.indix.com/blog/engineering/lessons-from-using-spark-to-process-large-amounts-of-data-part-i/)
* [Spark NLP - Dependencies](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp_2.11/1.5.3)
* [StackOverflow: Troubleshotting on Spark](https://stackoverflow.com/a/36903019/7323086)