# Examen 2

_175904 - Jorge III Altamirano Astorga_


## Carga de los Datos

In [1]:
import pyspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.functions import *
from pyspark.sql import DataFrameStatFunctions, DataFrame
from pyspark.sql.types import *
import time

In [2]:
# Spark configuration reference:
# https://spark.apache.org/docs/latest/configuration.html
#
# Resource settings kept for reference (disabled in the original cell):
#   conf.set("spark.driver.memory", "16g")
#   conf.set("spark.driver.cores", 4)
#   conf.set("spark.driver.memoryOverhead", 0.9)
#   conf.set("spark.executor.memory", "32g")
#   conf.set("spark.executor.cores", 12)
conf = (
    SparkConf()
    # Spark NLP 1.5.0: a jar referenced with the local: scheme plus the
    # matching package coordinates.
    .set("spark.jars", "local:/home/jaa6766/spark-nlp_2.11-1.5.0.jar")
    .set("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.5.0")
)
sc = SparkContext(
    master="spark://jupyter.corp.penoles.mx:7077",
    sparkHome="/usr/local/spark/",
    appName="examen-ma-2",
    conf=conf,
)
# SQLContext entry point used by the later cells to read data and run SQL.
spark = SQLContext(sc)
# Last expression of the cell: displays the Spark version of the context.
sc.version

'2.3.0'

In [9]:
# NOTE(review): this stops the SparkContext created in the cell above. Its
# execution count (In [9]) indicates it was run out of order; executed top to
# bottom it would leave `sc`/`spark` unusable for every later cell.
sc.stop()

In [3]:
%%time 
schema_ingredientes = schema=StructType().\
    add("id", data_type=StringType(), nullable=False, metadata=None).\
    add("cuisine", data_type=StringType(), nullable=False, metadata=None).\
    add("ingredients", data_type=ArrayType(StringType()), nullable=True, metadata=None)
train = spark.read.json("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/train.json", 
                        schema=schema_ingredientes,
                        allowUnquotedFieldNames=True,
                        multiLine=True)
print("Schema:")
train.printSchema()
print("Show:")
train.show(5)

Schema:
root
 |-- id: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)

Show:
+-----+-----------+--------------------+
|   id|    cuisine|         ingredients|
+-----+-----------+--------------------+
|10259|      greek|[romaine lettuce,...|
|25693|southern_us|[plain flour, gro...|
|20130|   filipino|[eggs, pepper, sa...|
|22213|     indian|[water, vegetable...|
|13162|     indian|[black pepper, sh...|
+-----+-----------+--------------------+
only showing top 5 rows

CPU times: user 14.8 ms, sys: 8.25 ms, total: 23 ms
Wall time: 10.8 s


### Conteo de Registros

In [4]:
# Number of rows in the loaded training DataFrame.
train.count()

39774

### Manipulación de la Columna

Convertimos la columna de arrays en una cadena de texto separada por comas, para poder procesarla con el pipeline de lenguaje natural y el machine learning.

In [5]:
# Flatten the ingredients array into a single string column:
#   1. cast the array column to a string,
#   2. strip the square brackets from that string,
#   3. expose the result under the original column name `ingredients`.
train2 = (
    train
    .withColumn("ingreds", col("ingredients").cast(StringType()))
    .withColumn(
        "ingredientes",
        # Raw string: "\[" and "\]" are invalid escape sequences in a normal
        # Python string literal (DeprecationWarning since Python 3.6). The
        # regex passed to Spark is unchanged.
        regexp_replace(col("ingreds"), pattern=r"[\[\]]", replacement=""),
    )
    .select("id", "cuisine", col("ingredientes").alias("ingredients"))
)
# Persist the flattened data set to HDFS, replacing any previous copy.
train2.write.parquet("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/train2.parquet", mode="overwrite")

In [6]:
# Reload the flattened data set from the parquet file written by the previous
# cell and preview it (show() prints the first rows with truncated columns).
train2 = spark.read.parquet("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/train2.parquet")
train2.show()

+-----+-----------+--------------------+
|   id|    cuisine|         ingredients|
+-----+-----------+--------------------+
|10259|      greek|romaine lettuce, ...|
|25693|southern_us|plain flour, grou...|
|20130|   filipino|eggs, pepper, sal...|
|22213|     indian|water, vegetable ...|
|13162|     indian|black pepper, sha...|
| 6602|   jamaican|plain flour, suga...|
|42779|    spanish|olive oil, salt, ...|
| 3735|    italian|sugar, pistachio ...|
|16903|    mexican|olive oil, purple...|
|12734|    italian|chopped tomatoes,...|
| 5875|    italian|pimentos, sweet p...|
|45887|    chinese|low sodium soy sa...|
| 2698|    italian|Italian parsley l...|
|41995|    mexican|ground cinnamon, ...|
|31908|    italian|fresh parmesan ch...|
|24717|     indian|tumeric, vegetabl...|
|34466|    british|greek yogurt, lem...|
| 1420|    italian|italian seasoning...|
| 2941|       thai|sugar, hot chili,...|
| 8152| vietnamese|soy sauce, vegeta...|
+-----+-----------+--------------------+
only showing top

## Procesamiento de Lenguaje Natural

### Carga de librerías Spark NLP 

* <https://github.com/JohnSnowLabs/spark-nlp/issues/106>
* <https://stackoverflow.com/questions/34302314/no-module-name-pyspark-error>

In [7]:
## Set up the Spark NLP Python sources.
##
## https://github.com/JohnSnowLabs/spark-nlp/issues/106
## https://stackoverflow.com/questions/34302314/no-module-name-pyspark-error
import os, glob, sys
# glob.glob returns the jar path only when the file exists, so sys.path is
# left untouched on machines without the jar. Presumably the jar is put on
# sys.path so the `sparknlp` package can be imported from it - see the links above.
sys.path.extend(glob.glob("/home/jaa6766/spark-nlp_2.11-1.5.0.jar"))
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.ml import Pipeline
# NOTE(review): pyspark.sql.functions is star-imported again after the
# sparknlp star imports; the import order decides which names win on a clash.
from pyspark.sql.functions import *
from pyspark.ml.fpm import FPGrowth

### Pipeline

In [44]:
# Stage 1: wrap the raw `ingredients` text column as a Spark NLP document.
docAssemblr = (
    DocumentAssembler()
    .setInputCol("ingredients")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizr = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("tokens")
)

# Stage 3: normalize tokens with the pattern below; commas are part of the
# pattern's allowed set because they delimit ingredients and are needed later
# to rebuild the baskets.
normalizr = (
    Normalizer()
    .setInputCols(["tokens"])
    .setOutputCol("normalized")
    .setPattern("[^A-Za-z,]")
)

# Stage 4: stem every normalized token.
stemmr = (
    Stemmer()
    .setInputCols(["normalized"])
    .setOutputCol("stems")
)

# Stage 5: turn the annotations back into a plain column. The output column
# reuses the name `ingredients`; in the result shown below the stems are
# joined with "@".
finishr = (
    Finisher()
    .setInputCols(["stems"])
    .setOutputCols(["ingredients"])
    .setIncludeKeys(False)
)

pipeline = Pipeline(stages=[
    docAssemblr,
    tokenizr,
    normalizr,
    stemmr,
    finishr,
])

# Cache the DataFrame the pipeline actually consumes. The original cached
# `train`, which neither fit() nor transform() below reads.
train2.cache()
model = pipeline.fit(train2)
train3 = model.transform(train2)
train3.printSchema()
print("showing results...")
train3.show(2, truncate=False)

root
 |-- id: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- ingredients: string (nullable = true)

showing results...
+-----+-----------+-----------------------------------------------------------------------------------------------------------------------------------+
|id   |cuisine    |ingredients                                                                                                                        |
+-----+-----------+-----------------------------------------------------------------------------------------------------------------------------------+
|10259|greek      |romain@lettuc@,@black@oliv@,@grape@tomato@,@garlic@,@pepper@,@purpl@onion@,@season@,@garbanzo@bean@,@feta@chees@crumbl             |
|25693|southern_us|plain@flour@,@ground@pepper@,@salt@,@tomato@,@ground@black@pepper@,@thym@,@egg@,@green@tomato@,@yellow@corn@meal@,@milk@,@veget@oil|
+-----+-----------+--------------------------------------------------------------------------------

### Canastas

En este primer paso volvemos a convertir los datos a arrays, de modo que cada receta sea una canasta de ingredientes.

In [11]:
%%time
udf_ingredients = udf(lambda ingredients: 
                      list(set(ingredients)), 
                      returnType=ArrayType(StringType()))
train4 = train3 \
    .withColumn("ingredients", regexp_replace("ingredients", "@?,@?", ",")) \
    .select("id", "cuisine",
        split("ingredients", "\s*,\s*").alias("ingredients")) \
    .cache() \
    .withColumn("ingredients", udf_ingredients("ingredients"))    
#.select( \
#        "id",
#        "cuisine",
#        regexp_replace("ingredients", "\@", "@ ").alias("ingredients")\
#    ) \
train4.write.parquet("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/tmp-train4.parquet", mode="overwrite")
train4 = spark.read.parquet("hdfs://jupyter.corp.penoles.mx:9000/ma2018-examen2/tmp-train4.parquet")
train4.printSchema()
train4.show()

root
 |-- id: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-----+-----------+--------------------+
|   id|    cuisine|         ingredients|
+-----+-----------+--------------------+
|10259|      greek|[purpl@onion, bla...|
|25693|southern_us|[veget@oil, salt,...|
|20130|   filipino|[chicken@liver, g...|
|22213|     indian|[water, veget@oil...|
|13162|     indian|[water, cayenn@pe...|
| 6602|   jamaican|[ground@ginger, f...|
|42779|    spanish|[flat@leaf@parsle...|
| 3735|    italian|[flour, white@alm...|
|16903|    mexican|[iceberg@lettuc, ...|
|12734|    italian|[flat@leaf@parsle...|
| 5875|    italian|[mushroom, canola...|
|45887|    chinese|[crush@red@pepper...|
| 2698|    italian|[italian@parslei@...|
|41995|    mexican|[avocado, crush@r...|
|31908|    italian|[allpurpos@flour,...|
|24717|     indian|[spinach, sweet@p...|
|34466|    british|[confection@sugar...|
| 1420|    

### Items frecuentes

In [12]:
%%time
fp = FPGrowth(minSupport=0.1, minConfidence=0.2, itemsCol="ingredients")
fpm = fp.fit(train4)

CPU times: user 14.3 ms, sys: 3.55 ms, total: 17.8 ms
Wall time: 5.23 s


In [13]:
# Frequent itemsets found by the model, most frequent first, printed without
# truncating the item lists.
fpm.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

+---------------------+-----+
|items                |freq |
+---------------------+-----+
|[salt]               |18048|
|[onion]              |7972 |
|[oliv@oil]           |7971 |
|[water]              |7457 |
|[garlic]             |7380 |
|[sugar]              |6434 |
|[garlic@clove]       |6236 |
|[butter]             |4847 |
|[ground@black@pepper]|4784 |
|[allpurpos@flour]    |4632 |
|[pepper]             |4438 |
|[onion, salt]        |4392 |
|[veget@oil]          |4385 |
|[oliv@oil, salt]     |4177 |
+---------------------+-----+



### Reglas de Asociación y Predicciones

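Con `minSupport=0.1` y `minConfidence=0.2` solo se obtienen unas cuantas reglas de asociación, todas entre `salt` y `oliv@oil` u `onion`. Por ende, las predicciones son muy limitadas.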

In [14]:
# Association rules derived by the fitted model (antecedent, consequent,
# confidence).
fpm.associationRules.show()

+----------+----------+-------------------+
|antecedent|consequent|         confidence|
+----------+----------+-------------------+
|    [salt]|[oliv@oil]|0.23143838652482268|
|    [salt]|   [onion]|0.24335106382978725|
|[oliv@oil]|    [salt]| 0.5240245891356166|
|   [onion]|    [salt]| 0.5509282488710486|
+----------+----------+-------------------+



In [16]:
# Apply the fitted model to the training baskets; the result gains a
# `prediction` column. Show the first 10 rows untruncated.
fpm.transform(train4).show(10, truncate=False)

+-----+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|id   |cuisine    |ingredients                                                                                                                                                                                                                               |prediction       |
+-----+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|10259|greek      |[purpl@onion, black@oliv, season, romain@lettuc, pepper, garlic, feta@chees@crumbl, grape@tomato, garbanzo@bean]                                                  

Ejemplos con ingredientes arbitrarios:
* **Salt**, Eggs

In [17]:
# Predict for a hand-made single-basket DataFrame.
# NOTE(review): "eggs" is not in stemmed form (the pipeline output above shows
# the stem "egg"), so presumably only "salt" can match a rule antecedent here.
fpm.transform(spark.createDataFrame([(["salt", "eggs"], )], ["ingredients"])).show()

+------------+-----------------+
| ingredients|       prediction|
+------------+-----------------+
|[salt, eggs]|[oliv@oil, onion]|
+------------+-----------------+



In [18]:
# NOTE(review): star import placed mid-notebook; it can shadow names imported
# by earlier cells.
from pyspark.sql import *
# Register both DataFrames as SQL tables (only `train4` is queried below).
spark.registerDataFrameAsTable(train3, "train3")
spark.registerDataFrameAsTable(train4, "train4")
# Fetch the ingredient list of the first row matching the WHERE clause.
# NOTE(review): the array_contains literal has a space after every "@", while
# the items in the displayed result have none, so presumably the row is
# selected by `id = 1667` alone - confirm the intended literal.
# NOTE(review): collect()[0] raises IndexError when no row matches.
ingredients_nonunique = spark\
    .sql("SELECT ingredients FROM train4 WHERE id = 1667 OR \
         array_contains(ingredients, 'old@ el@ paso@ mild@ red@ enchilada@ sauc@ ')") \
    .collect()[0].ingredients
ingredients_nonunique

['old@el@paso@mild@red@enchilada@sauc',
 'cook@chicken',
 'mexican@chees@blend',
 'pillsburi@refriger@crescent@dinner@roll',
 'red@enchilada@sauc',
 'refriger@crescent@roll']

* Lettuce, Tomato, **Olive@oil**

In [19]:
# Predict for a second hand-made basket, printed untruncated.
# NOTE(review): "lettuce" is not in the stemmed spelling seen in the training
# baskets (e.g. "romain@lettuc"); "oliv@oil" is the item that appears as a
# rule antecedent in the table above.
fpm.transform(spark.createDataFrame([(["lettuce", "tomato", "oliv@oil"], )], ["ingredients"])).show(truncate=False)

+---------------------------+----------+
|ingredients                |prediction|
+---------------------------+----------+
|[lettuce, tomato, oliv@oil]|[salt]    |
+---------------------------+----------+



In [20]:
%%time
# Number of distinct items across all baskets: explode the arrays into one
# row per item, de-duplicate, count.
train4.select(explode("ingredients")).distinct().count()

CPU times: user 7.24 ms, sys: 2.41 ms, total: 9.65 ms
Wall time: 2.39 s


6681

In [64]:
from sparknlp.annotator import NorvigSweetingModel

In [79]:
# Spell-checking experiment: document assembler -> tokenizer -> pretrained
# Norvig spell checker.
# `pretrained` is called on the class. The original instantiated a throwaway
# NorvigSweetingModel() first and then discarded it.
norvig = NorvigSweetingModel.pretrained(name="spell_fast", language="en")
norvig.setInputCols(["tokens"])
norvig.setOutputCol("ingredients2")
pipeline1 = Pipeline(stages=[
    docAssemblr,
    tokenizr,
    norvig,
])
model1 = pipeline1.fit(train2)
# Keep the result under its own name. The original reassigned `train4`, which
# replaced the basket DataFrame used by the FP-Growth cells with a DataFrame
# of a different schema.
train_spellchecked = model1.transform(train2)
# NOTE(review): the recorded run of this cell failed at show() with
# java.lang.NoClassDefFoundError: com/typesafe/config/Config on an executor;
# presumably the executors' classpath lacks that dependency - verify the
# spark.jars / spark.jars.packages configuration before relying on this cell.
train_spellchecked.show(2)

Exception ignored in: <bound method JavaParams.__del__ of NorvigSweetingModel_4412ab6c985cb6aeb72c>
Traceback (most recent call last):
  File "/opt/intel/intelpython3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 105, in __del__
    SparkContext._active_spark_context._gateway.detach(self._java_obj)
  File "/opt/intel/intelpython3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1897, in detach
    java_object._detach()
AttributeError: 'NoneType' object has no attribute '_detach'


Py4JJavaError: An error occurred while calling o2273.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 65.0 failed 4 times, most recent failure: Lost task 0.3 in stage 65.0 (TID 330, 10.10.208.210, executor 0): java.lang.NoClassDefFoundError: Lcom/typesafe/config/Config;
	at java.lang.Class.getDeclaredFields0(Native Method)
	at java.lang.Class.privateGetDeclaredFields(Class.java:2583)
	at java.lang.Class.getDeclaredField(Class.java:2068)
	at java.io.ObjectStreamClass.getDeclaredSUID(ObjectStreamClass.java:1803)
	at java.io.ObjectStreamClass.access$700(ObjectStreamClass.java:79)
	at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:494)
	at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:482)
	at java.security.AccessController.doPrivileged(Native Method)
	at java.io.ObjectStreamClass.<init>(ObjectStreamClass.java:482)
	at java.io.ObjectStreamClass.lookup(ObjectStreamClass.java:379)
	at java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:669)
	at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1875)
	at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1744)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2032)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1965)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1560)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:426)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:479)
	at sun.reflect.GeneratedMethodAccessor9.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1158)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2168)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:426)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:479)
	at sun.reflect.GeneratedMethodAccessor9.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1158)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2168)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:426)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:80)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.typesafe.config.Config
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	... 80 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3272)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3253)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3252)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2698)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
	at sun.reflect.GeneratedMethodAccessor78.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NoClassDefFoundError: Lcom/typesafe/config/Config;
	at java.lang.Class.getDeclaredFields0(Native Method)
	at java.lang.Class.privateGetDeclaredFields(Class.java:2583)
	at java.lang.Class.getDeclaredField(Class.java:2068)
	at java.io.ObjectStreamClass.getDeclaredSUID(ObjectStreamClass.java:1803)
	at java.io.ObjectStreamClass.access$700(ObjectStreamClass.java:79)
	at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:494)
	at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:482)
	at java.security.AccessController.doPrivileged(Native Method)
	at java.io.ObjectStreamClass.<init>(ObjectStreamClass.java:482)
	at java.io.ObjectStreamClass.lookup(ObjectStreamClass.java:379)
	at java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:669)
	at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1875)
	at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1744)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2032)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1965)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1560)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:426)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:479)
	at sun.reflect.GeneratedMethodAccessor9.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1158)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2168)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:426)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:479)
	at sun.reflect.GeneratedMethodAccessor9.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1158)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2168)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:426)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:80)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.lang.ClassNotFoundException: com.typesafe.config.Config
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	... 80 more


# Fin del Cluster

In [None]:
# Stop the SparkContext at the end of the notebook.
sc.stop()

## Bibliografía

* [Notas del Curso Métodos Analíticos, Luis Felipe González, ITAM Primavera 2018](https://clever-mestorf-ee3f54.netlify.com)
* <https://github.com/JohnSnowLabs/spark-nlp/blob/master/python/example/model-downloader/ModelDownloaderExample.ipynb>
* <https://nlp.johnsnowlabs.com/components.html>
* <https://nlp.johnsnowlabs.com/notebooks.html>
* <https://github.com/JohnSnowLabs/spark-nlp/blob/1.5.0/python/example/vivekn-sentiment/sentiment.ipynb>
