In [3]:
import sparknlp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.annotator import LemmatizerModel
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF

In [4]:
import os

# Tell Spark where the local Hadoop and JDK installs live.
# Raw strings keep the Windows backslashes literal; the previous plain strings
# relied on unrecognized escapes such as "\h" and "\P", which Python deprecates.
# NOTE(review): these paths are machine-specific; adjust them per workstation.
os.environ['HADOOP_HOME'] = r'C:\hadoop'
os.environ['JAVA_HOME'] = r'C:\Program Files\OpenLogic\jdk-17.0.13.11-hotspot'

In [5]:
# Create the Spark session through sparknlp.start(); the next cell uses it to
# build the sample DataFrame.
sparkSession = sparknlp.start()

In [None]:
# Toy corpus: three short text fragments, each keyed by an integer id.
textSchema = StructType([
    StructField('id', IntegerType(), True),
    StructField('text', StringType(), True),
])
textRows = [
    (1, 'TimeWarner said fourth quarter sales rose 2%'),
    (2, 'to $11.1bn from $10.9bn.'),
    (3, 'For the full-year, TimeWarner posted a profit of $3.36bn'),
]
df = sparkSession.createDataFrame(textRows, schema=textSchema)
df.show()

+---+--------------------+
| id|                text|
+---+--------------------+
|  1|TimeWarner said f...|
|  2|to $11.1bn from $...|
|  3|For the full-year...|
+---+--------------------+



In [None]:
# Turn the raw 'text' column into a 'document' annotation column.
assemblerConfig = (
    sparknlp.DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)
dfAssembled = assemblerConfig.transform(df)
dfAssembled.show(truncate=False)

+---+--------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|id |text                                                    |document                                                                                          |
+---+--------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|1  |TimeWarner said fourth quarter sales rose 2%            |[{document, 0, 43, TimeWarner said fourth quarter sales rose 2%, {sentence -> 0}, []}]            |
|2  |to $11.1bn from $10.9bn.                                |[{document, 0, 23, to $11.1bn from $10.9bn., {sentence -> 0}, []}]                                |
|3  |For the full-year, TimeWarner posted a profit of $3.36bn|[{document, 0, 55, For the full-year, TimeWarner posted a profit of $3.36bn, {sentence -> 0}, []}]|
+---+-----------------------

In [None]:
# Split each 'document' annotation into 'token' annotations.
tokenizerConfig = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)
# The tokenizer has to be fitted before use, so it is wrapped in a one-stage pipeline.
tokenizerPipeline = Pipeline().setStages([tokenizerConfig])
tokenizerModel = tokenizerPipeline.fit(dfAssembled)
dfTokenized = tokenizerModel.transform(dfAssembled)
dfTokenized.show(truncate=False)

+---+--------------------------------------------------------+--------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |text                                                    |document                                                                                          |token                                                                                                                                                                                                                                    

In [None]:
# Drop the stop words 'of' and 'to' from the token column, ignoring case.
wordsCleanerConfig = (
    StopWordsCleaner()
    .setInputCols(['token'])
    .setOutputCol('token_cleaned')
    .setCaseSensitive(False)
    .setStopWords(['of', 'to'])
)
dfCleaned = wordsCleanerConfig.transform(dfTokenized)
dfCleaned.show(truncate=False)

+---+--------------------------------------------------------+--------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Lowercase the cleaned tokens and strip every character that is not a word
# character, digit, or whitespace.
# The cleanup pattern is a raw string so the regex escapes (\w, \d, \s) reach
# the Normalizer verbatim without relying on Python's deprecated handling of
# unrecognized escape sequences.
normalizerConfig = (
    Normalizer()
    .setInputCols(['token_cleaned'])
    .setOutputCol('token_normalized')
    .setLowercase(True)
    .setCleanupPatterns([r'[^\w\d\s]'])
)
pipeLineNormalizer = Pipeline().setStages([normalizerConfig])
dfNormalized = pipeLineNormalizer.fit(dfCleaned).transform(dfCleaned)
dfNormalized.show(truncate=False)

+---+--------------------------------------------------------+--------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# I am not following the course instructions here, which involve running a script to download a pretrained John Snow Labs model.
# This step is kept outside the pipeline because it works on the original text, not on the tokenized text,
# although a custom class could be defined to turn it into a Transformer that operates inside the pipeline.
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
import spacy
nlp = spacy.load("en_core_web_sm")

def lemmatize(text):
    """Return ``text`` with every spaCy token replaced by its lemma.

    Args:
        text: Raw sentence to lemmatize; may be ``None`` or empty.

    Returns:
        The token lemmas joined by single spaces, or ``text`` unchanged when
        it is ``None`` or empty.
    """
    # Null cells reach a Python UDF as None; return early instead of handing
    # None to the spaCy pipeline.
    if not text:
        return text
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)
# End lemmatize

# Expose the lemmatizer as a string-returning UDF and apply it to the raw text column.
lemmatize_udf = udf(lemmatize, StringType())
lemmaColumn = lemmatize_udf(dfNormalized["text"])
dfLemmatized = dfNormalized.withColumn("text_lemmatized", lemmaColumn)
dfLemmatized.show(truncate=False)

+---+--------------------------------------------------------+--------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Convert the normalized annotations into the 'final_token' output column,
# with annotation clean-up enabled.
finisherConfig = (
    Finisher()
    .setInputCols(['token_normalized'])
    .setOutputCols(['final_token'])
    .setCleanAnnotations(True)
)
dfFinished = finisherConfig.transform(dfNormalized)
dfFinished.show(truncate=False)

+---+--------------------------------------------------------+----------------------------------------------------------+
|id |text                                                    |final_token                                               |
+---+--------------------------------------------------------+----------------------------------------------------------+
|1  |TimeWarner said fourth quarter sales rose 2%            |[timewarner, said, fourth, quarter, sales, rose, 2]       |
|2  |to $11.1bn from $10.9bn.                                |[111bn, from, 109bn]                                      |
|3  |For the full-year, TimeWarner posted a profit of $3.36bn|[for, the, fullyear, timewarner, posted, a, profit, 336bn]|
+---+--------------------------------------------------------+----------------------------------------------------------+



In [None]:
# Chain every stage configured above into one pipeline and run it on the raw DataFrame.
completeStages = [
    assemblerConfig,
    tokenizerConfig,
    wordsCleanerConfig,
    normalizerConfig,
    finisherConfig,
]
pipeLineComplete = Pipeline().setStages(completeStages)
modelComplete = pipeLineComplete.fit(df)
dfComplete = modelComplete.transform(df)
dfComplete.show(truncate=False)

+---+--------------------------------------------------------+----------------------------------------------------------+
|id |text                                                    |final_token                                               |
+---+--------------------------------------------------------+----------------------------------------------------------+
|1  |TimeWarner said fourth quarter sales rose 2%            |[timewarner, said, fourth, quarter, sales, rose, 2]       |
|2  |to $11.1bn from $10.9bn.                                |[111bn, from, 109bn]                                      |
|3  |For the full-year, TimeWarner posted a profit of $3.36bn|[for, the, fullyear, timewarner, posted, a, profit, 336bn]|
+---+--------------------------------------------------------+----------------------------------------------------------+



In [None]:
# Bag-of-words counts: keep only the id and token columns, fit the vectorizer
# on them, then transform the same frame.
dfCountVector = dfFinished.select('id', 'final_token')
countVectorizerConfig = (
    CountVectorizer()
    .setInputCol('final_token')
    .setOutputCol('features')
)
countVectorizedModel = countVectorizerConfig.fit(dfCountVector)
dfCountVectorized = countVectorizedModel.transform(dfCountVector)
dfCountVectorized.show(truncate=False)

+---+----------------------------------+---------------------------------------+
|id |final_token                       |features                               |
+---+----------------------------------+---------------------------------------+
|1  |[Hola, compañero, ¿cómo, estás, ?]|(13,[3,5,6,7,10],[1.0,1.0,1.0,1.0,1.0])|
|2  |[estoy, bastante, bien, gracias]  |(13,[2,4,8,11],[1.0,1.0,1.0,1.0])      |
|3  |[Vamos, hacer, ejercicio, NLP]    |(13,[0,1,9,12],[1.0,1.0,1.0,1.0])      |
+---+----------------------------------+---------------------------------------+



In [None]:
# Hash the final tokens into a term-frequency vector with 20 features.
hashingTFConfig = (
    HashingTF()
    .setInputCol('final_token')
    .setOutputCol('hashed_features')
    .setNumFeatures(20)
)
dfHashed = hashingTFConfig.transform(dfFinished)
dfHashed.show(truncate=False)

+---+-----------------------------------+----------------------------------+----------------------------------------+
|id |text                               |final_token                       |hashed_features                         |
+---+-----------------------------------+----------------------------------+----------------------------------------+
|1  |Hola compañero, ¿cómo estás?       |[Hola, compañero, ¿cómo, estás, ?]|(20,[0,7,9,17,18],[1.0,1.0,1.0,1.0,1.0])|
|2  |Yo estoy bastante bien, gracias    |[estoy, bastante, bien, gracias]  |(20,[5,9,11,17],[1.0,1.0,1.0,1.0])      |
|3  |Vamos a hacer este ejercicio de NLP|[Vamos, hacer, ejercicio, NLP]    |(20,[5,6,12,15],[1.0,1.0,1.0,1.0])      |
+---+-----------------------------------+----------------------------------+----------------------------------------+



In [None]:
# Fit IDF on the hashed term-frequency vectors and write the result to 'idf_features'.
idfConfig = (
    IDF()
    .setInputCol('hashed_features')
    .setOutputCol('idf_features')
)
idfModel = idfConfig.fit(dfHashed)
idfResult = idfModel.transform(dfHashed)
idfResult.show(truncate=False)

+---+-----------------------------------+----------------------------------+----------------------------------------+---------------------------------------------------------------------------------------------------------------------+
|id |text                               |final_token                       |hashed_features                         |idf_features                                                                                                         |
+---+-----------------------------------+----------------------------------+----------------------------------------+---------------------------------------------------------------------------------------------------------------------+
|1  |Hola compañero, ¿cómo estás?       |[Hola, compañero, ¿cómo, estás, ?]|(20,[0,7,9,17,18],[1.0,1.0,1.0,1.0,1.0])|(20,[0,7,9,17,18],[0.6931471805599453,0.6931471805599453,0.28768207245178085,0.28768207245178085,0.6931471805599453])|
|2  |Yo estoy bastante bien, gracias    |[estoy, bastant