# Pyspark: Natural Language Processing with Disaster Tweets

https://www.kaggle.com/c/nlp-getting-started

In [1]:
import time
import numpy as np

In [1]:
# The config below is for https://microsoft.github.io/SynapseML/
import pyspark
spark = pyspark.sql.SparkSession.builder.appName("MyApp") \
            .config("spark.jars.packages", "com.microsoft.azure:synapseml:0.9.1") \
            .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") \
            .getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x7f9d54756730>


In [2]:
trainPath = 'nlp-getting-started/train.csv'
testPath = 'nlp-getting-started/test.csv'

trainData = spark.read.format('csv').options(header='true', inferSchema='true', multiLine=True).load(trainPath)
testData = spark.read.format('csv').options(header='true', inferSchema='true', multiLine=True).load(testPath)

print('Number of row in Training:', trainData.count())
print('Number of row in Test:    ', testData.count())

Number of row in Training: 7613
Number of row in Test:     3263


## Preprocessing

Note that we only create several new features from existing features. Adding more new features clearly helps to understand more the dataset.

In [3]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasInputCols, HasOutputCols, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable  
from pyspark.ml import Pipeline 
import pyspark.sql.functions as F
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec, StringIndexer,OneHotEncoder, VectorAssembler, RobustScaler

### Processing Null

We create a custom transformer to replace Null. In the case of "keyword" and "location", we replace null value by a symbole e.g. "\$" not an empty string "" since OneHotEncoder has an error with an empty string.

In [4]:
class FillNanTransformer(Transformer, HasInputCols, DefaultParamsReadable, DefaultParamsWritable):
    nanReplacement = Param(Params._dummy(), "nanReplacement", "nanReplacement", typeConverter=TypeConverters.toString)
    
    @keyword_only
    def __init__(self, inputCols=None, nanReplacement=None):
        super(FillNanTransformer, self).__init__()
        self._setDefault(nanReplacement="")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, nanReplacement=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    
    def getNanReplacement(self):
        return self.getOrDefault(self.nanReplacement)
    
    def _transform(self, dataset):
        nanReplacement = self.getNanReplacement()
        dataset = dataset.na.fill(value=nanReplacement,subset=self.getInputCols())
        return dataset

In [5]:
fillNanTransformer = FillNanTransformer(inputCols=["keyword", "location"], nanReplacement="$")
textFillNanTransformer = FillNanTransformer(inputCols=["text"], nanReplacement="")

### Processing urls in "text"

We remove urls from "text" and create a new column to verify if a tex contains an url.

In [6]:
class RemovePatternTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    pattern = Param(Params._dummy(), "pattern", "pattern", typeConverter=TypeConverters.toString)
    
    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, pattern=None):
        super(RemovePatternTransformer, self).__init__()
        self._setDefault(pattern="")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, pattern=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def getPattern(self):
        return self.getOrDefault(self.pattern)
    
    def _transform(self, dataset):
        pattern = self.getPattern()
        dataset = dataset.withColumn(self.getOutputCol(), F.regexp_replace(F.col(self.getInputCol()), pattern, ""))
        return dataset
    
class CheckPatternTransformer(Transformer, HasInputCol, HasOutputCol):
    pattern = Param(Params._dummy(), "pattern", "pattern", typeConverter=TypeConverters.toString)
    
    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, pattern=None):
        super(CheckPatternTransformer, self).__init__()
        self._setDefault(pattern="")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, pattern=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    
    def getPattern(self):
        return self.getOrDefault(self.pattern)
    
    def _transform(self, dataset):
        pattern = self.getPattern()
        dataset = dataset.withColumn(self.getOutputCol(), F.when(F.col(self.getInputCol()).rlike(pattern),1.).otherwise(0.))
        return dataset

In [7]:
removeUrlTransformer = RemovePatternTransformer(inputCol="text", outputCol="textNoUrl", pattern="(https?://\S+)")
checkUrlTransformer = CheckPatternTransformer(inputCol="text", outputCol="textIsContainedUrl", pattern="(https?://\S+)")

### Get lengths for "keyword" and "text" (without urls)

In [8]:
class GetLengthTransformer(Transformer, HasInputCols, HasOutputCols):
    @keyword_only
    def __init__(self, inputCols=None, outputCols=None):
        super(GetLengthTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, outputCols=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    
    def _transform(self, dataset):
        for inputCol, outputCol in zip(self.getInputCols(), self.getOutputCols()):
            dataset = dataset.withColumn(outputCol, F.length(inputCol))
        return dataset

In [9]:
getLengthTransformer = GetLengthTransformer(inputCols=["keyword","textNoUrl"], outputCols=["keywordLen", "textNoUrlLen"])

### Processing discrete features: "keyword", "location" and length features

Indexing "keyword" and "location"

In [10]:
keywordIndexer = StringIndexer(inputCol="keyword", outputCol="keywordIndex", handleInvalid="keep")
locationIndexer = StringIndexer(inputCol="location", outputCol="locationIndex", handleInvalid="keep")

### Processing "text" (without urls)

We remove special characters and stopwords, then use word2vec to obtain a vector of "textNoUrl"

In [11]:
regexTokenizer = RegexTokenizer(inputCol="textNoUrl", outputCol="textArrayWord", pattern="\\W")

stopWordsRemover = StopWordsRemover(inputCol="textArrayWord", outputCol="textNoSW")
word2Vec = Word2Vec(vectorSize=50, windowSize=10, minCount=0, inputCol="textNoSW", outputCol="textVec")

We concatenate strings in "keyword", "location", "textNoUrl" and then apply the same procedure as for "textNoUrl"

In [12]:
class ConcatenateTransformer(Transformer, HasInputCols, HasOutputCol):
    @keyword_only
    def __init__(self, inputCols=None, outputCol=None):
        super(ConcatenateTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, outputCol=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        dataset = dataset.withColumn(self.getOutputCol(), F.col(self.getInputCols()[0]))
        for colName in self.getInputCols()[1:]:
            dataset = dataset.withColumn(self.getOutputCol(), 
                F.concat_ws('@', F.col(self.getOutputCol()), F.col(colName)))
        return dataset

In [13]:
concatStringTransformer = ConcatenateTransformer(inputCols=["keyword", "location", "textNoUrl"], outputCol="concatString")
concatStringRegexTokenizer = RegexTokenizer(inputCol="concatString", outputCol="concatStringArrayWord", pattern="\\W")
concatStringStopWordsRemover = StopWordsRemover(inputCol="concatStringArrayWord", outputCol="concatStringArrayWordNoSW")

We would like observe effects of removing stopwords by generating two vectors "concatStringArrayWord" containing stopwords and "concatStringArrayWordNoSW" without stopwords

In [14]:
concatStringWord2Vec = Word2Vec(vectorSize=50, windowSize=10, minCount=0, inputCol="concatStringArrayWord", outputCol="concatStringVec")
concatStringNoSWWord2Vec = Word2Vec(vectorSize=50, windowSize=10, minCount=0, inputCol="concatStringArrayWordNoSW", outputCol="concatStringNoSWVec")

### Combining several features

We would like to observe how these feature sets affect the performance. Note that we scale these feature sets by using RobustScaler.

**"discreteFeatures"**: we only use discrete features

In [15]:
discreteFeaturesAssembler = VectorAssembler(inputCols=["keywordIndex", "locationIndex", "textIsContainedUrl",
                                                      "keywordLen", "textNoUrlLen"], 
                                            outputCol="discreteFeatures")


discreteFeaturesRobustScaler = RobustScaler(inputCol="discreteFeatures", outputCol="discreteFeaturesScale",
                                            withScaling=True, withCentering=True, lower=0.25, upper=0.75)

**"discreteOneHotEncoderFeatures"**: we use these discrete features one-hot-encoded.

In [16]:
oneHotEncoder = OneHotEncoder(inputCols=["keywordIndex", "locationIndex", "textIsContainedUrl"],
                              outputCols=["keywordVec", "locationVec", "textIsContainedUrlVec"],
                              handleInvalid="keep")

In [17]:
discreteOneHotEncoderFeaturesAssembler = VectorAssembler(inputCols=["keywordVec", "locationVec", "textIsContainedUrlVec",
                                                      "keywordLen", "textNoUrlLen"], 
                                            outputCol="discreteOneHotEncoderFeatures")


discreteOneHotEncoderFeaturesRobustScaler = RobustScaler(inputCol="discreteOneHotEncoderFeatures", outputCol="discreteOneHotEncoderFeaturesScale",
                                            withScaling=True, withCentering=True, lower=0.25, upper=0.75)

**"discreteAndTextFeatures"**: we add features of text in "discreteFeatures". Note that this could decay the effects of these discrete features.

In [18]:
discreteAndTextFeaturesAssembler = VectorAssembler(inputCols=["discreteFeatures", "textVec"],
                                                   outputCol="discreteAndTextFeatures")

discreteAndTextFeaturesRobustScaler = RobustScaler(inputCol="discreteAndTextFeatures", outputCol="discreteAndTextFeaturesScale",
                                                   withScaling=True, withCentering=True, lower=0.25, upper=0.75)

**"discreteOneHotEncoderAndTextFeatures"**: we add features of text in "discreteOneHotEncoderFeatures". Note that this could decay the effects of these discrete features.

In [19]:
discreteOneHotEncoderAndTextFeaturesAssembler = VectorAssembler(inputCols=["discreteOneHotEncoderFeatures", "textVec"],
                                                   outputCol="discreteOneHotEncoderAndTextFeatures")

discreteOneHotEncoderAndTextFeaturesRobustScaler = RobustScaler(inputCol="discreteOneHotEncoderAndTextFeatures", outputCol="discreteOneHotEncoderAndTextFeaturesScale",
                                                   withScaling=True, withCentering=True, lower=0.25, upper=0.75)

### Combining all preprocessing stages

In [20]:
preprocessingPipeline = Pipeline(stages=[fillNanTransformer, textFillNanTransformer,
                                         removeUrlTransformer, regexTokenizer, stopWordsRemover, word2Vec,
                                         keywordIndexer, locationIndexer, checkUrlTransformer, getLengthTransformer, 
                                         #oneHotEncoder,
                                         #concatStringTransformer, concatStringRegexTokenizer, concatStringStopWordsRemover, 
                                         #concatStringWord2Vec, concatStringNoSWWord2Vec,
                                         discreteFeaturesAssembler, 
                                         #discreteOneHotEncoderFeaturesAssembler, 
                                         discreteAndTextFeaturesAssembler,
                                         #discreteOneHotEncoderAndTextFeaturesAssembler,
                                         discreteFeaturesRobustScaler, 
                                         #discreteOneHotEncoderFeaturesRobustScaler,
                                         discreteAndTextFeaturesRobustScaler,
                                         #discreteOneHotEncoderAndTextFeaturesRobustScaler
                                        ])

## Training

In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier, LinearSVC
from synapse.ml.lightgbm import LightGBMClassifier

labelCol = "target"
evaluator = BinaryClassificationEvaluator(labelCol=labelCol, rawPredictionCol="prediction", metricName="areaUnderROC")

In [22]:
preprocessingModel = preprocessingPipeline.fit(trainData)

trainDataPreprocessed = preprocessingModel.transform(trainData)
testDataPreprocessed = preprocessingModel.transform(testData)

### Several observations on our feature sets

**Important features by using RandomForestClassifier**

In [26]:
featuresCol = "discreteFeaturesScale"

featuresImportanceModel = RandomForestClassifier(featuresCol=featuresCol, labelCol=labelCol).fit(trainDataPreprocessed)

for column in zip(discreteFeaturesAssembler.getInputCols(), list(featuresImportanceModel.featureImportances)):
     print(f"{column[0]:20}: {column[1]}")

keywordIndex        : 0.17150511557605724
locationIndex       : 0.019880210358211636
textIsContainedUrl  : 0.41729726391614824
keywordLen          : 0.2596097385419455
textNoUrlLen        : 0.13170767160763733


We see that containing an url in "text" has the highest impact on the target.  
*Note that we can generate more features based on n-gram, number of urls, etc.*

**We observe scores based on several feature sets:**
* "discreteFeaturesScale": "keywordIndex", "locationIndex", "keywordLen", "textNoUrlLen", "textIsContainedUrl"
* "discreteOneHotEncoderFeaturesScale": "discreteFeatures" after one-hot-encoding
* "vecText": vector of "text" (not containing urls) using Word2Vec
* "discreteAndTextFeaturesScale": "discreteFeatures" and "vecText"
* "discreteOneHotEncoderAndTextFeaturesScale": "discreteOneHotEncoderFeatures" and "vecText"
* "concatStringVec": concatenate "keyword", "location" and "text" (not containing urls), then apply Word2Vec
* "concatStringNoSWVec": concatenate "keyword", "location" and "text" (not containing urls and stopwords), then apply Word2Vec

In [27]:
featuresCols = ["discreteFeaturesScale", "discreteOneHotEncoderFeaturesScale", "textVec", "discreteAndTextFeaturesScale", 
                "discreteOneHotEncoderAndTextFeaturesScale", "concatStringVec", "concatStringNoSWVec"]

algorithmList = {"LR":   LogisticRegression(labelCol=labelCol),
                 "DTC":  DecisionTreeClassifier(labelCol=labelCol),
                 "RFC":  RandomForestClassifier(labelCol=labelCol),
                 "GBTC": GBTClassifier(labelCol=labelCol),
                 "LSVC": LinearSVC(labelCol=labelCol),
                 "LGBMC":LightGBMClassifier(labelCol=labelCol)
                }

from random import randint
for param in featuresCols:
    scores = []
    for name, algorithm in zip(algorithmList.keys(), algorithmList.values()):
        trainSet, validSet = trainDataPreprocessed.randomSplit([0.9, 0.1], seed=randint(11, 2021))
        algorithm.setFeaturesCol(param)
        prediction = algorithm.fit(trainSet).transform(validSet)
        scores.append(evaluator.evaluate(prediction))
    print(f'Param {param:50}: {np.round(np.mean(scores), 5)}')

Param discreteFeaturesScale                             : 0.67728
Param discreteOneHotEncoderFeaturesScale                : 0.6132
Param textVec                                           : 0.71302
Param discreteAndTextFeaturesScale                      : 0.73
Param discreteOneHotEncoderAndTextFeaturesScale         : 0.69689
Param concatStringVec                                   : 0.729
Param concatStringNoSWVec                               : 0.71234


**We observe that "discreteAndTextFeaturesScale" produces the best result.**  
* Using one-hot-encoder harms the performance in this situation.
* Removing stopwords reduces the score, which means that these stopwords can help to decide if a tweet is about a disaster.
* Using more discrete features can help to gain some more points.

### Training on several classification algorithms

In [23]:
trainSet, validSet = trainDataPreprocessed.randomSplit([0.9, 0.1], seed=2021)

We use default hyper-parameters.

In [25]:
featuresCol = "discreteAndTextFeaturesScale"

algorithmList = {"LR":   LogisticRegression(featuresCol=featuresCol, labelCol=labelCol),
                 "DTC":  DecisionTreeClassifier(featuresCol=featuresCol, labelCol=labelCol),
                 "RFC":  RandomForestClassifier(featuresCol=featuresCol, labelCol=labelCol),
                 "GBTC": GBTClassifier(featuresCol=featuresCol, labelCol=labelCol),
                 "MPC":  MultilayerPerceptronClassifier(featuresCol=featuresCol, labelCol=labelCol, layers=[55,2]),
                 "LSVC": LinearSVC(featuresCol=featuresCol, labelCol=labelCol,),
                 "LGBMC":LightGBMClassifier(featuresCol=featuresCol, labelCol=labelCol)
                }

In [32]:
for name, algorithm in zip(algorithmList.keys(), algorithmList.values()):
    startTime = time.time()
    model = algorithm.fit(trainSet)
    prediction = model.transform(validSet)
    score = evaluator.evaluate(prediction)
    print(f'{name:4}: {np.round(score,5)} in {np.round(time.time() - startTime, 3)}s')

LR  : 0.72925 in 12.207s
DTC : 0.67405 in 11.687s
RFC : 0.6984 in 11.648s
GBTC: 0.73299 in 20.61s
MPC : 0.73037 in 12.813s
LSVC: 0.71278 in 19.878s
LGBMC: 0.76522 in 7.099s


We observe that LightGBMClassifier (default hyper-parameters) produces the highest score.

We train LightGBMClassifier with all data, **the score of test set is 0.74410**.

**Tuning LightGBMClassifier** using TuneHyperparameters  
https://microsoft.github.io/SynapseML/docs/documentation/estimators/estimators_core/

Note that we need to convert our dataframe into a simple datafram with each column representing a feature and one column for target. Other types (e.g. vector, array) can lead to an error.

In [27]:
from synapse.ml.automl import *
from synapse.ml.train import *

from pyspark.ml.functions import vector_to_array
import pyspark.sql.functions as F
import re

featuresCol = "discreteAndTextFeaturesScale"

trainSetAllHP = (trainDataPreprocessed.withColumn("feature", vector_to_array(featuresCol)))\
.select([labelCol]+ [F.col("feature")[i] for i in range(55)])

trainSetHP = (trainSet.withColumn("feature", vector_to_array(featuresCol)))\
.select([labelCol]+ [F.col("feature")[i] for i in range(55)])

validSetHP = (validSet.withColumn("feature", vector_to_array(featuresCol)))\
.select([labelCol]+ [F.col("feature")[i] for i in range(55)])

testSetHP = (testDataPreprocessed.withColumn("feature", vector_to_array(featuresCol)))\
.select([F.col("feature")[i] for i in range(55)])


# We remove "[]" in the column names.
trainSetAllHP = trainSetAllHP.select([F.col(col).alias(re.sub("[^0-9a-zA-Z$]+","",col)) for col in trainSetAllHP.columns])
trainSetHP = trainSetHP.select([F.col(col).alias(re.sub("[^0-9a-zA-Z$]+","",col)) for col in trainSetHP.columns])
validSetHP = validSetHP.select([F.col(col).alias(re.sub("[^0-9a-zA-Z$]+","",col)) for col in validSetHP.columns])
testSetHP = testSetHP.select([F.col(col).alias(re.sub("[^0-9a-zA-Z$]+","",col)) for col in testSetHP.columns])

In [28]:
trainSetHP.select("target", "feature0", "feature1").printSchema()

root
 |-- target: integer (nullable = true)
 |-- feature0: double (nullable = true)
 |-- feature1: double (nullable = true)



In [None]:
from synapse.ml.automl import *
from synapse.ml.train import *
import sklearn.metrics as metrics

lgbmc = LightGBMClassifier(boostingType='dart',
                           objective= 'binary',
                           metric= 'auc',
                           isUnbalance= True,
                           numIterations= 300)

smlmodels = [lgbmc]
mmlmodels = [TrainClassifier(model=model, labelCol=labelCol) for model in smlmodels]

paramBuilder = (HyperparamBuilder()
.addHyperparam(lgbmc, lgbmc.learningRate, RangeHyperParam(0.01, 0.5))
.addHyperparam(lgbmc, lgbmc.maxDepth, DiscreteHyperParam([1,30]))
.addHyperparam(lgbmc, lgbmc.numLeaves, DiscreteHyperParam([10,200]))
.addHyperparam(lgbmc, lgbmc.featureFraction, RangeHyperParam(0.1, 1.0))
.addHyperparam(lgbmc, lgbmc.baggingFraction, RangeHyperParam(0.1, 1.0))
.addHyperparam(lgbmc, lgbmc.baggingFreq, RangeHyperParam(0, 3))
)

searchSpace = paramBuilder.build()

randomSpace = RandomSpace(searchSpace)

In [None]:
bestModel = TuneHyperparameters(evaluationMetric="AUC", models=mmlmodels, numFolds=2, 
                                numRuns=len(mmlmodels) * 2, parallelism=1, 
                                paramSpace=randomSpace.space(), seed=0).fit(trainSetHP)

prediction = bestModel.transform(validSetHP)
predLabel = np.array(prediction.select('scored_labels').collect()).squeeze()
trueLabel = np.array(prediction.select('target').collect()).squeeze()
print(metrics.roc_auc_score(trueLabel, predLabel))

**The score of test set is 0.77536.**

## Conclusion

**In this notebook, we showed several steps in processing text and the performance of several models implemented in Pyspark.**