# Pyspark for Natural Language Processing with Disaster Tweets

https://www.kaggle.com/c/nlp-getting-started

In [44]:
import time
import numpy as np

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext(master = "local", appName = "App").getOrCreate()
spark = SparkSession.builder.getOrCreate()

print(sc, sc.version, spark)

In [3]:
trainPath = 'nlp-getting-started/train.csv'
testPath = 'nlp-getting-started/test.csv'

trainData = spark.read.format('csv').options(header='true', inferSchema='true', multiLine=True).load(trainPath)
testData = spark.read.format('csv').options(header='true', inferSchema='true', multiLine=True).load(testPath)

print('Number of row in Training:', trainData.count())
print('Number of row in Test:    ', testData.count())

Number of row in Training: 7613
Number of row in Test:     3263


## Preprocessing

In [5]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasInputCols, HasOutputCols, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable  
from pyspark.ml import Pipeline 
import pyspark.sql.functions as F
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec, StringIndexer,OneHotEncoder, VectorAssembler, RobustScaler

### Processing Null

We create a custom transformer to replace Null. In the case of "keyword" and "location", we replace null value by a symbole e.g. "\$" not an empty string "" since OneHotEncoder has an error with an empty string.

In [6]:
class FillNanTransformer(Transformer, HasInputCols, DefaultParamsReadable, DefaultParamsWritable):
    nanReplacement = Param(Params._dummy(), "nanReplacement", "nanReplacement", typeConverter=TypeConverters.toString)
    
    @keyword_only
    def __init__(self, inputCols=None, nanReplacement=None):
        super(FillNanTransformer, self).__init__()
        self._setDefault(nanReplacement="")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, nanReplacement=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    
    def getNanReplacement(self):
        return self.getOrDefault(self.nanReplacement)
    
    def _transform(self, dataset):
        nanReplacement = self.getNanReplacement()
        dataset = dataset.na.fill(value=nanReplacement,subset=self.getInputCols())
        return dataset

In [7]:
fillNanTransformer = FillNanTransformer(inputCols=["keyword", "location"], nanReplacement="$")
textFillNanTransformer = FillNanTransformer(inputCols=["text"], nanReplacement="")

### Processing urls in "text"

We remove urls from "text" and create a new column to verify if a tex contains an url.

In [8]:
class RemovePatternTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    pattern = Param(Params._dummy(), "pattern", "pattern", typeConverter=TypeConverters.toString)
    
    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, pattern=None):
        super(RemovePatternTransformer, self).__init__()
        self._setDefault(pattern="")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, pattern=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def getPattern(self):
        return self.getOrDefault(self.pattern)
    
    def _transform(self, dataset):
        pattern = self.getPattern()
        dataset = dataset.withColumn(self.getOutputCol(), F.regexp_replace(F.col(self.getInputCol()), pattern, ""))
        return dataset
    
class CheckPatternTransformer(Transformer, HasInputCol, HasOutputCol):
    pattern = Param(Params._dummy(), "pattern", "pattern", typeConverter=TypeConverters.toString)
    
    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, pattern=None):
        super(CheckPatternTransformer, self).__init__()
        self._setDefault(pattern="")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, pattern=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    
    def getPattern(self):
        return self.getOrDefault(self.pattern)
    
    def _transform(self, dataset):
        pattern = self.getPattern()
        dataset = dataset.withColumn(self.getOutputCol(), F.when(F.col(self.getInputCol()).rlike(pattern),1.).otherwise(0.))
        return dataset

In [9]:
removeUrlTransformer = RemovePatternTransformer(inputCol="text", outputCol="textNoUrl", pattern="(https?://\S+)")
checkUrlTransformer = CheckPatternTransformer(inputCol="text", outputCol="textIsContainedUrl", pattern="(https?://\S+)")

### Get lengths for "keyword" and "text" (without urls)

In [11]:
class GetLengthTransformer(Transformer, HasInputCols, HasOutputCols):
    @keyword_only
    def __init__(self, inputCols=None, outputCols=None):
        super(GetLengthTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, outputCols=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    
    def _transform(self, dataset):
        for inputCol, outputCol in zip(self.getInputCols(), self.getOutputCols()):
            dataset = dataset.withColumn(outputCol, F.length(inputCol))
        return dataset

In [12]:
getLengthTransformer = GetLengthTransformer(inputCols=["keyword","textNoUrl"], outputCols=["keywordLen", "textNoUrlLen"])

### Processing discrete features: "keyword", "location" and length features

Indexing "keyword" and "location"

In [14]:
keywordIndexer = StringIndexer(inputCol="keyword", outputCol="keywordIndex", handleInvalid="keep")
locationIndexer = StringIndexer(inputCol="location", outputCol="locationIndex", handleInvalid="keep")

In [15]:
oneHotEncoder = OneHotEncoder(inputCols=["keywordIndex", "locationIndex", "textIsContainedUrl"],
                              outputCols=["keywordVec", "locationVec", "textIsContainedUrlVec"],
                              handleInvalid="keep")

### Processing "text" (without urls)

We remove special characters and stopwords, then use word2vec to obtain a vector of "textNoUrl"

In [29]:
regexTokenizer = RegexTokenizer(inputCol="textNoUrl", outputCol="textArrayWord", pattern="\\W")

stopWordsRemover = StopWordsRemover(inputCol="textArrayWord", outputCol="textNoSW")
word2Vec = Word2Vec(vectorSize=50, windowSize=10, minCount=0, inputCol="textNoSW", outputCol="textVec")

We concatenate strings in "keyword", "location", "textNoUrl" and then apply the same procedure as for "textNoUrl"

In [19]:
class ConcatenateTransformer(Transformer, HasInputCols, HasOutputCol):
    @keyword_only
    def __init__(self, inputCols=None, outputCol=None):
        super(ConcatenateTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, outputCol=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        dataset = dataset.withColumn(self.getOutputCol(), F.col(self.getInputCols()[0]))
        for colName in self.getInputCols()[1:]:
            dataset = dataset.withColumn(self.getOutputCol(), 
                F.concat_ws('@', F.col(self.getOutputCol()), F.col(colName)))
        return dataset

In [20]:
concatStringTransformer = ConcatenateTransformer(inputCols=["keyword", "location", "textNoUrl"], outputCol="concatString")
concatStringRegexTokenizer = RegexTokenizer(inputCol="concatString", outputCol="concatStringArrayWord", pattern="\\W")
concatStringStopWordsRemover = StopWordsRemover(inputCol="concatStringArrayWord", outputCol="concatStringArrayWordNoSW")

We would like observe effects of removing stopwords by generating two vectors "concatStringArrayWord" containing stopwords and "concatStringArrayWordNoSW" without stopwords

In [21]:
concatStringWord2Vec = Word2Vec(vectorSize=50, windowSize=10, minCount=0, inputCol="concatStringArrayWord", outputCol="concatStringVec")
concatStringNoSWWord2Vec = Word2Vec(vectorSize=50, windowSize=10, minCount=0, inputCol="concatStringArrayWordNoSW", outputCol="concatStringNoSWVec")

### Combining several features

We would like to observe how these feature sets affect the performance. Note that we scale these feature sets by using RobustScaler.

**"discreteFeatures"**: we only use discrete features

In [25]:
discreteFeaturesAssembler = VectorAssembler(inputCols=["keywordVec", "locationVec", "textIsContainedUrlVec",
                                                      "keywordLen", "textNoUrlLen"], 
                                            outputCol="discreteFeatures")


discreteFeaturesRobustScaler = RobustScaler(inputCol="discreteFeatures", outputCol="discreteFeaturesScale",
                                            withScaling=True, withCentering=True, lower=0.25, upper=0.75)

**"discreteAndTextFeatures"**: we add features of text in "discreteFeatures". Note that this could decay the effects of these discrete features.

In [26]:
discreteAndTextFeaturesAssembler = VectorAssembler(inputCols=["discreteFeatures", "textVec"],
                                                   outputCol="discreteAndTextFeatures")

discreteAndTextFeaturesRobustScaler = RobustScaler(inputCol="discreteAndTextFeatures", outputCol="discreteAndTextFeaturesScale",
                                                   withScaling=True, withCentering=True, lower=0.25, upper=0.75)

### Combining all preprocessing stages

In [30]:
preprocessingPipeline = Pipeline(stages=[fillNanTransformer, textFillNanTransformer,
                                         removeUrlTransformer, regexTokenizer, stopWordsRemover, word2Vec,
                                         keywordIndexer, locationIndexer, checkUrlTransformer, getLengthTransformer, oneHotEncoder,
                                         concatStringTransformer, concatStringRegexTokenizer, concatStringStopWordsRemover, 
                                         concatStringWord2Vec, concatStringNoSWWord2Vec,
                                         discreteFeaturesAssembler, 
                                         discreteAndTextFeaturesAssembler,
                                         discreteFeaturesRobustScaler, discreteAndTextFeaturesRobustScaler
                                        ])

## Training

In [46]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier, LinearSVC

evaluator = BinaryClassificationEvaluator(labelCol=labelCol, rawPredictionCol="prediction", metricName="areaUnderROC")
labelCol = "target"

In [31]:
preprocessingModel = preprocessingPipeline.fit(trainData)

trainDataPreprocessed = preprocessingModel.transform(trainData)
testDataPreprocessed = preprocessingModel.transform(testData)

In [32]:
trainSet, validSet = trainDataPreprocessed.randomSplit([0.9, 0.1], seed=12345)

### Several observations on our feature sets

**Important features by using RandomForestClassifier**

In [40]:
featuresCol = "discreteIndexFeaturesScale"

discreteIndexFeaturesAssembler = VectorAssembler(inputCols=["keywordIndex", "locationIndex", "textIsContainedUrl",
                                                            "keywordLen", "textNoUrlLen"], 
                                                 outputCol="discreteIndexFeatures")

discreteIndexFeaturesRobustScaler = RobustScaler(inputCol="discreteIndexFeatures", outputCol="discreteIndexFeaturesScale",
                                            withScaling=True, withCentering=True, lower=0.25, upper=0.75)

checkSet = discreteIndexFeaturesAssembler.transform(trainSet)
checkSet = discreteIndexFeaturesRobustScaler.fit(checkSet).transform(checkSet)

featuresImportanceModel = RandomForestClassifier(featuresCol=featuresCol, labelCol=labelCol).fit(checkSet)

for column in zip(discreteIndexFeaturesAssembler.getInputCols(), list(featuresImportanceModel.featureImportances)):
     print(f"{column[0]:20}: {column[1]}")

keywordIndex        : 0.1870955999292841
locationIndex       : 0.01481530355956237
textIsContainedUrl  : 0.40378388888329625
keywordLen          : 0.2682762143879448
textNoUrlLen        : 0.12602899323991243


We see that containing an url in "text" has the highest impact on the target.

**We observe scores based on several feature sets:**
* "discreteFeaturesScale": "keywordIndex", "locationIndex", "keywordLength", "textNoLinkLength", "containLink"
* "discreteAndTextFeaturesScale": "discreteFeaturesScale" and "vecText"
* "vecText" only
* "concatStringVec": concatenate "keyword", "location" and "text" (not containing urls), then apply Word2Vec
* "concatStringNoSWVec": concatenate "keyword", "location" and "text" (not containing urls and stopwords), then apply Word2Vec

In [49]:
featuresCols = ["discreteFeaturesScale", "discreteAndTextFeaturesScale", 
                "textVec", "concatStringVec", "concatStringNoSWVec"]

algorithmList = {"LR":   LogisticRegression(featuresCol=featuresCol, labelCol=labelCol),
                 "DTC":  DecisionTreeClassifier(featuresCol=featuresCol, labelCol=labelCol),
                 "RFC":  RandomForestClassifier(featuresCol=featuresCol, labelCol=labelCol),
                 "GBTC": GBTClassifier(featuresCol=featuresCol, labelCol=labelCol),
                 "LSVC": LinearSVC(featuresCol=featuresCol, labelCol=labelCol,)
                }

for param in featuresCols:
    scores = []
    for name, algorithm in zip(algorithmList.keys(), algorithmList.values()):
        algorithm.setFeaturesCol(param)
        prediction = algorithm.fit(trainSet).transform(validSet)
        scores.append(evaluator.evaluate(prediction))
    print(f'Param {param:50}: {np.round(np.mean(scores), 5)}')

Param discreteFeaturesScale                             : 0.60127
Param discreteAndTextFeaturesScale                      : 0.676
Param textVec                                           : 0.68316
Param concatStringVec                                   : 0.69456
Param concatStringNoSWVec                               : 0.70012


**We observe that "concatStringNoSWVec" (concatenating strings in "keyword", "location" and "text" without urls and stopwords) produces the best result**.  
Note that ranks of the feature sets (discreteAndTextFeaturesScale, textVec, concatStringVec) for each model can be different.

### Training on several classification algorithms

We use default hyper-parameters.

In [53]:
featuresCol = "concatStringNoSWVec"

algorithmList = {"LR":   LogisticRegression(featuresCol=featuresCol, labelCol=labelCol),
                 "DTC":  DecisionTreeClassifier(featuresCol=featuresCol, labelCol=labelCol),
                 "RFC":  RandomForestClassifier(featuresCol=featuresCol, labelCol=labelCol),
                 "GBTC": GBTClassifier(featuresCol=featuresCol, labelCol=labelCol),
                 "MPC":  MultilayerPerceptronClassifier(featuresCol=featuresCol, labelCol=labelCol, layers=[50,2]),
                 "LSVC": LinearSVC(featuresCol=featuresCol, labelCol=labelCol,)
                }

In [54]:
for name, algorithm in zip(algorithmList.keys(), algorithmList.values()):
    startTime = time.time()
    model = algorithm.fit(trainSet)
    prediction = model.transform(validSet)
    score = evaluator.evaluate(prediction)
    print(f'{name:4}: {np.round(score,5)} in {np.round(time.time() - startTime, 3)}s')

LR  : 0.69614 in 12.368s
DTC : 0.69632 in 11.587s
RFC : 0.70923 in 11.838s
GBTC: 0.7139 in 18.22s
MPC : 0.68951 in 12.233s
LSVC: 0.68502 in 20.369s


We observe that GBTClassifier (default hyper-parameters) produces the highest score.  
We train GBTClassifier with all data, **the score of test set is 0.72939**.

**Cross-validation**

In [55]:
algorithmName = "GBTC"
algorithm = algorithmList[algorithmName]
predictionTest = algorithm.fit(trainDataPreprocessed).transform(testDataPreprocessed)
predLabelTest = np.array(predictionTest.select('prediction').collect()).squeeze()

In [57]:
import pandas as pd
submission = pd.read_csv('nlp-getting-started/sample_submission.csv')
submission['target'] = submission['target'] + predLabelTest.astype(int)
submission.to_csv('submission.csv', index=False)

In [58]:
!kaggle competitions submit -c nlp-getting-started -f submission.csv -m "LightGbm-Concat"

100%|██████████████████████████████████████| 22.2k/22.2k [00:03<00:00, 6.33kB/s]
Successfully submitted to Natural Language Processing with Disaster Tweets