In [None]:
# Install the headless OpenJDK 8 package; -qq plus the /dev/null redirect suppress apt's output.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
# Install PySpark into the runtime.
# NOTE(review): the version is not pinned, so a rerun may pick up a different release — consider pinning.
!pip install pyspark

In [None]:
import importlib.util
import os

# The notebook installs openjdk-8-jdk-headless, so JAVA_HOME has to point at
# that JDK rather than at a Java 11 directory the notebook never installs.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# Resolve the installed pyspark package directory instead of hard-coding a
# python3.7 dist-packages path, which is wrong on any other interpreter version.
# find_spec only locates the package; it does not import it.
_pyspark_spec = importlib.util.find_spec("pyspark")
if _pyspark_spec is not None and _pyspark_spec.submodule_search_locations:
    os.environ["SPARK_HOME"] = list(_pyspark_spec.submodule_search_locations)[0]
else:
    # pyspark could not be located: keep the previously hard-coded location.
    os.environ["SPARK_HOME"] = "/usr/local/lib/python3.7/dist-packages/pyspark"

In [None]:
import pyspark as ps
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Stop a context left over from an earlier run of this cell so the cell can be
# re-executed. Only NameError (no `sc` defined yet) is expected here; a bare
# `except` would also hide genuine failures raised by sc.stop().
try:
    sc.stop()
except NameError:
    print('SparkContext is not created!')

# getOrCreate is a classmethod, so the master/app name are handed over through
# a SparkConf instead of constructing a context and then calling getOrCreate
# on the new instance.
spark_conf = SparkConf().setMaster("local").setAppName("App")
sc = SparkContext.getOrCreate(spark_conf)
print(sc, sc.version)

# Session used for the DataFrame reads and SQL queries below.
spark = SparkSession.builder.getOrCreate()

# Load data

In [None]:
# Read the training CSV with a header row and inferred column types, then
# expose it to Spark SQL as the temp view `trainData`.
# NOTE(review): if the text fields contain embedded newlines or quotes, the
# reader may need multiLine/escape options — confirm against the file.
trainPath = 'nlp-getting-started/train.csv'
trainReader = spark.read.format('csv').options(header='true', inferSchema='true')
trainData = trainReader.load(trainPath)
trainData.createOrReplaceTempView('trainData')

In [None]:
# Read the test CSV with a header row and inferred column types, then expose
# it to Spark SQL as the temp view `testData`.
testPath = 'nlp-getting-started/test.csv'
testReader = spark.read.format('csv').options(header='true', inferSchema='true')
testData = testReader.load(testPath)
testData.createOrReplaceTempView('testData')

# Pipeline

In [None]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml import Pipeline 
import pyspark.sql.functions as F
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec, StringIndexer, VectorAssembler, RobustScaler

In [None]:
class FillNanTransformer(Transformer, HasInputCol):
    """Pipeline stage that fills missing values of ``inputCol`` with ``""``.

    The fill is applied to the input column itself; no output column is added.
    """

    @keyword_only
    def __init__(self, inputCol=None):
        """Build the stage and forward the supplied keyword arguments to ``setParams``."""
        super().__init__()
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, inputCol=None):
        """Store the supplied keyword arguments as params and return the result of ``_set``."""
        return self._set(**self._input_kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` after ``na.fill`` with an empty string on the input column."""
        target_column = self.getInputCol()
        return dataset.na.fill(value="", subset=[target_column])
    
class TextNoLinkTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that writes ``inputCol`` with http/https links removed to ``outputCol``."""

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Build the stage and forward the supplied keyword arguments to ``setParams``."""
        super().__init__()
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Store the supplied keyword arguments as params and return the result of ``_set``."""
        return self._set(**self._input_kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with an output column where each link match is replaced by ``""``."""
        # Matches "http://" or "https://" followed by a run of non-whitespace characters.
        link_pattern = r'(https?://\S+)'
        source_text = F.col(self.getInputCol())
        stripped_text = F.regexp_replace(source_text, link_pattern, "")
        return dataset.withColumn(self.getOutputCol(), stripped_text)
    
class ContainLinkTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that flags rows whose ``inputCol`` matches an http/https link pattern.

    ``outputCol`` is 1 when the pattern matches and 0 otherwise.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Build the stage and forward the supplied keyword arguments to ``setParams``."""
        super().__init__()
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Store the supplied keyword arguments as params and return the result of ``_set``."""
        return self._set(**self._input_kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with the 0/1 link indicator added as the output column."""
        # Matches "http://" or "https://" followed by a run of non-whitespace characters.
        link_pattern = r'(https?://\S+)'
        has_link = F.col(self.getInputCol()).rlike(link_pattern)
        indicator = F.when(has_link, 1).otherwise(0)
        return dataset.withColumn(self.getOutputCol(), indicator)
    
class KeywordLengthTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that stores ``F.length`` of ``inputCol`` in ``outputCol``."""

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Build the stage and forward the supplied keyword arguments to ``setParams``."""
        super().__init__()
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Store the supplied keyword arguments as params and return the result of ``_set``."""
        return self._set(**self._input_kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with the length of the input column added as the output column."""
        source_name = self.getInputCol()
        length_column = F.length(source_name)
        return dataset.withColumn(self.getOutputCol(), length_column)
    
class TextNoLinkLengthTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that stores ``F.length`` of ``inputCol`` in ``outputCol``."""

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Build the stage and forward the supplied keyword arguments to ``setParams``."""
        super().__init__()
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Store the supplied keyword arguments as params and return the result of ``_set``."""
        return self._set(**self._input_kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with the length of the input column added as the output column."""
        source_name = self.getInputCol()
        length_column = F.length(source_name)
        return dataset.withColumn(self.getOutputCol(), length_column)

In [None]:
# Fill missing values in the three raw string columns with "".
textFillNanTransformer = FillNanTransformer(inputCol="text")
keywordFillNanTransformer = FillNanTransformer(inputCol="keyword")
locationFillNanTransformer = FillNanTransformer(inputCol="location")

# Link handling: a 0/1 "has link" flag plus a copy of the text without links.
containLinkTransformer = ContainLinkTransformer(inputCol="text", outputCol="containLink")
textNoLinkTransformer = TextNoLinkTransformer(inputCol="text", outputCol="textNoLink")

# Text branch: tokenise on the \W pattern, drop stop words, embed with Word2Vec.
regexTokenizer = RegexTokenizer(inputCol="textNoLink", outputCol="wordText", pattern="\\W")
stopWordsRemover = StopWordsRemover(inputCol="wordText", outputCol="wordTextNoSW")
word2Vec = Word2Vec(vectorSize=100, minCount=0, inputCol="wordTextNoSW", outputCol="vecText")

# handleInvalid="keep" sends labels not seen during fit to an extra index
# instead of raising, so the fitted pipeline can also transform data (such as
# the test set) that contains new keywords or locations. Indices assigned to
# labels seen during fit are unaffected.
keywordIndexer = StringIndexer(inputCol="keyword", outputCol="keywordIndex", handleInvalid="keep")
locationIndexer = StringIndexer(inputCol="location", outputCol="locationIndex", handleInvalid="keep")

# Character-length features.
keywordLengthTransformer = KeywordLengthTransformer(inputCol="keyword", outputCol="keywordLength")
textNoLinkLengthTransformer = TextNoLinkLengthTransformer(inputCol="textNoLink", outputCol="textNoLinkLength")

# Feature vector without the text embedding.
catAssembler = VectorAssembler(inputCols=["keywordIndex", "locationIndex", 
                                        "keywordLength", "textNoLinkLength",
                                        "containLink"], outputCol="catFeatures")

# Same features with the Word2Vec vector appended.
catTextAssembler = VectorAssembler(inputCols=["keywordIndex", "locationIndex", 
                                        "keywordLength", "textNoLinkLength",
                                        "containLink", "vecText"], outputCol="catTextFeatures")

# Robust scaling (centering and scaling on the 0.25-0.75 quantile range) of both vectors.
catFeatureRobustScaler = RobustScaler(inputCol="catFeatures", outputCol="catFeaturesRobustScaler",
                                      withScaling=True, withCentering=True,
                                      lower=0.25, upper=0.75)

catTextFeatureRobustScaler = RobustScaler(inputCol="catTextFeatures", outputCol="catTextFeaturesRobustScaler",
                                      withScaling=True, withCentering=True,
                                      lower=0.25, upper=0.75)

In [None]:
# Stages are listed so that every stage comes after the stages producing the
# columns it reads.
preprocessingStages = [
    # fill missing values
    textFillNanTransformer,
    keywordFillNanTransformer,
    locationFillNanTransformer,
    # link features
    containLinkTransformer,
    textNoLinkTransformer,
    # text embedding
    regexTokenizer,
    stopWordsRemover,
    word2Vec,
    # categorical indices
    keywordIndexer,
    locationIndexer,
    # length features
    keywordLengthTransformer,
    textNoLinkLengthTransformer,
    # assembling and scaling
    catAssembler,
    catTextAssembler,
    catFeatureRobustScaler,
    catTextFeatureRobustScaler,
]
preprocessingPipeline = Pipeline(stages=preprocessingStages)

# Train

In [None]:
# Keep only the training rows that have a target value.
trainingQuery = "SELECT * FROM trainData WHERE target is not NULL"
trainingData = spark.sql(trainingQuery)

# All rows of the test view.
testQuery = "SELECT * FROM testData"
testData = spark.sql(testQuery)

In [None]:
# Fit every preprocessing stage on the labelled rows, then apply the fitted
# pipeline to those same rows.
# NOTE(review): the fit happens before the train/validation split, so the
# fitted stages have seen the rows later used for validation.
preprocessingModel = preprocessingPipeline.fit(trainingData)
trainingDataPreprocessed = preprocessingModel.transform(trainingData)

In [None]:
# Random 90% / 10% split into training and validation rows, with a fixed seed.
splitWeights = [0.9, 0.1]
trainSet, validSet = trainingDataPreprocessed.randomSplit(splitWeights, seed=12345)

In [None]:
# Feature column given to the classifiers: the robust-scaled vector that
# combines the index/length/link features with the Word2Vec text vector.
featuresCol = "catTextFeaturesRobustScaler"
# Column holding the label the classifiers are trained on.
labelCol = "target"

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier, LinearSVC

# Scores the "prediction" column against the label column with the f1 metric.
evaluator = MulticlassClassificationEvaluator(metricName="f1", predictionCol="prediction", labelCol=labelCol)

# Column arguments shared by every candidate classifier.
sharedCols = {"featuresCol": featuresCol, "labelCol": labelCol}

# Candidate classifiers keyed by a short name; insertion order is the
# order in which they are iterated.
algorithmList = {
    "LR": LogisticRegression(regParam=0.1, maxIter=100, **sharedCols),
    "DTC": DecisionTreeClassifier(maxDepth=7, **sharedCols),
    "RFC": RandomForestClassifier(maxDepth=16, numTrees=15, **sharedCols),
    "GBTC": GBTClassifier(maxIter=20, maxDepth=16, stepSize=0.001, **sharedCols),
    #"MPC":  MultilayerPerceptronClassifier(featuresCol=featuresCol, labelCol=labelCol, maxIter=2, layers=[5, 5, 2]),
    "LSVC": LinearSVC(maxIter=100, regParam=0.1, **sharedCols),
}

In [None]:
# Fit each candidate on the training split and print its score on the
# validation split.
for name, algorithm in algorithmList.items():
    model = algorithm.fit(trainSet)
    prediction = model.transform(validSet)
    score = evaluator.evaluate(prediction)
    print(name, score)

In [None]:
# Refit the chosen classifier on the training split and predict the
# validation split.
algorithmName = "GBTC"
algorithm = algorithmList[algorithmName]
selectedModel = algorithm.fit(trainSet)
prediction = selectedModel.transform(validSet)