In [None]:
# Mount Google Drive inside the Colab VM; the CSV paths used later live under
# /content/drive/MyDrive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Install PySpark from PyPI into the notebook runtime (no version is pinned here).
# NOTE(review): the Spark session created further down loads synapseml 0.9.1, which was
# presumably built against a specific Spark 3.x release — confirm the compatible PySpark
# version and pin it so a fresh runtime reproduces the same environment.
!pip install pyspark

In [None]:
import importlib.util
import os

# JVM used by Spark. NOTE(review): assumes the runtime ships OpenJDK 11 at this
# location — confirm on the target Colab image.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

# Point SPARK_HOME at wherever pip actually installed pyspark instead of a
# hard-coded "python3.7" dist-packages path, which breaks as soon as the runtime
# uses a different Python version. find_spec locates the package without importing it.
_pysparkSpec = importlib.util.find_spec("pyspark")
if _pysparkSpec is not None and _pysparkSpec.origin:
    os.environ["SPARK_HOME"] = os.path.dirname(_pysparkSpec.origin)
# If pyspark is not installed, SPARK_HOME is left untouched rather than being set
# to a directory that does not exist.

In [None]:
import time
import numpy as np

In [None]:
import pyspark

# Create (or reuse) the Spark session, asking Spark to fetch the SynapseML
# package from the configured Maven repository at start-up.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", "com.microsoft.azure:synapseml:0.9.1")
    .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
    .getOrCreate()
)

print(spark)

In [None]:
# Training CSV on the mounted Drive.
trainPath = "/content/drive/MyDrive/Colab Notebooks/Projects/kaggle/NLP_with_Disaster_Tweets/nlp-getting-started/train.csv"

# Read with a header row, inferred column types, and multi-line record support.
trainData = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('multiLine', True)
    .load(trainPath)
)

In [None]:
# Test CSV on the mounted Drive. The directory is "Colab Notebooks" with a plain
# space: a shell-style "\ " inside a Python string keeps the backslash and would
# point at a different, non-existent directory.
testPath = "/content/drive/MyDrive/Colab Notebooks/Projects/kaggle/NLP_with_Disaster_Tweets/nlp-getting-started/test.csv"

# Same reader options as the training set: header row, inferred types, multi-line records.
testData = spark.read.format('csv').options(header='true', inferSchema='true', multiLine=True).load(testPath)

In [None]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasInputCols, HasOutputCols, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable  
from pyspark.ml import Pipeline 
import pyspark.sql.functions as F
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec, StringIndexer,OneHotEncoder, VectorAssembler, RobustScaler

class FillNanTransformer(Transformer, HasInputCols, DefaultParamsReadable, DefaultParamsWritable):
    """Pipeline stage that fills missing values in ``inputCols``.

    The fill value comes from the string Param ``nanReplacement`` (default ``""``)
    and the work is delegated to ``DataFrame.na.fill``.
    """

    nanReplacement = Param(Params._dummy(), "nanReplacement", "nanReplacement", typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self, inputCols=None, nanReplacement=None):
        """Register the default replacement, then apply the keyword arguments given."""
        super().__init__()
        self._setDefault(nanReplacement="")
        # Forward only the keyword arguments captured for this call.
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, inputCols=None, nanReplacement=None):
        """Set the supplied params on this stage and return the stage."""
        return self._set(**self._input_kwargs)

    def getNanReplacement(self):
        """Return the configured replacement value, or its default."""
        return self.getOrDefault(self.nanReplacement)

    def _transform(self, dataset):
        """Return ``dataset`` with missing values in the input columns filled."""
        return dataset.na.fill(value=self.getNanReplacement(), subset=self.getInputCols())
    
    
class RemovePatternTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """Pipeline stage that deletes every regex match from a column.

    Reads ``inputCol``, replaces each match of the string Param ``pattern``
    (default ``""``) with the empty string, and writes the result to ``outputCol``.
    """

    pattern = Param(Params._dummy(), "pattern", "pattern", typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, pattern=None):
        """Register the default pattern, then apply the keyword arguments given."""
        super().__init__()
        self._setDefault(pattern="")
        # Forward only the keyword arguments captured for this call.
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, pattern=None):
        """Set the supplied params on this stage and return the stage."""
        return self._set(**self._input_kwargs)

    def getPattern(self):
        """Return the configured regex pattern, or its default."""
        return self.getOrDefault(self.pattern)

    def _transform(self, dataset):
        """Return ``dataset`` with ``outputCol`` holding the input text minus all matches."""
        cleaned = F.regexp_replace(F.col(self.getInputCol()), self.getPattern(), "")
        return dataset.withColumn(self.getOutputCol(), cleaned)
    
# Stage 1: drop http/https URLs from the raw tweet text.
# Raw strings are used for the regexes so "\S" / "\W" are not treated as (invalid)
# Python string escapes; the pattern values passed to Spark are unchanged.
removeUrlTransformer = RemovePatternTransformer(inputCol="text", outputCol="textNoUrl", pattern=r"(https?://\S+)")

# Stage 2: tokenise the URL-free text using non-word characters as the pattern.
regexTokenizer = RegexTokenizer(inputCol="textNoUrl", outputCol="textArrayWord", pattern=r"\W")

# Stage 3: remove stop words from the token arrays.
stopWordsRemover = StopWordsRemover(inputCol="textArrayWord", outputCol="textNoSW")

# Stage 4: embed the remaining tokens as one 50-dimensional Word2Vec vector.
word2Vec = Word2Vec(vectorSize=50, windowSize=10, minCount=0, inputCol="textNoSW", outputCol="textVec")

preprocessingPipeline = Pipeline(stages=[removeUrlTransformer,
                                         regexTokenizer, stopWordsRemover, word2Vec
                                        ])

In [None]:
# Fit the preprocessing stages on the training data only, then apply the fitted
# model to both the training and the test sets.
preprocessingModel = preprocessingPipeline.fit(trainData)

trainDataPreprocessed, testDataPreprocessed = (
    preprocessingModel.transform(rawData) for rawData in (trainData, testData)
)

In [None]:
# 90/10 train/validation split of the preprocessed training data, with a fixed seed.
trainSet, validSet = trainDataPreprocessed.randomSplit(weights=[0.9, 0.1], seed=12345)

In [None]:
# Imported here so the cell runs on a fresh kernel: nothing earlier in the notebook
# brings LightGBMClassifier or an evaluator into scope.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from synapse.ml.lightgbm import LightGBMClassifier

labelCol = "target"
featuresCol = "textVec"

# Baseline LightGBM classifier trained directly on the Word2Vec vector column.
lgbmc = LightGBMClassifier(boostingType='dart',
                           objective='binary',
                           metric='auc',
                           isUnbalance=True,
                           numIterations=300,
                           labelCol=labelCol,
                           featuresCol=featuresCol)

# `evaluator` was referenced but never defined in the notebook. Area under ROC is
# used to match the classifier's own `metric='auc'`.
# NOTE(review): confirm this is the metric the original evaluator reported.
evaluator = BinaryClassificationEvaluator(labelCol=labelCol,
                                          rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")

prediction = lgbmc.fit(trainSet).transform(validSet)
print(evaluator.evaluate(prediction))

In [None]:
from synapse.ml.automl import *
from synapse.ml.train import *

from pyspark.ml.functions import vector_to_array
import pyspark.sql.functions as F
import re

def _flattenFeatures(dataset, vectorCol, numFeatures, labelCol=None):
    """Expand a vector column into one scalar column per component.

    Args:
        dataset: DataFrame containing ``vectorCol`` (and ``labelCol`` when given).
        vectorCol: name of the ML vector column to expand.
        numFeatures: number of vector components to extract.
        labelCol: optional label column to keep as the first output column.

    Returns:
        A DataFrame with the optional label followed by ``feature0`` ..
        ``feature{numFeatures - 1}``. The names carry no brackets, matching the
        names the notebook previously produced by stripping "[]" afterwards.
    """
    featureArray = vector_to_array(F.col(vectorCol))
    selected = [] if labelCol is None else [F.col(labelCol)]
    selected += [featureArray[i].alias("feature{}".format(i)) for i in range(numFeatures)]
    return dataset.select(selected)


# Take the width from the Word2Vec stage instead of repeating the literal 50.
numFeatures = word2Vec.getVectorSize()

trainSetAllHP = _flattenFeatures(trainDataPreprocessed, featuresCol, numFeatures, labelCol=labelCol)
trainSetHP = _flattenFeatures(trainSet, featuresCol, numFeatures, labelCol=labelCol)
validSetHP = _flattenFeatures(validSet, featuresCol, numFeatures, labelCol=labelCol)
# The test set is flattened without a label column.
testSetHP = _flattenFeatures(testDataPreprocessed, featuresCol, numFeatures)

In [None]:
from synapse.ml.automl import *
from synapse.ml.train import *
# Explicit import: the star imports above are not relied on to provide the
# LightGBM estimator, so the cell works on a fresh kernel.
from synapse.ml.lightgbm import LightGBMClassifier
import sklearn.metrics as metrics

# Base estimator whose hyperparameters are searched; label and feature columns are
# supplied by the TrainClassifier wrapper below.
lgbmc = LightGBMClassifier(boostingType='dart',
                           objective='binary',
                           metric='auc',
                           isUnbalance=True,
                           numIterations=300)

smlmodels = [lgbmc]
mmlmodels = [TrainClassifier(model=model, labelCol=labelCol) for model in smlmodels]

# NOTE(review): DiscreteHyperParam takes a list of candidate values, so maxDepth is
# drawn from {1, 30} and numLeaves from {10, 200} only. If a range was intended,
# switch these two to RangeHyperParam — confirm before changing.
paramBuilder = (HyperparamBuilder()
.addHyperparam(lgbmc, lgbmc.learningRate, RangeHyperParam(0.01, 0.5))
.addHyperparam(lgbmc, lgbmc.maxDepth, DiscreteHyperParam([1,30]))
.addHyperparam(lgbmc, lgbmc.numLeaves, DiscreteHyperParam([10,200]))
.addHyperparam(lgbmc, lgbmc.featureFraction, RangeHyperParam(0.1, 1.0))
.addHyperparam(lgbmc, lgbmc.baggingFraction, RangeHyperParam(0.1, 1.0))
.addHyperparam(lgbmc, lgbmc.baggingFreq, RangeHyperParam(0, 3))
)

searchSpace = paramBuilder.build()

# Random sampling over the search space defined above.
randomSpace = RandomSpace(searchSpace)

In [None]:
from pyspark.ml.functions import vector_to_array

# Random search with 2-fold cross-validation on the training split, selecting by AUC.
bestModel = TuneHyperparameters(evaluationMetric="AUC", models=mmlmodels, numFolds=2,
                                numRuns=len(mmlmodels) * 2, parallelism=1,
                                paramSpace=randomSpace.space(), seed=0).fit(trainSetHP)

prediction = bestModel.transform(validSetHP)

# Collect labels and scores in a single action so the arrays are row-aligned and the
# scoring job runs once (two separate collects give no ordering guarantee).
scoredRows = prediction.select(
    F.col('target').alias('trueLabel'),
    F.col('scored_labels').alias('predLabel'),
    # Probability of the positive class (index 1 of the probability vector).
    vector_to_array(F.col('scored_probabilities'))[1].alias('predScore'),
).collect()

trueLabel = np.array([row['trueLabel'] for row in scoredRows])
predLabel = np.array([row['predLabel'] for row in scoredRows])
predScore = np.array([row['predScore'] for row in scoredRows])

# ROC AUC needs a ranking score; hard 0/1 labels collapse the curve to a single
# operating point and understate the model's AUC.
print(metrics.roc_auc_score(trueLabel, predScore))

In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
    name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Submit submission.csv to the "nlp-getting-started" competition through the Kaggle CLI.
# NOTE(review): no visible cell writes submission.csv, so the file presumably has to exist
# in the working directory already — confirm how it is produced before running this cell.
!kaggle competitions submit -c nlp-getting-started -f submission.csv -m "LightGbmTunning"