In [0]:
import pyspark.sql.functions as f
# Read the Pokemon-related submissions from parquet storage.
pokemon_submissions = spark.read.parquet("/FileStore/pokemon_submissions")

# The five subreddits kept for the rest of the notebook.
top5_subreddit = [
    "pokemongo",
    "pokemon",
    "pokemontrades",
    "PokemonTCG",
    "PokemonSwordAndShield",
]

# Restrict to those subreddits and keep only the two columns used downstream.
top_pokemon = (
    pokemon_submissions
    .where(f.col("subreddit").isin(top5_subreddit))
    .select("subreddit", "title")
)

top_pokemon.show(5)

+--------------------+--------------------+
|           subreddit|               title|
+--------------------+--------------------+
|          PokemonTCG|Finally found som...|
|          PokemonTCG|Groudon &amp; Kyo...|
|PokemonSwordAndSh...|NO WAY I WASNT EV...|
|       pokemontrades|    Trade evolutions|
|           pokemongo|Awesome green Shi...|
+--------------------+--------------------+
only showing top 5 rows



In [0]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

# Tokenizer: split each title wherever a non-word character occurs.
regexTokenizer = RegexTokenizer(
    inputCol="title",
    outputCol="words",
    pattern="\\W",
)

# Stop-word filter.
# NOTE: supplying an explicit stop-word list REPLACES the default English
# list instead of extending it, so only the tokens below are removed
# (common words such as "i" or "no" stay in the "filtered" column).
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the"]
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=add_stopwords,
)

# Bag-of-words term counts: vocabulary capped at 10000 terms, and a term
# must appear in at least 5 documents to be included.
countVectors = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Encode the subreddit name as a numeric "label" column.
label_stringIdx = StringIndexer(inputCol="subreddit", outputCol="label")

# Feature pipeline: tokenize -> drop stop words -> count terms -> index labels.
pipeline_stages = [
    regexTokenizer,
    stopwordsRemover,
    countVectors,
    label_stringIdx,
]
pipeline = Pipeline(stages=pipeline_stages)

# Fit the pipeline on the filtered submissions and apply it to the same data.
# NOTE(review): the vocabulary and label indices are learned here, from this
# data. The pretrained models loaded later in this notebook presumably expect
# the exact same indices -- confirm they were trained on features produced by
# an identical pipeline fit on the same data.
pipelineFit = pipeline.fit(top_pokemon)
dataset = pipelineFit.transform(top_pokemon)
dataset.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|           subreddit|               title|               words|            filtered|            features|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          PokemonTCG|Finally found som...|[finally, found, ...|[finally, found, ...|(10000,[42,51,85,...|  3.0|
|          PokemonTCG|Groudon &amp; Kyo...|[groudon, amp, ky...|[groudon, kyorge,...|(10000,[733,1372,...|  3.0|
|PokemonSwordAndSh...|NO WAY I WASNT EV...|[no, way, i, wasn...|[no, way, i, wasn...|(10000,[0,3,4,7,3...|  4.0|
|       pokemontrades|    Trade evolutions| [trade, evolutions]| [trade, evolutions]|(10000,[30,323],[...|  2.0|
|           pokemongo|Awesome green Shi...|[awesome, green, ...|[awesome, green, ...|(10000,[1,11,76,8...|  0.0|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+

## Load the pretrained Logistic Regression model

In [0]:
from  pyspark.ml.classification import LogisticRegressionModel
# Restore the previously fitted logistic regression model from storage.
loaded_lrModel = LogisticRegressionModel.load(
    '/FileStore/my_folder/fitted_models/lrModel'
)

# Score the featurized dataset; this appends the rawPrediction,
# probability and prediction columns shown below.
lr_predictions = loaded_lrModel.transform(dataset)
lr_predictions.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|           subreddit|               title|               words|            filtered|            features|label|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|          PokemonTCG|Finally found som...|[finally, found, ...|[finally, found, ...|(10000,[42,51,85,...|  3.0|[0.08870648145986...|[0.17706882028657...|       3.0|
|          PokemonTCG|Groudon &amp; Kyo...|[groudon, amp, ky...|[groudon, kyorge,...|(10000,[733,1372,...|  3.0|[0.37483773114779...|[0.26426542723431...|       1.0|
|PokemonSwordAndSh...|NO WAY I WASNT EV...|[no, way, i, wasn...|[no, way, i, wasn...|(10000,[0,3,4,7,3...|  4.0|[0.30015614644413...|[0.21716077922039...|       4.0|
|   

### Evaluate the LR model

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the logistic regression predictions against the indexed label.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
lrAccuracy = evaluator.evaluate(lr_predictions)

# The error is reported as the complement of the accuracy.
print("Training Accuracy = %g" % lrAccuracy)
print("Training Error = %g" % (1.0 - lrAccuracy))

Training Accuracy = 0.658618
Training Error = 0.341382


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve for the logistic regression predictions.
# NOTE(review): the label is built from five subreddits, while this evaluator
# is meant for binary labels and here receives the hard class prediction as
# its score column -- confirm this number is the metric actually intended.
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName="areaUnderROC")
# Evaluate the LR output frame: the previous name `predictions` is never
# defined in this notebook and raised NameError on a fresh kernel.
lrAUC = evaluator.evaluate(lr_predictions)

print("ROC AUC = %g" % lrAUC)

ROC AUC = 0.83574


## Load the pretrained Naive Bayes model

In [0]:
# load model
from pyspark.ml.classification import  NaiveBayesModel
# Restore the previously fitted Naive Bayes model from storage.
loaded_nb_model = NaiveBayesModel.load(
    "/FileStore/my_folder/fitted_models/nb_model"
)

# Score the featurized dataset; this appends the rawPrediction,
# probability and prediction columns shown below.
nb_predictions = loaded_nb_model.transform(dataset)
nb_predictions.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|           subreddit|               title|               words|            filtered|            features|label|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|          PokemonTCG|Finally found som...|[finally, found, ...|[finally, found, ...|(10000,[42,51,85,...|  3.0|[-51.339457489659...|[9.45128868237082...|       3.0|
|          PokemonTCG|Groudon &amp; Kyo...|[groudon, amp, ky...|[groudon, kyorge,...|(10000,[733,1372,...|  3.0|[-43.883580369938...|[0.31266951799120...|       1.0|
|PokemonSwordAndSh...|NO WAY I WASNT EV...|[no, way, i, wasn...|[no, way, i, wasn...|(10000,[0,3,4,7,3...|  4.0|[-101.46975199311...|[2.16690051874203...|       4.0|
|   

### Evaluate the NB model

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the Naive Bayes predictions against the indexed label.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
nbAccuracy = evaluator.evaluate(nb_predictions)

# The error is reported as the complement of the accuracy.
print("Training Accuracy = %g" % nbAccuracy)
print("Training Error = %g" % (1.0 - nbAccuracy))

Training Accuracy = 0.65475
Training Error = 0.34525


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve for the Naive Bayes predictions.
# NOTE(review): the label is built from five subreddits, while this evaluator
# is meant for binary labels and here receives the hard class prediction as
# its score column -- confirm this number is the metric actually intended.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="label",
    rawPredictionCol="prediction",
)
nbAUC = evaluator.evaluate(nb_predictions)

print("ROC AUC = %g" % nbAUC)

ROC AUC = 0.800249
