In [14]:
%%time
import pandas as pd
import os, sys, time, json, re, string

from pyspark import SparkContext, SparkConf, StorageLevel, keyword_only

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.param.shared import HasInputCol, HasInputCols, HasOutputCol, HasOutputCols, Param
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer, NGram, CountVectorizer, StopWordsRemover
from pyspark.ml.feature import VectorAssembler, PCA

from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier
from pyspark.ml import Pipeline, Transformer

from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from platform import python_version
print(python_version())

3.6.10
CPU times: user 0 ns, sys: 739 µs, total: 739 µs
Wall time: 553 µs


In [2]:
%%time
# Sanity check: evaluate the SparkContext handle. `sc` is not defined anywhere in the
# visible notebook, so it is presumably injected by the PySpark kernel — TODO confirm.
sc

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.54 µs


In [3]:
%%time
# Obtain the SparkSession for the "fakenews" application.
# getOrCreate() hands back an already-running session when one exists, in which
# case the resource settings below only apply if the session is created here.
spark = (
    SparkSession.builder
    .appName("fakenews")
    # cluster manager / deployment
    .config("spark.master", "yarn")
    .config("spark.submit.deployMode", "cluster")
    # driver sizing
    .config("spark.driver.memory", "25g")
    # executor sizing
    .config("spark.executor.instances", "5")
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "25g")
    .getOrCreate()
)

CPU times: user 2.65 ms, sys: 3.85 ms, total: 6.5 ms
Wall time: 9.93 ms


In [4]:
%%time
# Location of the fake-news corpus (GCS bucket) and the CSV file inside it.
fakenews_path = "gs://dataproc-6ca41800-27b4-47d5-abee-55c011dfa389-asia-southeast1/data/fake-news/"
fakenews_data_path = fakenews_path + "two_million_rows_news_cleaned_2018_02_13_pyspark.csv"

# Read the '#'-delimited CSV (first line is the header) and shape the modelling frame:
#   1. keep only the type / content / domain columns,
#   2. add a binary label: 1 when type == 'fake', 0 otherwise,
#   3. drop rows with empty content, which cause problems when the text is transformed.
df_news = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "true")
    .option("delimiter", "#")
    .load(fakenews_data_path)
    .select("type", "content", "domain")
    .withColumn("label", F.when(F.col("type") == "fake", 1).otherwise(0))
    .where(F.col("content") != "")
)

# Split into fake / non-fake subsets and stack them back together (fake rows first).
# NOTE(review): under SQL null semantics a row with a null `type` should satisfy
# neither predicate, so this round trip presumably also drops such rows — confirm.
df_news_fake = df_news.where(F.col("type") == "fake")
df_news_nonfake = df_news.where(F.col("type") != "fake")
df_news = df_news_fake.union(df_news_nonfake)

# 80/20 train/test split with a fixed seed.
df_train, df_test = df_news.randomSplit([0.8, 0.2], seed=666)

# Flag for hyper-parameter tuning; nothing in the visible cells reads it.
param_tuning = False

CPU times: user 11 ms, sys: 4.39 ms, total: 15.4 ms
Wall time: 5.74 s


In [15]:
%%time
# customized transformer class to manually extract some counting based text features
class ReviewContentTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that derives character-count features from a text column.

    For every row the input string is turned into a 4-element dense vector:

        [text_len, char_count, uppercase_count, uppercase_ratio]

    * text_len        -- total number of characters in the string
    * char_count      -- number of ASCII letters (upper + lower case)
    * uppercase_count -- number of ASCII upper-case letters
    * uppercase_ratio -- uppercase_count / char_count (a 1e-10 term in the
                         denominator avoids division by zero)

    Params:
        inputCol:  name of the string column to analyse (default "content").
        outputCol: name of the vector column to add (default "content_features").
    """

    @keyword_only
    def __init__(self, inputCol="content", outputCol="content_features"):
        super(ReviewContentTransformer, self).__init__()
        # @keyword_only only records the keyword arguments that were actually
        # passed, so the defaults in the signature must be registered explicitly;
        # otherwise ReviewContentTransformer() would have no inputCol at all.
        self._setDefault(inputCol="content", outputCol="content_features")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Set inputCol / outputCol from the explicitly supplied keyword arguments."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        """Return `dataset` with the character-feature vector column appended."""
        # Build the lookup sets once on the driver; the UDF closure then carries
        # plain frozensets.
        upper_letters = frozenset(string.ascii_uppercase)
        lower_letters = frozenset(string.ascii_lowercase)

        def f(s):
            # A null cell is treated as empty text instead of failing the task.
            if s is None:
                s = ""

            uppercase_count = 0
            char_count = 0
            for c in s:
                if c in upper_letters:
                    uppercase_count += 1
                    char_count += 1
                elif c in lower_letters:
                    char_count += 1

            text_len = len(s)
            return Vectors.dense(text_len, char_count,
                                 uppercase_count, uppercase_count / (char_count + 1e-10))

        return dataset.withColumn(self.getOutputCol(),
                                  F.udf(f, VectorUDT())(dataset[self.getInputCol()]))

CPU times: user 76 µs, sys: 12 µs, total: 88 µs
Wall time: 93 µs


In [16]:
%%time
# customized transformer class to manually extract some counting based word features
class ReviewWordsTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that derives word-count features from a token-array column.

    For every row the list of tokens is turned into a 7-element dense vector:

        [word_count,
         unique_word_count,
         unique_word_count / word_count,
         upper_word_count,
         upper_word_count / word_count,
         unique_upper_word_count,
         unique_upper_word_count / upper_word_count]

    where "upper" words are tokens for which ``str.isupper()`` is true. A 1e-10
    term in each denominator avoids division by zero.

    Params:
        inputCol:  name of the token-array column (signature default "content";
                   callers should normally pass the tokenised column explicitly).
        outputCol: name of the vector column to add (default "content_features").

    NOTE(review): if the tokens come from a tokenizer that lower-cases its output
    (RegexTokenizer does so by default), no token is upper-case and the four
    upper-word features are always 0 — confirm against the pipeline definition.
    """

    @keyword_only
    def __init__(self, inputCol="content", outputCol="content_features"):
        super(ReviewWordsTransformer, self).__init__()
        # @keyword_only only records the keyword arguments that were actually
        # passed, so the defaults in the signature must be registered explicitly.
        self._setDefault(inputCol="content", outputCol="content_features")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Set inputCol / outputCol from the explicitly supplied keyword arguments."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        """Return `dataset` with the word-feature vector column appended."""

        def f(words):
            # A null cell is treated as an empty token list instead of failing the task.
            if words is None:
                words = []

            word_count = len(words)
            unique_word_count = len(set(words))

            upper_words = [w for w in words if w.isupper()]
            # Total vs. distinct upper-case tokens. The original code had these
            # two assignments swapped, which mislabelled slots 3-6 of the vector
            # and inverted the final ratio.
            upper_word_count = len(upper_words)
            unique_upper_word_count = len(set(upper_words))

            return Vectors.dense(word_count, unique_word_count, unique_word_count / (word_count + 1e-10),
                                 upper_word_count, upper_word_count / (word_count + 1e-10),
                                 unique_upper_word_count, unique_upper_word_count / (upper_word_count + 1e-10))

        return dataset.withColumn(self.getOutputCol(),
                                  F.udf(f, VectorUDT())(dataset[self.getInputCol()]))

CPU times: user 72 µs, sys: 12 µs, total: 84 µs
Wall time: 88.2 µs


In [7]:
%%time
# show model prediction performance on the given dataset
def eval_model_perf(fitted_model, dataset, label_col="label", prediction_col="prediction", probability_col="probability"):
    """Score `dataset` with `fitted_model` and print accuracy, F1 and ROC AUC.

    Args:
        fitted_model: object exposing ``transform(dataset)``.
        dataset: data to score; passed straight to ``fitted_model.transform``.
        label_col: name of the true-label column.
        prediction_col: name of the predicted-label column.
        probability_col: name of the column fed to the AUC evaluator as its
            raw-prediction input.

    Returns:
        The full output of ``fitted_model.transform(dataset)``.
    """
    scored = fitted_model.transform(dataset)
    scored_cols = scored.select(label_col, prediction_col, probability_col)

    # Label-based metrics share one evaluator; the metric is switched per call.
    multiclass_eval = MulticlassClassificationEvaluator(predictionCol=prediction_col, labelCol=label_col)
    for metric in ("accuracy", "f1"):
        score = multiclass_eval.evaluate(scored_cols, {multiclass_eval.metricName: metric})
        print(metric, " = ", score)

    # Area under the ROC curve, computed from the probability column.
    auc = BinaryClassificationEvaluator(
        rawPredictionCol=probability_col,
        labelCol=label_col,
        metricName="areaUnderROC",
    ).evaluate(scored_cols)
    print("AUC =", auc)

    return scored

# show CV parameter tuning results
def show_cv_results(cv_model):
    """Print every (average metric, parameter map) pair, highest metric first.

    Args:
        cv_model: object exposing ``avgMetrics`` and ``getEstimatorParamMaps()``,
            two sequences that are paired up element by element.
    """
    scored_params = list(zip(cv_model.avgMetrics, cv_model.getEstimatorParamMaps()))
    # Order by the metric only, descending.
    scored_params.sort(key=lambda pair: pair[0], reverse=True)
    for avg_metric, param_map in scored_params:
        print(avg_metric, " | ", param_map)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


In [8]:
%%time
def _fit_and_report(name, estimator, df_train, df_test):
    """Fit `estimator` on df_train, print its test-set metrics and the elapsed time."""
    print("**********{}**********".format(name))
    started = time.time()
    fitted = estimator.fit(df_train)
    eval_model_perf(fitted, df_test)
    print("time taken for {}: ".format(name), time.time() - started)
    return fitted


def run_models(df_train, df_test, without_pca = False):
    """Train a fixed set of classifiers and print each one's test performance.

    Every model is fitted on `df_train` and evaluated on `df_test` through
    `eval_model_perf`; the wall-clock time per model is printed as well.

    Args:
        df_train: training DataFrame with a 'features' vector column and a
            'label' column.
        df_test: test DataFrame with the same columns.
        without_pca: when True, a multinomial NaiveBayes model is trained in
            addition to the others. NOTE(review): presumably NaiveBayes is
            restricted to this case because PCA output can be negative, which
            multinomial NaiveBayes does not accept — confirm.

    Raises:
        ValueError: if `df_train` has no rows (the feature width needed by the
            multilayer perceptron cannot be determined).
    """
    _fit_and_report(
        "LogisticRegression",
        LogisticRegression(featuresCol='features',
                           labelCol='label',
                           predictionCol='prediction',
                           probabilityCol='probability',
                           rawPredictionCol='rawPrediction',
                           family='binomial',
                           fitIntercept=True,
                           threshold=0.5,
                           standardization=False,
                           maxIter=200,
                           regParam=0.005,
                           elasticNetParam=0,
                           tol=1e-06,
                           aggregationDepth=2),
        df_train, df_test)

    _fit_and_report(
        "DecisionTreeClassifier",
        DecisionTreeClassifier(featuresCol='features',
                               labelCol='label',
                               predictionCol='prediction',
                               probabilityCol='probability',
                               rawPredictionCol='rawPrediction',
                               maxDepth=10, maxBins=32,
                               minInstancesPerNode=1,
                               minInfoGain=0.0,
                               maxMemoryInMB=2048,
                               cacheNodeIds=True,
                               checkpointInterval=10,
                               impurity='gini',
                               seed=666),
        df_train, df_test)

    _fit_and_report(
        "RandomForestClassifier",
        RandomForestClassifier(featuresCol='features',
                               labelCol='label',
                               predictionCol='prediction',
                               probabilityCol='probability',
                               rawPredictionCol='rawPrediction',
                               maxDepth=10,
                               maxBins=32,
                               minInstancesPerNode=1,
                               minInfoGain=0.0,
                               maxMemoryInMB=2048,
                               cacheNodeIds=True,
                               checkpointInterval=10,
                               impurity='gini',
                               numTrees=200,
                               featureSubsetStrategy='auto',
                               seed=666,
                               subsamplingRate=0.8),
        df_train, df_test)

    _fit_and_report(
        "GBTClassifier",
        GBTClassifier(featuresCol='features',
                      labelCol='label',
                      maxIter=250),
        df_train, df_test)

    # The perceptron's first layer must be as wide as the feature vector and its
    # last layer as wide as the number of classes. The original hard-coded
    # [4, 5, 4, 3], which cannot match the assembled feature vectors, so the
    # input width is read from the data and the output width is 2 (binary label).
    first_row = df_train.select('features').first()
    if first_row is None:
        raise ValueError("df_train is empty; cannot determine the feature vector size")
    num_features = len(first_row[0])

    _fit_and_report(
        "MultilayerPerceptronClassifier",
        MultilayerPerceptronClassifier(featuresCol='features',
                                       labelCol='label',
                                       predictionCol='prediction',
                                       layers=[num_features, 5, 4, 2],
                                       maxIter=100,
                                       blockSize=128,
                                       seed=1234),
        df_train, df_test)

    if without_pca:
        # The original called mp_model.fit(...) here, i.e. fit() on the already
        # fitted perceptron model, so NaiveBayes was never actually trained.
        _fit_and_report(
            "NaiveBayes",
            NaiveBayes(featuresCol='features',
                       labelCol='label',
                       predictionCol='prediction',
                       probabilityCol='probability',
                       rawPredictionCol='rawPrediction',
                       smoothing=1,
                       modelType='multinomial'),
            df_train, df_test)

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 11.9 µs


In [9]:
%%time
def build_data_preproc_model_with_pca(vocab_size=5000):
    """Build the (unfitted) feature pipeline whose text features go through PCA.

    Stages, in order: tokenise `content` on non-word characters, remove stop
    words, count terms (vocabulary capped at `vocab_size`), apply IDF, project
    the TF-IDF vector to 100 PCA components, compute the hand-crafted content
    and word statistics, and assemble everything into `features`.

    Args:
        vocab_size: maximum vocabulary size passed to CountVectorizer.

    Returns:
        A Pipeline whose final output column is "features".
    """
    tokenizer = RegexTokenizer(inputCol="content", outputCol="all_words", pattern=r"\W")
    stopword_filter = StopWordsRemover(inputCol="all_words", outputCol="words")
    term_counter = CountVectorizer(inputCol="words", outputCol="tf_features", vocabSize=vocab_size)
    idf_weighting = IDF(inputCol="tf_features", outputCol="tfidf_features")
    pca_projection = PCA(inputCol="tfidf_features", outputCol="pca_features", k=100)
    content_stats = ReviewContentTransformer(inputCol="content", outputCol="content_features")
    word_stats = ReviewWordsTransformer(inputCol="words", outputCol="word_features")
    assembler = VectorAssembler(
        inputCols=["pca_features", "content_features", "word_features"],
        outputCol="features",
    )

    return Pipeline(stages=[
        tokenizer,
        stopword_filter,
        term_counter,
        idf_weighting,
        pca_projection,
        content_stats,
        word_stats,
        assembler,
    ])

def build_data_preproc_model_without_pca(vocab_size=5000):
    """Build the (unfitted) feature pipeline that uses raw term counts, no PCA.

    Stages, in order: tokenise `content` on non-word characters, remove stop
    words, count terms (vocabulary capped at `vocab_size`), apply IDF, compute
    the hand-crafted content and word statistics, and assemble `tf_features`
    with those statistics into `features`.

    Args:
        vocab_size: maximum vocabulary size passed to CountVectorizer.

    Returns:
        A Pipeline whose final output column is "features".
    """
    tokenizer = RegexTokenizer(inputCol="content", outputCol="all_words", pattern=r"\W")
    stopword_filter = StopWordsRemover(inputCol="all_words", outputCol="words")
    term_counter = CountVectorizer(inputCol="words", outputCol="tf_features", vocabSize=vocab_size)
    # NOTE(review): tfidf_features is produced here but the assembler below
    # consumes tf_features — confirm whether the raw counts are intended.
    idf_weighting = IDF(inputCol="tf_features", outputCol="tfidf_features")
    content_stats = ReviewContentTransformer(inputCol="content", outputCol="content_features")
    word_stats = ReviewWordsTransformer(inputCol="words", outputCol="word_features")
    assembler = VectorAssembler(
        inputCols=["tf_features", "content_features", "word_features"],
        outputCol="features",
    )

    return Pipeline(stages=[
        tokenizer,
        stopword_filter,
        term_counter,
        idf_weighting,
        content_stats,
        word_stats,
        assembler,
    ])

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.34 µs


In [None]:
%%time
print("**********Run Models with PCA Features**********")
# Fit the PCA preprocessing pipeline (vocabulary capped at 2000 terms) on the
# training split only, then apply the fitted pipeline to both splits, keeping
# just the columns the classifiers need.
preproc_model = build_data_preproc_model_with_pca(2000).fit(df_train)
df_train_pca, df_test_pca = (
    preproc_model.transform(split_df).select("label", "features")
    for split_df in (df_train, df_test)
)

**********Run Models with PCA Features**********


Py4JJavaError: An error occurred while calling o577.transform.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 31.0 failed 4 times, most recent failure: Lost task 0.3 in stage 31.0 (TID 914, cluster-8866-w-4.asia-southeast1-b.c.weicheng.internal, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 352, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 142, in dump_stream
    for obj in iterator:
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 341, in _batched
    for item in iterator:
  File "<string>", line 1, in <lambda>
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 83, in <lambda>
    return lambda *a: toInternal(f(*a))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 685, in toInternal
    return self._cachedSqlType().toInternal(self.serialize(obj))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/linalg/__init__.py", line 167, in serialize
    raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
TypeError: cannot serialize Column<b'f(            Hamish  a contributor  remarks on how we are encouraged to examine what is in our food but criminalized for rejecting the cocktail of toxins contained in vaccines                       By Hamish   henrymakow com       The Establishment media has reported that there is a growing  anti vax  movement such an uptick in opposition to vaccines that the moniker  anti vaxxers  has entered the modern lexicon       In Ontario  school boards have warned parents  to update their kids  immunization records or face suspension   The provincial legislature has produced legislation that  would require anti vax parents to take an  education session  essentially a re education brainwash  The authoritarian posturing by the Establishment produces a societal chilling effect  people who either refuse vaccines or denounce them are ostracized by society and this kind of systematic shunning is a worldwide trend       In Australia  there is a  vaccine crackdown  announcing an end to religious exemptions and decreeing that  doctors will be given incentive payments so that parents stick to their children s vaccination schedule   In poorer countries  organizations like The Bill and Melinda Gates Foundation  the world s largest private foundation with assets in the billions  distributes vaccines to millions  Adverse consequences are rarely headlined       Major media outlets censure any expos\xc3\xa9 exploring the possibility of vaccine damage and corrupt Establishment collusion like the film Vaxxed  written and directed by Dr  Andrew Wakefield  that was pulled from the Tribeca Film Festival  Celebrity Hollywood stars are scorned like Jim Carrey and Jenny McCarthy for expressing concerns about vaccine safety  Even Donald Trump has stated that vaccines are  not helpful  and in turn has been ridiculed by the corporate media  Of course  Dr  Oz  the darling of that same corporate media  shills for the Establishment and peddles vaccine injections for 
the public rejecting any trepidation about the jabs       So  what can we make of this controversy  It is a fact that vaccine pamphlet inserts state that their product can cause a litany of side effects  including autism despite what CNN s Dr  Gupta says  I propose that when we look at vaccines we should use the same approach most people take when they decide what food and drink enters their body  After all  an injection enables many ingredients to enter the body and yet  for the most part  people take vaccines on blind faith  They have no idea what s in the shot and the  faithfully blind  usually don t even notice when later it s admitted a particular vaccine isn t effective  From this angle  I present some observations made by author Liam Scheff in his book Official Stories  2012    WHAT IS IN A VACCINE   The term  vaccination  comes from the Latin word  vacca   We could call it cow injection  to be true to history  because  and we re never really told this  but the vaccine that made it all famous smallpox came from the sores on the underbellies and legs of cows and horses  Pus and blood were scraped off  put on the ends of small  sharp pronged forks or lancets and jabbed into people s arms  Yes   vacca  means  cow   Does that surprise you  Did we really think that animal blood and pus was a good thing to put into our bodies  Vaccines are regarded as a nearly magical process  like a totem or a crucible  a station of the cross in the Western world  it has replaced baptism as a holy rite  Those who are opposed are mistrusted and feared  almost as witches  certainly as troubled heretics  But no one ever asks the question  what is in a vaccine       Vaccines are not conjured at Hogwarts by honest wizards  Willy Wonka doesn t brew them in his chocolate factory  They are not magical and there is a reason  or many  why some people oppose them so strongly  Vaccines are toxic  by their very nature  The liquid in the syringe is filled with very small pieces of well  a lot 
of things  These materials come from laboratory dishes where putative viruses are grown  But nothing biological can be grown  except in a  medium  or substrate  That is  it takes living tissue to grow living microscopic entities  So  what tissues are vaccines grown in  or really  culled from  It s a living cell matrix  The first substrates were a variety of animal body parts  including spines and brains  rabbits were often used  Sometimes it was pus and blood from a sick animal  Then it was monkey kidneys and testicles  that s what the putative polio virus was grown in  Of course  monkey cells contain monkey proteins  viruses  bacteria  mycoplasmas  and toxins  It is not possible to filter out one microscopic particle from a sea of similarly sized or smaller particles  These particles  proteins  viruses  and cellular debris have been and are being injected into millions of people  in the name of stopping polio and every other disease for which there is a vaccine       Hamster ovaries  washed sheep blood  dog kidney cells and here s a favorite with the Christian crowd aborted human fetal tissue  these are newer substrates  These cells are cultured  fed  stimulated and made to replicate  But you can t just siphon off fluid and cellular detritus from animal cells you have to mix it with chemicals  to insure that it s stable  that it doesn t quite rot and that it truly inflames and agitates the immune system  In addition to the living tissue  vaccines have added to them a series of metals and preservatives  as well as chemical agents sent to aggravate and disturb your cells  Mercury is one of the longest used metals in vaccines  It is a deadly neurotoxin  that on its own causes paralysis and death  as has been known since the time of the ancient Greeks  as recorded in Hippocrates  They also add aluminum  sucrose  MSG  phenol  aspartame and more  Formaldehyde has made it into countless batches  Formaldehyde is used to embalm dead people to keep them from rotting  
Squalene  the infamous adjuvant linked to Gulf War Illness  is a more recent offering  Its job is to agitate your muscles  blood vessels  cells and tissue into an inflamed state  Vaccine manufacturers actively seek this inflammatory response  They feel it helps their vaccine work  But it can also bring on real illness  pain  nausea  cramps  fainting  tremors  seizures  and a long list of neurological response  Sometimes vaccines cause death  sometimes instantly  Yes  that has happened too  Is that good for children  No  it s a toxic poison  But there it goes  into the blood       CONCLUSION  It s increasingly popular to be aware of what food ingredients are entering our bodies look at the backlash against GMOs or the chemical azodicarbonamide  removed from Subway restaurant s sandwich bread after it was exposed that the chemical is also used to manufacture yoga mats and shoe soles   But if you mention any reservations about vaccine ingredients to the average person then they ll likely give you a blank stare in response that s the  blind faith  programming kicking in       Nevertheless  one thing is for certain  vaccine ingredients are toxic and inoculations actually spread viruses  so those concerns should be at least on par with our particular dietary intake qualms  Perhaps  in addition to an apprehensive stance on vaccination regarding the ingredients used  we should also take heed of the messages from Establishment sources on their peculiar affinity for the actual viruses especially for uses beyond the so called public benefaction of vaccines       For example  in the new Hollywood blockbuster  Inferno   a virus is engineered that causes infertility for the purpose of depopulation essentially  a Eugenics fantasy scenario  Likewise certain royalty agrees that viruses which can depopulate are desirable Prince Philip is quoted stating   In the event that I am reincarnated  I would like to return as a deadly virus  to contribute something to solving overpopulation   
Well  a side effect of some virus loaded vaccines is sterility  So many agendas  so many toxins  all crammed into a piercing needle payload               Related  Dr  Oz promotes vaccines but won t give them to his own children)'> of type <class 'pyspark.sql.column.Column'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1892)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1880)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2113)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2062)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2051)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2557)
	at org.apache.spark.sql.Dataset.first(Dataset.scala:2564)
	at org.apache.spark.ml.feature.VectorAssembler$.getVectorLengthsFromFirstRow(VectorAssembler.scala:200)
	at org.apache.spark.ml.feature.VectorAssembler$.getLengths(VectorAssembler.scala:226)
	at org.apache.spark.ml.feature.VectorAssembler.transform(VectorAssembler.scala:96)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 352, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 142, in dump_stream
    for obj in iterator:
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 341, in _batched
    for item in iterator:
  File "<string>", line 1, in <lambda>
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 83, in <lambda>
    return lambda *a: toInternal(f(*a))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 685, in toInternal
    return self._cachedSqlType().toInternal(self.serialize(obj))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/linalg/__init__.py", line 167, in serialize
    raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
TypeError: cannot serialize Column<b'f(            Hamish  a contributor  remarks on how we are encouraged to examine what is in our food but criminalized for rejecting the cocktail of toxins contained in vaccines                       By Hamish   henrymakow com       The Establishment media has reported that there is a growing  anti vax  movement such an uptick in opposition to vaccines that the moniker  anti vaxxers  has entered the modern lexicon       In Ontario  school boards have warned parents  to update their kids  immunization records or face suspension   The provincial legislature has produced legislation that  would require anti vax parents to take an  education session  essentially a re education brainwash  The authoritarian posturing by the Establishment produces a societal chilling effect  people who either refuse vaccines or denounce them are ostracized by society and this kind of systematic shunning is a worldwide trend       In Australia  there is a  vaccine crackdown  announcing an end to religious exemptions and decreeing that  doctors will be given incentive payments so that parents stick to their children s vaccination schedule   In poorer countries  organizations like The Bill and Melinda Gates Foundation  the world s largest private foundation with assets in the billions  distributes vaccines to millions  Adverse consequences are rarely headlined       Major media outlets censure any expos\xc3\xa9 exploring the possibility of vaccine damage and corrupt Establishment collusion like the film Vaxxed  written and directed by Dr  Andrew Wakefield  that was pulled from the Tribeca Film Festival  Celebrity Hollywood stars are scorned like Jim Carrey and Jenny McCarthy for expressing concerns about vaccine safety  Even Donald Trump has stated that vaccines are  not helpful  and in turn has been ridiculed by the corporate media  Of course  Dr  Oz  the darling of that same corporate media  shills for the Establishment and peddles vaccine injections for 
the public rejecting any trepidation about the jabs       So  what can we make of this controversy  It is a fact that vaccine pamphlet inserts state that their product can cause a litany of side effects  including autism despite what CNN s Dr  Gupta says  I propose that when we look at vaccines we should use the same approach most people take when they decide what food and drink enters their body  After all  an injection enables many ingredients to enter the body and yet  for the most part  people take vaccines on blind faith  They have no idea what s in the shot and the  faithfully blind  usually don t even notice when later it s admitted a particular vaccine isn t effective  From this angle  I present some observations made by author Liam Scheff in his book Official Stories  2012    WHAT IS IN A VACCINE   The term  vaccination  comes from the Latin word  vacca   We could call it cow injection  to be true to history  because  and we re never really told this  but the vaccine that made it all famous smallpox came from the sores on the underbellies and legs of cows and horses  Pus and blood were scraped off  put on the ends of small  sharp pronged forks or lancets and jabbed into people s arms  Yes   vacca  means  cow   Does that surprise you  Did we really think that animal blood and pus was a good thing to put into our bodies  Vaccines are regarded as a nearly magical process  like a totem or a crucible  a station of the cross in the Western world  it has replaced baptism as a holy rite  Those who are opposed are mistrusted and feared  almost as witches  certainly as troubled heretics  But no one ever asks the question  what is in a vaccine       Vaccines are not conjured at Hogwarts by honest wizards  Willy Wonka doesn t brew them in his chocolate factory  They are not magical and there is a reason  or many  why some people oppose them so strongly  Vaccines are toxic  by their very nature  The liquid in the syringe is filled with very small pieces of well  a lot 
of things  These materials come from laboratory dishes where putative viruses are grown  But nothing biological can be grown  except in a  medium  or substrate  That is  it takes living tissue to grow living microscopic entities  So  what tissues are vaccines grown in  or really  culled from  It s a living cell matrix  The first substrates were a variety of animal body parts  including spines and brains  rabbits were often used  Sometimes it was pus and blood from a sick animal  Then it was monkey kidneys and testicles  that s what the putative polio virus was grown in  Of course  monkey cells contain monkey proteins  viruses  bacteria  mycoplasmas  and toxins  It is not possible to filter out one microscopic particle from a sea of similarly sized or smaller particles  These particles  proteins  viruses  and cellular debris have been and are being injected into millions of people  in the name of stopping polio and every other disease for which there is a vaccine       Hamster ovaries  washed sheep blood  dog kidney cells and here s a favorite with the Christian crowd aborted human fetal tissue  these are newer substrates  These cells are cultured  fed  stimulated and made to replicate  But you can t just siphon off fluid and cellular detritus from animal cells you have to mix it with chemicals  to insure that it s stable  that it doesn t quite rot and that it truly inflames and agitates the immune system  In addition to the living tissue  vaccines have added to them a series of metals and preservatives  as well as chemical agents sent to aggravate and disturb your cells  Mercury is one of the longest used metals in vaccines  It is a deadly neurotoxin  that on its own causes paralysis and death  as has been known since the time of the ancient Greeks  as recorded in Hippocrates  They also add aluminum  sucrose  MSG  phenol  aspartame and more  Formaldehyde has made it into countless batches  Formaldehyde is used to embalm dead people to keep them from rotting  
Squalene  the infamous adjuvant linked to Gulf War Illness  is a more recent offering  Its job is to agitate your muscles  blood vessels  cells and tissue into an inflamed state  Vaccine manufacturers actively seek this inflammatory response  They feel it helps their vaccine work  But it can also bring on real illness  pain  nausea  cramps  fainting  tremors  seizures  and a long list of neurological response  Sometimes vaccines cause death  sometimes instantly  Yes  that has happened too  Is that good for children  No  it s a toxic poison  But there it goes  into the blood       CONCLUSION  It s increasingly popular to be aware of what food ingredients are entering our bodies look at the backlash against GMOs or the chemical azodicarbonamide  removed from Subway restaurant s sandwich bread after it was exposed that the chemical is also used to manufacture yoga mats and shoe soles   But if you mention any reservations about vaccine ingredients to the average person then they ll likely give you a blank stare in response that s the  blind faith  programming kicking in       Nevertheless  one thing is for certain  vaccine ingredients are toxic and inoculations actually spread viruses  so those concerns should be at least on par with our particular dietary intake qualms  Perhaps  in addition to an apprehensive stance on vaccination regarding the ingredients used  we should also take heed of the messages from Establishment sources on their peculiar affinity for the actual viruses especially for uses beyond the so called public benefaction of vaccines       For example  in the new Hollywood blockbuster  Inferno   a virus is engineered that causes infertility for the purpose of depopulation essentially  a Eugenics fantasy scenario  Likewise certain royalty agrees that viruses which can depopulate are desirable Prince Philip is quoted stating   In the event that I am reincarnated  I would like to return as a deadly virus  to contribute something to solving overpopulation   
Well  a side effect of some virus loaded vaccines is sterility  So many agendas  so many toxins  all crammed into a piercing needle payload               Related  Dr  Oz promotes vaccines but won t give them to his own children)'> of type <class 'pyspark.sql.column.Column'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
%%time
# Logistic regression trained on `df_train_pca` and scored on `df_test_pca`.
# Every hyperparameter is spelled out in one mapping so the configuration can
# be read (and tweaked) without scanning a long constructor call.
lr_params = {
    # column wiring
    'featuresCol': 'features',
    'labelCol': 'label',
    'predictionCol': 'prediction',
    'probabilityCol': 'probability',
    'rawPredictionCol': 'rawPrediction',
    # model form
    'family': 'binomial',
    'fitIntercept': True,
    'threshold': 0.5,
    'standardization': False,
    # optimisation / regularisation
    # (per the Spark docs, elasticNetParam=0 selects a pure L2 penalty)
    'maxIter': 200,
    'regParam': 0.005,
    'elasticNetParam': 0,
    'tol': 1e-06,
    'aggregationDepth': 2,
}
lr_estimator = LogisticRegression(**lr_params)

# `lr_model` ends up holding the fitted model, as before.
lr_model = lr_estimator.fit(df_train_pca)
eval_model_perf(lr_model, df_test_pca)

In [None]:
%%time
# Single decision tree trained on `df_train_pca` and scored on `df_test_pca`.
# Input/output column names are kept apart from the tree hyperparameters so
# each group can be reviewed on its own.
dt_columns = dict(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
    probabilityCol='probability',
    rawPredictionCol='rawPrediction',
)
dt_hyperparams = dict(
    maxDepth=10,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    maxMemoryInMB=2048,
    cacheNodeIds=True,
    checkpointInterval=10,
    impurity='gini',
    seed=666,  # fixed seed, same value as the random-forest cell
)
dt_estimator = DecisionTreeClassifier(**dt_columns, **dt_hyperparams)

# `dt_model` ends up holding the fitted model, as before.
dt_model = dt_estimator.fit(df_train_pca)
eval_model_perf(dt_model, df_test_pca)

In [None]:
%%time
# Random forest trained on `df_train_pca` and scored on `df_test_pca`.
# Parameters are grouped by concern: column wiring, per-tree settings, and
# ensemble-level settings.
rf_params = {
    # column wiring
    'featuresCol': 'features',
    'labelCol': 'label',
    'predictionCol': 'prediction',
    'probabilityCol': 'probability',
    'rawPredictionCol': 'rawPrediction',
    # per-tree settings
    'maxDepth': 10,
    'maxBins': 32,
    'minInstancesPerNode': 1,
    'minInfoGain': 0.0,
    'maxMemoryInMB': 2048,
    'cacheNodeIds': True,
    'checkpointInterval': 10,
    'impurity': 'gini',
    # ensemble-level settings
    'numTrees': 200,
    'featureSubsetStrategy': 'auto',
    'seed': 666,
    'subsamplingRate': 0.8,
}
rf_estimator = RandomForestClassifier(**rf_params)

# `rf_model` ends up holding the fitted model, as before.
rf_model = rf_estimator.fit(df_train_pca)
eval_model_perf(rf_model, df_test_pca)

In [None]:
%%time
# Gradient-boosted trees trained on `df_train_pca` and scored on `df_test_pca`.
# Only the column names and the iteration cap are overridden; every other
# GBTClassifier parameter keeps its library default.
gbt_params = dict(featuresCol='features', labelCol='label', maxIter=250)
gbt_estimator = GBTClassifier(**gbt_params)

# `gbt_model` ends up holding the fitted model, as before.
gbt_model = gbt_estimator.fit(df_train_pca)
eval_model_perf(gbt_model, df_test_pca)

In [None]:
%%time
# Multilayer perceptron trained on `df_train_pca` and scored on `df_test_pca`.
#
# Spark requires layers[0] to equal the length of the feature vectors and
# layers[-1] to equal the number of label classes. The previous hard-coded
# topology [4, 5, 4, 3] (the Spark documentation example) was not tied to this
# dataset, so both ends are now derived from the training data. The two hidden
# layers (5 and 4 units) are kept as they were.
mp_first_row = df_train_pca.select('features').first()
mp_num_inputs = len(mp_first_row['features'])

# Assumes labels are 0-based class indices (0 .. k-1), as Spark ML classifiers
# expect, so the class count is max(label) + 1. NOTE(review): this runs one
# aggregation over the training set; confirm that cost is acceptable.
mp_max_label = df_train_pca.agg(F.max('label')).first()[0]
mp_num_classes = int(mp_max_label) + 1

mp_layers = [mp_num_inputs, 5, 4, mp_num_classes]

mp_classifier = MultilayerPerceptronClassifier(featuresCol='features',
                                               labelCol='label',
                                               predictionCol='prediction',
                                               layers=mp_layers,
                                               maxIter=100,
                                               blockSize=128,
                                               seed=1234)

# `mp_model` ends up holding the fitted model, as before.
mp_model = mp_classifier.fit(df_train_pca)
eval_model_perf(mp_model, df_test_pca)

In [None]:
%%time
print("**********Run Models without PCA Features**********")

# Build the no-PCA preprocessing pipeline and fit it on the training split only;
# the same fitted pipeline is then applied to both splits.
# NOTE(review): 3000 is passed straight to the builder; presumably a
# vocabulary / feature-count size - confirm against its definition.
preproc_pipeline = build_data_preproc_model_without_pca(3000)
preproc_model = preproc_pipeline.fit(df_train)

# Only these two columns are carried forward to model training/evaluation.
model_input_cols = ["label", "features"]

# Each split is transformed and then sampled with take(1) as a quick sanity
# check, in the same order as before (train first, then test).
df_train_wo_pca = preproc_model.transform(df_train).select(*model_input_cols)
print(df_train_wo_pca.take(1))

df_test_wo_pca = preproc_model.transform(df_test).select(*model_input_cols)
print(df_test_wo_pca.take(1))

run_models(df_train_wo_pca, df_test_wo_pca, without_pca=True)

**********Run Models without PCA Features**********


Py4JJavaError: An error occurred while calling o980.transform.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 38.0 failed 4 times, most recent failure: Lost task 0.3 in stage 38.0 (TID 1140, cluster-8866-w-4.asia-southeast1-b.c.weicheng.internal, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 352, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 142, in dump_stream
    for obj in iterator:
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 341, in _batched
    for item in iterator:
  File "<string>", line 1, in <lambda>
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 83, in <lambda>
    return lambda *a: toInternal(f(*a))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 685, in toInternal
    return self._cachedSqlType().toInternal(self.serialize(obj))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/linalg/__init__.py", line 167, in serialize
    raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
TypeError: cannot serialize Column<b'f(            Hamish  a contributor  remarks on how we are encouraged to examine what is in our food but criminalized for rejecting the cocktail of toxins contained in vaccines                       By Hamish   henrymakow com       The Establishment media has reported that there is a growing  anti vax  movement such an uptick in opposition to vaccines that the moniker  anti vaxxers  has entered the modern lexicon       In Ontario  school boards have warned parents  to update their kids  immunization records or face suspension   The provincial legislature has produced legislation that  would require anti vax parents to take an  education session  essentially a re education brainwash  The authoritarian posturing by the Establishment produces a societal chilling effect  people who either refuse vaccines or denounce them are ostracized by society and this kind of systematic shunning is a worldwide trend       In Australia  there is a  vaccine crackdown  announcing an end to religious exemptions and decreeing that  doctors will be given incentive payments so that parents stick to their children s vaccination schedule   In poorer countries  organizations like The Bill and Melinda Gates Foundation  the world s largest private foundation with assets in the billions  distributes vaccines to millions  Adverse consequences are rarely headlined       Major media outlets censure any expos\xc3\xa9 exploring the possibility of vaccine damage and corrupt Establishment collusion like the film Vaxxed  written and directed by Dr  Andrew Wakefield  that was pulled from the Tribeca Film Festival  Celebrity Hollywood stars are scorned like Jim Carrey and Jenny McCarthy for expressing concerns about vaccine safety  Even Donald Trump has stated that vaccines are  not helpful  and in turn has been ridiculed by the corporate media  Of course  Dr  Oz  the darling of that same corporate media  shills for the Establishment and peddles vaccine injections for 
the public rejecting any trepidation about the jabs       So  what can we make of this controversy  It is a fact that vaccine pamphlet inserts state that their product can cause a litany of side effects  including autism despite what CNN s Dr  Gupta says  I propose that when we look at vaccines we should use the same approach most people take when they decide what food and drink enters their body  After all  an injection enables many ingredients to enter the body and yet  for the most part  people take vaccines on blind faith  They have no idea what s in the shot and the  faithfully blind  usually don t even notice when later it s admitted a particular vaccine isn t effective  From this angle  I present some observations made by author Liam Scheff in his book Official Stories  2012    WHAT IS IN A VACCINE   The term  vaccination  comes from the Latin word  vacca   We could call it cow injection  to be true to history  because  and we re never really told this  but the vaccine that made it all famous smallpox came from the sores on the underbellies and legs of cows and horses  Pus and blood were scraped off  put on the ends of small  sharp pronged forks or lancets and jabbed into people s arms  Yes   vacca  means  cow   Does that surprise you  Did we really think that animal blood and pus was a good thing to put into our bodies  Vaccines are regarded as a nearly magical process  like a totem or a crucible  a station of the cross in the Western world  it has replaced baptism as a holy rite  Those who are opposed are mistrusted and feared  almost as witches  certainly as troubled heretics  But no one ever asks the question  what is in a vaccine       Vaccines are not conjured at Hogwarts by honest wizards  Willy Wonka doesn t brew them in his chocolate factory  They are not magical and there is a reason  or many  why some people oppose them so strongly  Vaccines are toxic  by their very nature  The liquid in the syringe is filled with very small pieces of well  a lot 
of things  These materials come from laboratory dishes where putative viruses are grown  But nothing biological can be grown  except in a  medium  or substrate  That is  it takes living tissue to grow living microscopic entities  So  what tissues are vaccines grown in  or really  culled from  It s a living cell matrix  The first substrates were a variety of animal body parts  including spines and brains  rabbits were often used  Sometimes it was pus and blood from a sick animal  Then it was monkey kidneys and testicles  that s what the putative polio virus was grown in  Of course  monkey cells contain monkey proteins  viruses  bacteria  mycoplasmas  and toxins  It is not possible to filter out one microscopic particle from a sea of similarly sized or smaller particles  These particles  proteins  viruses  and cellular debris have been and are being injected into millions of people  in the name of stopping polio and every other disease for which there is a vaccine       Hamster ovaries  washed sheep blood  dog kidney cells and here s a favorite with the Christian crowd aborted human fetal tissue  these are newer substrates  These cells are cultured  fed  stimulated and made to replicate  But you can t just siphon off fluid and cellular detritus from animal cells you have to mix it with chemicals  to insure that it s stable  that it doesn t quite rot and that it truly inflames and agitates the immune system  In addition to the living tissue  vaccines have added to them a series of metals and preservatives  as well as chemical agents sent to aggravate and disturb your cells  Mercury is one of the longest used metals in vaccines  It is a deadly neurotoxin  that on its own causes paralysis and death  as has been known since the time of the ancient Greeks  as recorded in Hippocrates  They also add aluminum  sucrose  MSG  phenol  aspartame and more  Formaldehyde has made it into countless batches  Formaldehyde is used to embalm dead people to keep them from rotting  
Squalene  the infamous adjuvant linked to Gulf War Illness  is a more recent offering  Its job is to agitate your muscles  blood vessels  cells and tissue into an inflamed state  Vaccine manufacturers actively seek this inflammatory response  They feel it helps their vaccine work  But it can also bring on real illness  pain  nausea  cramps  fainting  tremors  seizures  and a long list of neurological response  Sometimes vaccines cause death  sometimes instantly  Yes  that has happened too  Is that good for children  No  it s a toxic poison  But there it goes  into the blood       CONCLUSION  It s increasingly popular to be aware of what food ingredients are entering our bodies look at the backlash against GMOs or the chemical azodicarbonamide  removed from Subway restaurant s sandwich bread after it was exposed that the chemical is also used to manufacture yoga mats and shoe soles   But if you mention any reservations about vaccine ingredients to the average person then they ll likely give you a blank stare in response that s the  blind faith  programming kicking in       Nevertheless  one thing is for certain  vaccine ingredients are toxic and inoculations actually spread viruses  so those concerns should be at least on par with our particular dietary intake qualms  Perhaps  in addition to an apprehensive stance on vaccination regarding the ingredients used  we should also take heed of the messages from Establishment sources on their peculiar affinity for the actual viruses especially for uses beyond the so called public benefaction of vaccines       For example  in the new Hollywood blockbuster  Inferno   a virus is engineered that causes infertility for the purpose of depopulation essentially  a Eugenics fantasy scenario  Likewise certain royalty agrees that viruses which can depopulate are desirable Prince Philip is quoted stating   In the event that I am reincarnated  I would like to return as a deadly virus  to contribute something to solving overpopulation   
Well  a side effect of some virus loaded vaccines is sterility  So many agendas  so many toxins  all crammed into a piercing needle payload               Related  Dr  Oz promotes vaccines but won t give them to his own children)'> of type <class 'pyspark.sql.column.Column'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1892)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1880)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2113)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2062)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2051)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2557)
	at org.apache.spark.sql.Dataset.first(Dataset.scala:2564)
	at org.apache.spark.ml.feature.VectorAssembler$.getVectorLengthsFromFirstRow(VectorAssembler.scala:200)
	at org.apache.spark.ml.feature.VectorAssembler$.getLengths(VectorAssembler.scala:226)
	at org.apache.spark.ml.feature.VectorAssembler.transform(VectorAssembler.scala:96)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 352, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 142, in dump_stream
    for obj in iterator:
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 341, in _batched
    for item in iterator:
  File "<string>", line 1, in <lambda>
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 83, in <lambda>
    return lambda *a: toInternal(f(*a))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 685, in toInternal
    return self._cachedSqlType().toInternal(self.serialize(obj))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/linalg/__init__.py", line 167, in serialize
    raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
TypeError: cannot serialize Column<b'f(            Hamish  a contributor  remarks on how we are encouraged to examine what is in our food but criminalized for rejecting the cocktail of toxins contained in vaccines                       By Hamish   henrymakow com       The Establishment media has reported that there is a growing  anti vax  movement such an uptick in opposition to vaccines that the moniker  anti vaxxers  has entered the modern lexicon       In Ontario  school boards have warned parents  to update their kids  immunization records or face suspension   The provincial legislature has produced legislation that  would require anti vax parents to take an  education session  essentially a re education brainwash  The authoritarian posturing by the Establishment produces a societal chilling effect  people who either refuse vaccines or denounce them are ostracized by society and this kind of systematic shunning is a worldwide trend       In Australia  there is a  vaccine crackdown  announcing an end to religious exemptions and decreeing that  doctors will be given incentive payments so that parents stick to their children s vaccination schedule   In poorer countries  organizations like The Bill and Melinda Gates Foundation  the world s largest private foundation with assets in the billions  distributes vaccines to millions  Adverse consequences are rarely headlined       Major media outlets censure any expos\xc3\xa9 exploring the possibility of vaccine damage and corrupt Establishment collusion like the film Vaxxed  written and directed by Dr  Andrew Wakefield  that was pulled from the Tribeca Film Festival  Celebrity Hollywood stars are scorned like Jim Carrey and Jenny McCarthy for expressing concerns about vaccine safety  Even Donald Trump has stated that vaccines are  not helpful  and in turn has been ridiculed by the corporate media  Of course  Dr  Oz  the darling of that same corporate media  shills for the Establishment and peddles vaccine injections for 
the public rejecting any trepidation about the jabs       So  what can we make of this controversy  It is a fact that vaccine pamphlet inserts state that their product can cause a litany of side effects  including autism despite what CNN s Dr  Gupta says  I propose that when we look at vaccines we should use the same approach most people take when they decide what food and drink enters their body  After all  an injection enables many ingredients to enter the body and yet  for the most part  people take vaccines on blind faith  They have no idea what s in the shot and the  faithfully blind  usually don t even notice when later it s admitted a particular vaccine isn t effective  From this angle  I present some observations made by author Liam Scheff in his book Official Stories  2012    WHAT IS IN A VACCINE   The term  vaccination  comes from the Latin word  vacca   We could call it cow injection  to be true to history  because  and we re never really told this  but the vaccine that made it all famous smallpox came from the sores on the underbellies and legs of cows and horses  Pus and blood were scraped off  put on the ends of small  sharp pronged forks or lancets and jabbed into people s arms  Yes   vacca  means  cow   Does that surprise you  Did we really think that animal blood and pus was a good thing to put into our bodies  Vaccines are regarded as a nearly magical process  like a totem or a crucible  a station of the cross in the Western world  it has replaced baptism as a holy rite  Those who are opposed are mistrusted and feared  almost as witches  certainly as troubled heretics  But no one ever asks the question  what is in a vaccine       Vaccines are not conjured at Hogwarts by honest wizards  Willy Wonka doesn t brew them in his chocolate factory  They are not magical and there is a reason  or many  why some people oppose them so strongly  Vaccines are toxic  by their very nature  The liquid in the syringe is filled with very small pieces of well  a lot 
of things  These materials come from laboratory dishes where putative viruses are grown  But nothing biological can be grown  except in a  medium  or substrate  That is  it takes living tissue to grow living microscopic entities  So  what tissues are vaccines grown in  or really  culled from  It s a living cell matrix  The first substrates were a variety of animal body parts  including spines and brains  rabbits were often used  Sometimes it was pus and blood from a sick animal  Then it was monkey kidneys and testicles  that s what the putative polio virus was grown in  Of course  monkey cells contain monkey proteins  viruses  bacteria  mycoplasmas  and toxins  It is not possible to filter out one microscopic particle from a sea of similarly sized or smaller particles  These particles  proteins  viruses  and cellular debris have been and are being injected into millions of people  in the name of stopping polio and every other disease for which there is a vaccine       Hamster ovaries  washed sheep blood  dog kidney cells and here s a favorite with the Christian crowd aborted human fetal tissue  these are newer substrates  These cells are cultured  fed  stimulated and made to replicate  But you can t just siphon off fluid and cellular detritus from animal cells you have to mix it with chemicals  to insure that it s stable  that it doesn t quite rot and that it truly inflames and agitates the immune system  In addition to the living tissue  vaccines have added to them a series of metals and preservatives  as well as chemical agents sent to aggravate and disturb your cells  Mercury is one of the longest used metals in vaccines  It is a deadly neurotoxin  that on its own causes paralysis and death  as has been known since the time of the ancient Greeks  as recorded in Hippocrates  They also add aluminum  sucrose  MSG  phenol  aspartame and more  Formaldehyde has made it into countless batches  Formaldehyde is used to embalm dead people to keep them from rotting  
Squalene  the infamous adjuvant linked to Gulf War Illness  is a more recent offering  Its job is to agitate your muscles  blood vessels  cells and tissue into an inflamed state  Vaccine manufacturers actively seek this inflammatory response  They feel it helps their vaccine work  But it can also bring on real illness  pain  nausea  cramps  fainting  tremors  seizures  and a long list of neurological response  Sometimes vaccines cause death  sometimes instantly  Yes  that has happened too  Is that good for children  No  it s a toxic poison  But there it goes  into the blood       CONCLUSION  It s increasingly popular to be aware of what food ingredients are entering our bodies look at the backlash against GMOs or the chemical azodicarbonamide  removed from Subway restaurant s sandwich bread after it was exposed that the chemical is also used to manufacture yoga mats and shoe soles   But if you mention any reservations about vaccine ingredients to the average person then they ll likely give you a blank stare in response that s the  blind faith  programming kicking in       Nevertheless  one thing is for certain  vaccine ingredients are toxic and inoculations actually spread viruses  so those concerns should be at least on par with our particular dietary intake qualms  Perhaps  in addition to an apprehensive stance on vaccination regarding the ingredients used  we should also take heed of the messages from Establishment sources on their peculiar affinity for the actual viruses especially for uses beyond the so called public benefaction of vaccines       For example  in the new Hollywood blockbuster  Inferno   a virus is engineered that causes infertility for the purpose of depopulation essentially  a Eugenics fantasy scenario  Likewise certain royalty agrees that viruses which can depopulate are desirable Prince Philip is quoted stating   In the event that I am reincarnated  I would like to return as a deadly virus  to contribute something to solving overpopulation   
Well  a side effect of some virus loaded vaccines is sterility  So many agendas  so many toxins  all crammed into a piercing needle payload               Related  Dr  Oz promotes vaccines but won t give them to his own children)'> of type <class 'pyspark.sql.column.Column'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
%%time
# Binomial logistic regression on the non-PCA feature frame.
# The estimator is configured and fitted in one expression, so `lr_model`
# only ever refers to the fitted model.
lr_model = LogisticRegression(
    # --- column wiring ---
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
    probabilityCol='probability',
    rawPredictionCol='rawPrediction',
    # --- model form ---
    family='binomial',
    fitIntercept=True,
    standardization=False,
    threshold=0.5,
    # --- regularisation (elasticNetParam=0 selects the pure L2 penalty) ---
    regParam=0.005,
    elasticNetParam=0,
    # --- optimiser ---
    maxIter=200,
    tol=1e-06,
    aggregationDepth=2,
).fit(df_train_wo_pca)

# Hand the fitted model and the df_test_wo_pca frame to the notebook's
# evaluation helper (defined in an earlier cell).
eval_model_perf(lr_model, df_test_wo_pca)

In [None]:
%%time
# Single decision tree on the non-PCA feature frame.
# Configured and fitted in one expression, so `dt_model` is always the
# fitted model.
dt_model = DecisionTreeClassifier(
    # --- column wiring ---
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
    probabilityCol='probability',
    rawPredictionCol='rawPrediction',
    # --- tree shape / split criteria ---
    impurity='gini',
    maxDepth=10,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    # --- training resources ---
    maxMemoryInMB=2048,
    cacheNodeIds=True,
    checkpointInterval=10,
    # --- reproducibility ---
    seed=666,
).fit(df_train_wo_pca)

# Hand the fitted model and the df_test_wo_pca frame to the notebook's
# evaluation helper (defined in an earlier cell).
eval_model_perf(dt_model, df_test_wo_pca)

In [None]:
%%time
# Random forest (200 trees) on the non-PCA feature frame.
# Configured and fitted in one expression, so `rf_model` is always the
# fitted model.
rf_model = RandomForestClassifier(
    # --- column wiring ---
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
    probabilityCol='probability',
    rawPredictionCol='rawPrediction',
    # --- ensemble settings ---
    numTrees=200,
    featureSubsetStrategy='auto',
    subsamplingRate=0.8,
    # --- per-tree shape / split criteria ---
    impurity='gini',
    maxDepth=10,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    # --- training resources ---
    maxMemoryInMB=2048,
    cacheNodeIds=True,
    checkpointInterval=10,
    # --- reproducibility ---
    seed=666,
).fit(df_train_wo_pca)

# Hand the fitted model and the df_test_wo_pca frame to the notebook's
# evaluation helper (defined in an earlier cell).
eval_model_perf(rf_model, df_test_wo_pca)

In [None]:
%%time
# Gradient-boosted trees (250 boosting iterations) on the non-PCA feature frame.
# The seed is pinned to the same value the decision-tree and random-forest
# cells use, so a re-run of this cell does not depend on the estimator's
# default seed.
gbt_model = GBTClassifier(featuresCol='features',
                          labelCol='label',
                          maxIter=250,
                          seed=666)

gbt_model = gbt_model.fit(df_train_wo_pca)
# Hand the fitted model and the df_test_wo_pca frame to the notebook's
# evaluation helper (defined in an earlier cell).
eval_model_perf(gbt_model, df_test_wo_pca)

In [None]:
%%time
# Multilayer perceptron on the non-PCA feature frame.
#
# Spark requires layers[0] to equal the length of the feature vector and
# layers[-1] to equal the number of classes. The previous hard-coded
# [4, 5, 4, 3] only fits a 4-feature / 3-class dataset, so both ends are now
# derived from the training frame. The two hidden layers (5 and 4 units) are
# kept as they were.
mp_input_size = len(df_train_wo_pca.select('features').first()['features'])
# Labels are class indices 0..k-1, so k = max(label) + 1.
mp_output_size = int(df_train_wo_pca.agg(F.max('label')).first()[0]) + 1

mp_model = MultilayerPerceptronClassifier(featuresCol='features',
                                          labelCol='label',
                                          predictionCol='prediction',
                                          layers=[mp_input_size, 5, 4, mp_output_size],
                                          maxIter=100,
                                          blockSize=128,
                                          seed=1234)

mp_model = mp_model.fit(df_train_wo_pca)
# Hand the fitted model and the df_test_wo_pca frame to the notebook's
# evaluation helper (defined in an earlier cell).
eval_model_perf(mp_model, df_test_wo_pca)

In [None]:
%%time
# Multinomial Naive Bayes on the non-PCA feature frame.
# Configured and fitted in one expression, so `nb_model` is always the
# fitted model.
nb_model = NaiveBayes(
    # --- column wiring ---
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
    probabilityCol='probability',
    rawPredictionCol='rawPrediction',
    # --- model form ---
    modelType='multinomial',
    smoothing=1,
).fit(df_train_wo_pca)

# Hand the fitted model and the df_test_wo_pca frame to the notebook's
# evaluation helper (defined in an earlier cell).
eval_model_perf(nb_model, df_test_wo_pca)