In [1]:
%%time
import pandas as pd
import os, sys, time, json, re, string, datetime

from pyspark import SparkContext, SparkConf, StorageLevel, keyword_only
from pyspark.sql.types import IntegerType

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.param.shared import HasInputCol, HasInputCols, HasOutputCol, HasOutputCols, Param
from pyspark.ml.feature import OneHotEncoder, HashingTF, IDF, Tokenizer, RegexTokenizer, NGram, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler, PCA, OneHotEncoderEstimator,StringIndexer

from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.classification import GBTClassifier, MultilayerPerceptronClassifier

from pyspark.ml import Pipeline, Transformer

from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from platform import python_version
print(python_version())

3.6.10
CPU times: user 70.8 ms, sys: 7.13 ms, total: 77.9 ms
Wall time: 77.5 ms


In [2]:
%%time
sc

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.54 µs


In [3]:
%%time
spark = SparkSession.builder \
        .appName("fakenews") \
        .config("spark.master", "yarn") \
        .config("spark.submit.deployMode", "cluster") \
        .config("spark.driver.memory", "25g") \
        .config("spark.executor.instances", "5") \
        .config("spark.executor.cores", "4") \
        .config("spark.executor.memory", "25g") \
        .getOrCreate()

CPU times: user 5.79 ms, sys: 298 µs, total: 6.09 ms
Wall time: 9.77 ms


In [4]:
%%time
df_fake_train = pd.read_csv('gs://dataproc-6ca41800-27b4-47d5-abee-55c011dfa389-asia-southeast1/data/fake-news/100k_fake_news_cleaned_dataset.csv')
df_fake_train[['authors_missing']] = df_fake_train[['authors_missing']].astype(int)
df_fake_train['label'] = 1
df_reliable_train = pd.read_csv('gs://dataproc-6ca41800-27b4-47d5-abee-55c011dfa389-asia-southeast1/data/fake-news/100k_reliable_news_cleaned_dataset.csv')
df_reliable_train[['authors_missing']] = df_reliable_train[['authors_missing']].astype(int)
df_reliable_train['label'] = 0
df_news_train = pd.concat([df_fake_train, df_reliable_train])
df_news_train[df_news_train.columns] = df_news_train[df_news_train.columns].astype(str)
print(df_news_train.info())
df_news_train

<class 'pandas.core.frame.DataFrame'>
Int64Index: 217806 entries, 0 to 108010
Data columns (total 7 columns):
domain             217806 non-null object
type               217806 non-null object
content            217806 non-null object
title              217806 non-null object
authors            217806 non-null object
authors_missing    217806 non-null object
label              217806 non-null object
dtypes: object(7)
memory usage: 13.3+ MB
None
CPU times: user 10.3 s, sys: 3.97 s, total: 14.3 s
Wall time: 34.6 s


Unnamed: 0,domain,type,content,title,authors,authors_missing,label
0,beforeitsnews.com,fake,Boehner presses to make tax cuts permanent\n\n...,Boehner presses to make tax cuts permanent,United Liberty,0,1
1,beforeitsnews.com,fake,An Armed Good Samaritan Who Doesn’t Want The S...,An Armed Good Samaritan Who Doesn’t Want The S...,The Real Revo,0,1
2,beforeitsnews.com,fake,(Before It's News)\n\nby Rob Morphy\n\nLegends...,Village Of The Dead: The Anjikuni Mystery,"Rob Morphy, Mort Amsel",0,1
3,teaparty.org,fake,(Breitbart) – With his hysterical gotcha attac...,Trump Calls Out Race-Baiting ABC News Reporter,,1,1
4,beforeitsnews.com,fake,Researchers develop new depression diagnosis a...,Researchers develop new depression diagnosis a...,Bel Marra Health,0,1
...,...,...,...,...,...,...,...
108006,sports.yahoo.com,reliable,"View photos\nMichigan guard Zak Irvin, right, ...",No. 25 Michigan beats Mount St. Mary's 64-47,The Associated Press,0,0
108007,uk.finance.yahoo.com,reliable,President-elect Donald Trump continued his scr...,"Trump quotes Hillary Clinton, rages against Wi...",Maxwell Tani,0,0
108008,www.yahoo.com,reliable,Holiday shoppers eager to snag big discounts t...,Dialing up deals: Black Friday online sales hi...,ALEX VEIGA,0,0
108009,www.reuters.com,reliable,Kabul police raid shisha cafes in crackdown on...,Kabul police raid shisha cafes in crackdown on...,,1,0


In [5]:
%%time
df_news = spark.createDataFrame(df_news_train)
df_news = df_news.withColumn("label", df_news["label"].cast(IntegerType()))
df_news = df_news.withColumn("authors_missing", df_news["authors_missing"].cast(IntegerType()))
df_news.show(10)

+-----------------+----+--------------------+--------------------+--------------------+---------------+-----+
|           domain|type|             content|               title|             authors|authors_missing|label|
+-----------------+----+--------------------+--------------------+--------------------+---------------+-----+
|beforeitsnews.com|fake|Boehner presses t...|Boehner presses t...|      United Liberty|              0|    1|
|beforeitsnews.com|fake|An Armed Good Sam...|An Armed Good Sam...|       The Real Revo|              0|    1|
|beforeitsnews.com|fake|(Before It's News...|Village Of The De...|Rob Morphy, Mort ...|              0|    1|
|     teaparty.org|fake|(Breitbart) – Wit...|Trump Calls Out R...|                 nan|              1|    1|
|beforeitsnews.com|fake|Researchers devel...|Researchers devel...|    Bel Marra Health|              0|    1|
|beforeitsnews.com|fake|Trading Watch Lis...|Trading Watch Lis...|Bulls On Wall Street|              0|    1|
|beforeits

In [6]:
%%time
#remove empty content which will cause problem when transform the text
df_news = df_news.filter(df_news.content != "")
df_news = df_news.filter(df_news.title != "")
df_news = df_news.dropDuplicates(['label', 'content', 'title', 'authors_missing'])

CPU times: user 6.63 ms, sys: 163 µs, total: 6.79 ms
Wall time: 401 ms


In [7]:
# only keep type and content
df_news = df_news.select('label', 'content', 'title','authors_missing')

# split the dataset
df_train, df_test = df_news.randomSplit([0.8, 0.2], seed=666)
param_tuning = False

In [8]:
%%time
# customized transformer class to manually extract some counting based text features
class ReviewContentTransformer(Transformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, inputCol="content", outputCol="content_features"):
        super(ReviewContentTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)


    def _transform(self, dataset):
        
        def f(s):
            uppercase_count = 0
            char_count = 0
            for c in s:                
                if c in string.ascii_uppercase:
                    uppercase_count += 1
                    char_count += 1
                elif c in string.ascii_lowercase:
                    char_count += 1
            
            text_len = len(s)
            return Vectors.dense(text_len, char_count, 
                                 uppercase_count, uppercase_count / (char_count + 1e-10))

        return dataset.withColumn(self.getOutputCol(), 
                                  F.udf(f, VectorUDT())(dataset[self.getInputCol()]))

CPU times: user 93 µs, sys: 19 µs, total: 112 µs
Wall time: 117 µs


In [9]:
%%time
# customized transformer class to manually extract some counting based word features
class ReviewWordsTransformer(Transformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, inputCol="content", outputCol="content_features"):
        super(ReviewWordsTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)


    def _transform(self, dataset):
        
        def f(words):    
            word_count = len(words)
            unique_word_count = len(set(words))
            upper_words = []
            for w in words:
                if w.isupper():
                    upper_words.append(w)
            upper_word_count = len(set(upper_words))
            unique_upper_word_count = len(upper_words)
            return Vectors.dense(word_count, unique_word_count, unique_word_count / (word_count + 1e-10),
                                 upper_word_count, upper_word_count / (word_count + 1e-10), 
                                 unique_upper_word_count, unique_upper_word_count / (upper_word_count + 1e-10))

        return dataset.withColumn(self.getOutputCol(), 
                                  F.udf(f, VectorUDT())(dataset[self.getInputCol()]))

CPU times: user 68 µs, sys: 14 µs, total: 82 µs
Wall time: 85.8 µs


In [10]:
%%time
# show model prediction performance on the given dataset
def eval_model_perf(fitted_model, dataset, label_col="label", prediction_col="prediction", probability_col="probability"):
    pred_dataset = fitted_model.transform(dataset)
    eval_dataset = pred_dataset.select(label_col, prediction_col, probability_col)
    # model performance evaluation
    metricNames = ["accuracy", "f1"]
    model_eval = MulticlassClassificationEvaluator(predictionCol=prediction_col, labelCol=label_col)
    for m in metricNames:
        val = model_eval.evaluate(eval_dataset, {model_eval.metricName: m})
        print(m, " = ", val)
    roc_eval = BinaryClassificationEvaluator(rawPredictionCol=probability_col, labelCol=label_col, metricName="areaUnderROC")
    print("AUC =", roc_eval.evaluate(eval_dataset))    
    return pred_dataset

# show CV param tunning result
def show_cv_results(cv_model):
    for result, param in sorted(zip(cv_model.avgMetrics, cv_model.getEstimatorParamMaps()), reverse=True, key=lambda x: x[0]):
        print(result, " | ", param)

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 14.5 µs


In [11]:
%%time
def build_data_preproc_model_with_pca(vocab_size=5000):
    preproc_steps = [
        RegexTokenizer(inputCol="content", outputCol="all_words", pattern=r"\W"),
        StopWordsRemover(inputCol="all_words", outputCol="words"),
        CountVectorizer(inputCol="words", outputCol="tf_features", vocabSize=vocab_size),
        IDF(inputCol="tf_features", outputCol="tfidf_features"),
        PCA(inputCol="tfidf_features", outputCol="pca_features", k=100),
        
        ReviewContentTransformer(inputCol="content", outputCol="content_features"),
        ReviewWordsTransformer(inputCol="words", outputCol="word_features"),
        
        RegexTokenizer(inputCol="title", outputCol="all_title_words", pattern=r"\W"),
        StopWordsRemover(inputCol="all_title_words", outputCol="title_words"),
        CountVectorizer(inputCol="title_words", outputCol="title_tf_features", vocabSize=100),
        IDF(inputCol="title_tf_features", outputCol="title_tfidf_features"),
        PCA(inputCol="title_tfidf_features", outputCol="title_pca_features", k=100),   
        
        StringIndexer(inputCol="authors_missing", outputCol="authors_missing_indexed", handleInvalid='keep'),
        OneHotEncoder(inputCol="authors_missing_indexed", outputCol="authors_missing_feature"),
        
        VectorAssembler(inputCols=["pca_features", "title_pca_features", "title_tfidf_features", 
                                   "content_features", "word_features", "authors_missing_feature"], 
                        outputCol="features")
    ]
    return Pipeline(stages=preproc_steps)

def build_data_preproc_model_without_pca(vocab_size=5000):
    preproc_steps = [
        RegexTokenizer(inputCol="content", outputCol="all_words", pattern=r"\W"),
        StopWordsRemover(inputCol="all_words", outputCol="words"),
        
        StringIndexer(inputCol="authors_missing", outputCol="authors_missing_indexed", handleInvalid='keep'),
        OneHotEncoder(inputCol="authors_missing_indexed", outputCol="authors_missing_feature"),
        
        ReviewContentTransformer(inputCol="content", outputCol="content_features"),
        ReviewWordsTransformer(inputCol="words", outputCol="word_features"),
        
        VectorAssembler(inputCols=["tf_features", "content_features", 
                                   "word_features", "authors_missing_feature"],
                        outputCol="features")
    ]
    return Pipeline(stages=preproc_steps)

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 9.06 µs


In [12]:
%%time
print("**********Run Models with PCA Features**********")
# generate the features to be used for model training
preproc_model = build_data_preproc_model_with_pca(2000).fit(df_train)
df_train_pca = preproc_model.transform(df_train).select("label", "features")
df_test_pca = preproc_model.transform(df_test).select("label", "features")

**********Run Models with PCA Features**********


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 34444)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
  File "/opt/conda/anaconda/lib/python3.6/socketserver.py", line 320, in 

Py4JError: An error occurred while calling o181.fit

In [13]:
%%time
lr_model = LogisticRegression(featuresCol='features', 
                              labelCol='label', 
                              predictionCol='prediction', 
                              probabilityCol='probability', 
                              rawPredictionCol='rawPrediction',
                              family='binomial', 
                              fitIntercept=True, 
                              threshold=0.5, 
                              standardization=False, 
                              maxIter=200, 
                              regParam=0.005, 
                              elasticNetParam=0, 
                              tol=1e-06, 
                              aggregationDepth=2)

lr_model = lr_model.fit(df_train_pca)
eval_model_perf(lr_model, df_test_pca)  

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occur

Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39983)

In [14]:
%%time
dt_model = DecisionTreeClassifier(featuresCol='features', 
                                  labelCol='label', 
                                  predictionCol='prediction', 
                                  probabilityCol='probability', 
                                  rawPredictionCol='rawPrediction', 
                                  maxDepth=10, maxBins=32, 
                                  minInstancesPerNode=1, 
                                  minInfoGain=0.0, 
                                  maxMemoryInMB=2048, 
                                  cacheNodeIds=True, 
                                  checkpointInterval=10,
                                  impurity='gini', 
                                  seed=666)

dt_model = dt_model.fit(df_train_pca)
eval_model_perf(dt_model, df_test_pca)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39983)

In [15]:
%%time
rf_model = RandomForestClassifier(featuresCol='features', 
                                  labelCol='label', 
                                  predictionCol='prediction', 
                                  probabilityCol='probability', 
                                  rawPredictionCol='rawPrediction',
                                  maxDepth=10, 
                                  maxBins=32, 
                                  minInstancesPerNode=1, 
                                  minInfoGain=0.0, 
                                  maxMemoryInMB=2048, 
                                  cacheNodeIds=True, 
                                  checkpointInterval=10, 
                                  impurity='gini', 
                                  numTrees=200, 
                                  featureSubsetStrategy='auto', 
                                  seed=666, 
                                  subsamplingRate=0.8)

rf_model = rf_model.fit(df_train_pca)
eval_model_perf(rf_model, df_test_pca)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39983)

In [16]:
%%time
gbt_model = GBTClassifier(featuresCol='features', 
                         labelCol='label', 
                         maxIter=250)

gbt_model = gbt_model.fit(df_train_pca)
eval_model_perf(gbt_model, df_test_pca)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39983)

In [17]:
%%time
print("**********Run Models without PCA Features**********")
# generate the features to be used for model training
preproc_model = build_data_preproc_model_without_pca(3000).fit(df_train)
df_train_wo_pca = preproc_model.transform(df_train).select("label", "features")
df_test_wo_pca = preproc_model.transform(df_test).select("label", "features")

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


**********Run Models without PCA Features**********


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39983)

In [18]:
%%time
lr_model = LogisticRegression(featuresCol='features', 
                              labelCol='label', 
                              predictionCol='prediction', 
                              probabilityCol='probability', 
                              rawPredictionCol='rawPrediction',
                              family='binomial', 
                              fitIntercept=True, 
                              threshold=0.5, 
                              standardization=False, 
                              maxIter=200, 
                              regParam=0.005, 
                              elasticNetParam=0, 
                              tol=1e-06, 
                              aggregationDepth=2)

lr_model = lr_model.fit(df_train_wo_pca)    
eval_model_perf(lr_model, df_test_wo_pca)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39983)

In [19]:
%%time
dt_model = DecisionTreeClassifier(featuresCol='features', 
                                  labelCol='label', 
                                  predictionCol='prediction', 
                                  probabilityCol='probability', 
                                  rawPredictionCol='rawPrediction', 
                                  maxDepth=10, maxBins=32, 
                                  minInstancesPerNode=1, 
                                  minInfoGain=0.0, 
                                  maxMemoryInMB=2048, 
                                  cacheNodeIds=True, 
                                  checkpointInterval=10,
                                  impurity='gini', 
                                  seed=666)

dt_model = dt_model.fit(df_train_wo_pca)
eval_model_perf(dt_model, df_test_wo_pca)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39983)

In [20]:
%%time
rf_model = RandomForestClassifier(featuresCol='features', 
                                  labelCol='label', 
                                  predictionCol='prediction', 
                                  probabilityCol='probability', 
                                  rawPredictionCol='rawPrediction',
                                  maxDepth=10, 
                                  maxBins=32, 
                                  minInstancesPerNode=1, 
                                  minInfoGain=0.0, 
                                  maxMemoryInMB=2048, 
                                  cacheNodeIds=True, 
                                  checkpointInterval=10, 
                                  impurity='gini', 
                                  numTrees=200, 
                                  featureSubsetStrategy='auto', 
                                  seed=666, 
                                  subsamplingRate=0.8)

rf_model = rf_model.fit(df_train_wo_pca)
eval_model_perf(rf_model, df_test_wo_pca)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39983)

In [21]:
%%time
gbt_model = GBTClassifier(featuresCol='features', 
                         labelCol='label', 
                         maxIter=250)

gbt_model = gbt_model.fit(df_train_wo_pca)
eval_model_perf(gbt_model, df_test_wo_pca)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39983)

In [22]:
%%time
nb_model = NaiveBayes(featuresCol='features', 
                              labelCol='label', 
                              predictionCol='prediction', 
                              probabilityCol='probability', 
                              rawPredictionCol='rawPrediction', 
                              smoothing=1, 
                              modelType='multinomial')

nb_model = nb_model.fit(df_train_wo_pca)
eval_model_perf(nb_model, df_test_wo_pca)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39983)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39983)