In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start the Spark session for this notebook (getOrCreate returns an existing
# session if one is already active instead of building a second one).
spark = (
    SparkSession.builder
    .appName('original_NB')
    .getOrCreate()
)

In [3]:
# Read the combined NYT articles CSV, taking column names from the first row.
# No schema is supplied or inferred here, so Spark loads every column as a string.
start_data = spark.read.csv("NYT_Articles_Combined_class.csv", header=True)
start_data.show()

+---+-------+-------+--------------------+--------------------+----+
|_c0|Country|  class|           Headlines|            Snippets|Year|
+---+-------+-------+--------------------+--------------------+----+
|  0|Burundi|Unhappy|Burundi Quits Int...|It became the fir...|2017|
|  1|Burundi|Unhappy|We’re Not Done Ye...|The International...|2017|
|  2|Burundi|Unhappy|U.N. Group Accuse...|A panel of invest...|2017|
|  3|Burundi|Unhappy|Raising Fears of ...|The country’s low...|2016|
|  4|Burundi|Unhappy|Burundi Robotics ...|The police say fo...|2017|
|  5|Burundi|Unhappy|Assassination in ...|The environment m...|2017|
|  6|Burundi|Unhappy|Jewish Philanthro...|“Love the strange...|2017|
|  7|Burundi|Unhappy|Burundians, Fleei...|It is the final s...|2017|
|  8|Burundi|Unhappy|36 Burundian Refu...|The Burundians ha...|2017|
|  9|Burundi|Unhappy|Canada Letter: Sp...|A weekly collecti...|2017|
| 10|Burundi|Unhappy|Burundi Quits Int...|It became the fir...|2017|
| 11|Burundi|Unhappy|We’re Not Don

In [4]:
# Add the character count of each snippet as a 'length' column; it is combined
# with the TF-IDF vector later, when the final feature vector is assembled.
from pyspark.sql.functions import length
data = start_data.select('*', length(start_data.Snippets).alias('length'))
data.show()

+---+-------+-------+--------------------+--------------------+----+------+
|_c0|Country|  class|           Headlines|            Snippets|Year|length|
+---+-------+-------+--------------------+--------------------+----+------+
|  0|Burundi|Unhappy|Burundi Quits Int...|It became the fir...|2017|   143|
|  1|Burundi|Unhappy|We’re Not Done Ye...|The International...|2017|   148|
|  2|Burundi|Unhappy|U.N. Group Accuse...|A panel of invest...|2017|   167|
|  3|Burundi|Unhappy|Raising Fears of ...|The country’s low...|2016|   167|
|  4|Burundi|Unhappy|Burundi Robotics ...|The police say fo...|2017|   147|
|  5|Burundi|Unhappy|Assassination in ...|The environment m...|2017|   156|
|  6|Burundi|Unhappy|Jewish Philanthro...|“Love the strange...|2017|   140|
|  7|Burundi|Unhappy|Burundians, Fleei...|It is the final s...|2017|   149|
|  8|Burundi|Unhappy|36 Burundian Refu...|The Burundians ha...|2017|   143|
|  9|Burundi|Unhappy|Canada Letter: Sp...|A weekly collecti...|2017|    70|
| 10|Burundi

In [5]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

# Declare the feature-engineering stages. Nothing is computed in this cell:
# each stage only records which column it reads and which column it writes.

# 'class' string -> numeric 'label'
happy_unhappy_to_num = StringIndexer(
    inputCol="class",
    outputCol="label",
)

# 'Snippets' text -> list of tokens
tokenizer = Tokenizer(
    inputCol="Snippets",
    outputCol="token_text",
)

# drop stop words from the token list
stopremove = StopWordsRemover(
    inputCol="token_text",
    outputCol="stop_tokens",
)

# remaining tokens -> hashed term-frequency vector
hashingTF = HashingTF(
    inputCol="stop_tokens",
    outputCol="hash_token",
)

# term frequencies -> TF-IDF weighted vector
idf = IDF(
    inputCol="hash_token",
    outputCol="idf_token",
)


In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Concatenate the TF-IDF vector and the snippet length into a single
# 'features' vector column for the classifier.
clean_up = VectorAssembler(
    inputCols=['idf_token', 'length'],
    outputCol='features',
)

In [7]:
# Chain the stages into one data-preparation Pipeline; stages run in list order,
# so each stage's output column exists before the next stage reads it.
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(
    stages=[
        happy_unhappy_to_num,  # class       -> label
        tokenizer,             # Snippets    -> token_text
        stopremove,            # token_text  -> stop_tokens
        hashingTF,             # stop_tokens -> hash_token
        idf,                   # hash_token  -> idf_token
        clean_up,              # idf_token + length -> features
    ]
)

In [8]:
# Fit every pipeline stage on the full dataset, then apply the fitted pipeline
# to that same dataset to add the label and feature columns.
# NOTE(review): this runs before the train/test split, so the IDF weights are
# learned from rows that later land in the testing set.
cleaner = data_prep_pipeline.fit(dataset=data)
cleaned = cleaner.transform(dataset=data)

In [9]:
# Display the numeric happy/unhappy label beside the assembled feature vector,
# without truncating the vector contents.
cleaned.select('label', 'features').show(truncate=False)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+-----+---------------------------------------------------------------------------------------------------------------

In [10]:
# Break data down into a training set (~70%) and a testing set (~30%).
# A fixed seed keeps the split, and therefore the fitted model and its score,
# repeatable across Restart & Run All; without it every run draws a new split.
# NOTE(review): the data preview shows what look like repeated articles
# (e.g. rows 0 and 10); a random split can place copies on both sides and
# inflate the test score. Consider de-duplicating before splitting.
(training, testing) = cleaned.randomSplit([0.7, 0.3], seed=42)

In [11]:
from pyspark.ml.classification import NaiveBayes

# Train a multinomial Naive Bayes classifier (smoothing of 1.0) on the
# training split only; the testing split is held back for evaluation.
nb = NaiveBayes(
    modelType='multinomial',
    smoothing=1.0,
)
unhappy_predictor = nb.fit(training)

In [12]:
# Score the held-out testing split with the fitted model, then preview the
# first five rows (predictions are appended as extra columns).
test_results = unhappy_predictor.transform(testing)
test_results.show(n=5)

+---+-------+-------+--------------------+--------------------+----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|_c0|Country|  class|           Headlines|            Snippets|Year|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+---+-------+-------+--------------------+--------------------+----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  0|Denmark|  Happy|36 Hours in Aarhu...|This small and sc...|2017|   137|  0.0|[this, small, and...|[small, scholarly...|(262144,[17371,49...|(262144,[17371,49...|(262145,[17371,49...|[-364.91376937458...|[1.0,1.2612110817...|       0.0|
|  1| Norway|  Happy|In Certifiably Ha..

In [13]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# The evaluator's default metric is F1, not accuracy, so accuracy has to be
# requested explicitly for the printed number to match its label.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
acc = acc_eval.evaluate(test_results)
print(f"Accuracy of model at predicting unhappiness was: {acc}")

Accuracy of model at predicting unahhpiness was: 1.0


In [2]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification


In [None]:
# Generate a small synthetic two-class dataset and plot it, coloured by class.
# random_state pins the otherwise random sample so the figure is reproducible
# on every run.
X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=2,
                             n_clusters_per_class=1, random_state=42)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.set_title("Two informative features, one cluster per class", fontsize='small')
ax.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
           s=25, edgecolor='k')
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
plt.show()

In [14]:
# Stop the Spark session created at the top of the notebook.
spark.stop()