In [118]:
from pyspark.sql import SparkSession

In [119]:
spark = SparkSession.builder.appName('original_NB').getOrCreate()

In [121]:
start_data = spark.read.format("csv").option("header", "true").load("NYT_Articles_Combined_class.csv")
start_data.show()

+---+-------+-------+--------------------+--------------------+----+
|_c0|Country|  class|           Headlines|            Snippets|Year|
+---+-------+-------+--------------------+--------------------+----+
|  0|Burundi|Unhappy|Burundi Quits Int...|It became the fir...|2017|
|  1|Burundi|Unhappy|We’re Not Done Ye...|The International...|2017|
|  2|Burundi|Unhappy|U.N. Group Accuse...|A panel of invest...|2017|
|  3|Burundi|Unhappy|Raising Fears of ...|The country’s low...|2016|
|  4|Burundi|Unhappy|Burundi Robotics ...|The police say fo...|2017|
|  5|Burundi|Unhappy|Assassination in ...|The environment m...|2017|
|  6|Burundi|Unhappy|Jewish Philanthro...|“Love the strange...|2017|
|  7|Burundi|Unhappy|Burundians, Fleei...|It is the final s...|2017|
|  8|Burundi|Unhappy|36 Burundian Refu...|The Burundians ha...|2017|
|  9|Burundi|Unhappy|Canada Letter: Sp...|A weekly collecti...|2017|
| 10|Burundi|Unhappy|Burundi Quits Int...|It became the fir...|2017|
| 11|Burundi|Unhappy|We’re Not Don

In [122]:
# Create a length column to be used as a future feature 
from pyspark.sql.functions import length
data = start_data.withColumn('length', length(start_data['Snippets']))
data.show()

+---+-------+-------+--------------------+--------------------+----+------+
|_c0|Country|  class|           Headlines|            Snippets|Year|length|
+---+-------+-------+--------------------+--------------------+----+------+
|  0|Burundi|Unhappy|Burundi Quits Int...|It became the fir...|2017|   143|
|  1|Burundi|Unhappy|We’re Not Done Ye...|The International...|2017|   148|
|  2|Burundi|Unhappy|U.N. Group Accuse...|A panel of invest...|2017|   167|
|  3|Burundi|Unhappy|Raising Fears of ...|The country’s low...|2016|   167|
|  4|Burundi|Unhappy|Burundi Robotics ...|The police say fo...|2017|   147|
|  5|Burundi|Unhappy|Assassination in ...|The environment m...|2017|   156|
|  6|Burundi|Unhappy|Jewish Philanthro...|“Love the strange...|2017|   140|
|  7|Burundi|Unhappy|Burundians, Fleei...|It is the final s...|2017|   149|
|  8|Burundi|Unhappy|36 Burundian Refu...|The Burundians ha...|2017|   143|
|  9|Burundi|Unhappy|Canada Letter: Sp...|A weekly collecti...|2017|    70|
| 10|Burundi

In [123]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

# Create all the features to the data set
happy_unhappy_to_num = StringIndexer(inputCol='class',outputCol='label')
tokenizer = Tokenizer(inputCol="Snippets", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')


In [124]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [125]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[happy_unhappy_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [126]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data)
cleaned = cleaner.transform(data)

In [127]:
# Show label of happy unhappy and resulting features
cleaned.select(['label', 'features']).show(truncate=False)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+-----+---------------------------------------------------------------------------------------------------------------

In [128]:
# Break data down into a training set and a testing set
(training, testing) = cleaned.randomSplit([0.7, 0.3])

In [129]:
from pyspark.ml.classification import NaiveBayes

# Create a Naive Bayes model and fit training data
nb = NaiveBayes(smoothing=1.0, modelType='multinomial')
unhappy_predictor = nb.fit(training)

In [130]:
# Tranform the model with the testing data
test_results = unhappy_predictor.transform(testing)
test_results.show(1)

+---+-------+-----+--------------------+--------------------+----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|_c0|Country|class|           Headlines|            Snippets|Year|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+---+-------+-----+--------------------+--------------------+----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  0| Norway|Happy|In Norway, Fighti...|A young indigenou...|2017|   183|  0.0|[a, young, indige...|[young, indigenou...|(262144,[19675,47...|(262144,[19675,47...|(262145,[19675,47...|[-465.96967057829...|[1.0,4.4214595090...|       0.0|
+---+-------+-----+--------------------+--------

In [131]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print(f"Accuracy of model at predicting unahhpiness was: {acc}")

Accuracy of model at predicting unahhpiness was: 1.0


In [132]:
test_results

DataFrame[_c0: string, Country: string, class: string, Headlines: string, Snippets: string, Year: string, length: int, label: double, token_text: array<string>, stop_tokens: array<string>, hash_token: vector, idf_token: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [133]:
# read other countries csv
start_data = spark.read.format("csv").option("header", "true").load("Data/NYT Articles/iceland_NYT_Data.csv")
start_data.show()

+---+--------------------+--------------------+----+-------+-----+
|_c0|           Headlines|            Snippets|Year|Country|class|
+---+--------------------+--------------------+----+-------+-----+
|  0|Iceland’s Governm...|The prime ministe...|2017|iceland|happy|
|  1|Can You Find Icel...|What else do you ...|2017|iceland|happy|
|  2|Vikings Razed the...|The country lost ...|2017|iceland|happy|
|  3|Iceland’s Indepen...|But the conservat...|2017|iceland|happy|
|  4|Iceland Goes to P...|As Iceland votes ...|2017|iceland|happy|
|  5|Iceland Becomes S...|Iceland made a st...|2017|iceland|happy|
|  6|Deadly Deeds, Fro...|New crime novels ...|2017|iceland|happy|
|  7|American Companie...|The Trump adminis...|2017|iceland|happy|
|  8|Review: ‘I Rememb...|The director Oska...|2017|iceland|happy|
|  9|Cool Cruises Arou...|Glaciers, fjords,...|2017|iceland|happy|
| 10|Iceland’s Governm...|The prime ministe...|2017|iceland|happy|
| 11|Can You Find Icel...|What else do you ...|2017|iceland|ha

In [134]:
# Create a length column to be used as a future feature 
from pyspark.sql.functions import length
data = start_data.withColumn('length', length(start_data['Snippets']))
data.show()

+---+--------------------+--------------------+----+-------+-----+------+
|_c0|           Headlines|            Snippets|Year|Country|class|length|
+---+--------------------+--------------------+----+-------+-----+------+
|  0|Iceland’s Governm...|The prime ministe...|2017|iceland|happy|   139|
|  1|Can You Find Icel...|What else do you ...|2017|iceland|happy|    98|
|  2|Vikings Razed the...|The country lost ...|2017|iceland|happy|   104|
|  3|Iceland’s Indepen...|But the conservat...|2017|iceland|happy|   116|
|  4|Iceland Goes to P...|As Iceland votes ...|2017|iceland|happy|   126|
|  5|Iceland Becomes S...|Iceland made a st...|2017|iceland|happy|   153|
|  6|Deadly Deeds, Fro...|New crime novels ...|2017|iceland|happy|   147|
|  7|American Companie...|The Trump adminis...|2017|iceland|happy|   118|
|  8|Review: ‘I Rememb...|The director Oska...|2017|iceland|happy|   137|
|  9|Cool Cruises Arou...|Glaciers, fjords,...|2017|iceland|happy|    82|
| 10|Iceland’s Governm...|The prime mi

In [135]:
# Create all the features to the data set
happy_unhappy_to_num = StringIndexer(inputCol='class',outputCol='label')
tokenizer = Tokenizer(inputCol="Snippets", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [136]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [137]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[happy_unhappy_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [138]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data)
cleaned = cleaner.transform(data)

In [139]:
# Show label of happy unhappy and resulting features
cleaned.select(['label', 'features']).show(truncate=False)

+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+-----+-------------------------------------------------------------------------------------------------------------

In [140]:
# Tranform the model with the testing data
test_results = unhappy_predictor.transform(cleaned)
test_results.show()

+---+--------------------+--------------------+----+-------+-----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|_c0|           Headlines|            Snippets|Year|Country|class|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+---+--------------------+--------------------+----+-------+-----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  0|Iceland’s Governm...|The prime ministe...|2017|iceland|happy|   139|  0.0|[the, prime, mini...|[prime, minister,...|(262144,[32927,33...|(262144,[32927,33...|(262145,[32927,33...|[-388.31456811089...|[1.0,1.6060848403...|       0.0|
|  1|Can You Find Icel...|What else do you ...|2

In [141]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print(f"Accuracy of model at predicting unahhpiness was: {acc}")

Accuracy of model at predicting unahhpiness was: 1.0


In [142]:
spark.stop()