In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [4]:
data_true = spark.read.csv("Du lieu cung cap\Fake-and-real-news-dataset\True.csv", inferSchema=True, sep=',', header=True)
data_fake = spark.read.csv("Du lieu cung cap\Fake-and-real-news-dataset\Fake.csv", inferSchema=True, sep=',' , header=True)

In [5]:
from pyspark.sql.functions import lit, concat, col, udf
from pyspark.sql.functions import isnan, when, count

In [6]:
data_true = data_true.withColumn("class", lit(1))
data_true.show(5)

+--------------------+--------------------+------------+------------------+-----+
|               title|                text|     subject|              date|class|
+--------------------+--------------------+------------+------------------+-----+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |    1|
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 |    1|
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |    1|
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 |    1|
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 |    1|
+--------------------+--------------------+------------+------------------+-----+
only showing top 5 rows



In [7]:
data_true.count()

21417

In [8]:
data_fake = data_fake.withColumn("class", lit(0))
data_fake.show(5)

+--------------------+--------------------+-------+-----------------+-----+
|               title|                text|subject|             date|class|
+--------------------+--------------------+-------+-----------------+-----+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|    0|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|    0|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|    0|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|    0|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|    0|
+--------------------+--------------------+-------+-----------------+-----+
only showing top 5 rows



In [9]:
data_fake.count()

23489

In [10]:
data = data_true.unionAll(data_fake)
data.count()

44906

In [11]:
data.show()

+--------------------+--------------------+--------------------+------------------+-----+
|               title|                text|             subject|              date|class|
+--------------------+--------------------+--------------------+------------------+-----+
|As U.S. budget fi...|WASHINGTON (Reute...|        politicsNews|December 31, 2017 |    1|
|U.S. military to ...|WASHINGTON (Reute...|        politicsNews|December 29, 2017 |    1|
|Senior U.S. Repub...|WASHINGTON (Reute...|        politicsNews|December 31, 2017 |    1|
|FBI Russia probe ...|WASHINGTON (Reute...|        politicsNews|December 30, 2017 |    1|
|Trump wants Posta...|SEATTLE/WASHINGTO...|        politicsNews|December 29, 2017 |    1|
|White House, Cong...|WEST PALM BEACH, ...|        politicsNews|December 29, 2017 |    1|
|Trump says Russia...|WEST PALM BEACH, ...|        politicsNews|December 29, 2017 |    1|
|Factbox: Trump on...|The following sta...|        politicsNews|December 29, 2017 |    1|
|Trump on 

In [12]:
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- class: integer (nullable = false)



### Clean and Prepare the Data

In [13]:
# Kiểm tra null
data.select([count(when(col(c).isNull(), c)).alias(c) for c in data.columns]).toPandas().T

Unnamed: 0,0
title,0
text,8
subject,8
date,8
class,0


In [14]:
# Xóa dữ liệu null
data = data.na.drop()
data.count()

44898

In [15]:
# concat title and text
data = data.withColumn('title_text', concat(col('title'),lit('_'), col('text')))

In [16]:
data.show(5)

+--------------------+--------------------+------------+------------------+-----+--------------------+
|               title|                text|     subject|              date|class|          title_text|
+--------------------+--------------------+------------+------------------+-----+--------------------+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |    1|As U.S. budget fi...|
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 |    1|U.S. military to ...|
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |    1|Senior U.S. Repub...|
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 |    1|FBI Russia probe ...|
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 |    1|Trump wants Posta...|
+--------------------+--------------------+------------+------------------+-----+--------------------+
only showing top 5 rows



In [17]:
from pyspark.sql.functions import length

In [18]:
data = data.withColumn('length',length(data['title_text']))

In [19]:
data.show(5)

+--------------------+--------------------+------------+------------------+-----+--------------------+------+
|               title|                text|     subject|              date|class|          title_text|length|
+--------------------+--------------------+------------+------------------+-----+--------------------+------+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |    1|As U.S. budget fi...|  4724|
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 |    1|U.S. military to ...|  4142|
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |    1|Senior U.S. Repub...|  2850|
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 |    1|FBI Russia probe ...|  2521|
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 |    1|Trump wants Posta...|  5274|
+--------------------+--------------------+------------+------------------+-----+--------------------+------+
only showi

In [20]:
data.groupby('class').mean().show()

+-----+----------+------------------+
|class|avg(class)|       avg(length)|
+-----+----------+------------------+
|    1|       1.0|2438.5372834664054|
|    0|       0.0|2571.2280567267153|
+-----+----------+------------------+



In [21]:
data = data.select(['title_text','class','length'])

### Feature Transformations

In [22]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer
tokenizer = Tokenizer(inputCol="title_text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
ham_spam_to_num = StringIndexer(inputCol='class',outputCol='label')

In [23]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [24]:
clean_up = VectorAssembler(inputCols=['tf_idf','length'], outputCol='features')

### The Model NaiveBayes

In [25]:
from pyspark.ml.classification import NaiveBayes

In [26]:
# Use defaults
nb = NaiveBayes()

### Pipeline

In [27]:
from pyspark.ml import Pipeline

In [28]:
data_prep_pipe = Pipeline(stages=[ham_spam_to_num,
                                  tokenizer,
                                  stopremove,
                                  count_vec,
                                  idf,
                                  clean_up])

In [29]:
cleaner = data_prep_pipe.fit(data)

In [30]:
clean_data = cleaner.transform(data)

### Training and Evaluation!

In [31]:
clean_data = clean_data.select(['label','features'])

In [32]:
clean_data.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(262145,[0,1,2,3,...|
|  1.0|(262145,[1,2,3,4,...|
|  1.0|(262145,[0,1,2,3,...|
|  1.0|(262145,[1,2,3,4,...|
|  1.0|(262145,[0,1,2,3,...|
|  1.0|(262145,[0,1,2,3,...|
|  1.0|(262145,[1,2,3,4,...|
|  1.0|(262145,[0,1,3,4,...|
|  1.0|(262145,[0,1,3,4,...|
|  1.0|(262145,[2,3,7,10...|
+-----+--------------------+
only showing top 10 rows



In [33]:
(training,testing) = clean_data.randomSplit([0.7,0.3])

In [34]:
fake_predictor = nb.fit(training)

In [35]:
data.printSchema()

root
 |-- title_text: string (nullable = true)
 |-- class: integer (nullable = false)
 |-- length: integer (nullable = true)



In [36]:
test_results = fake_predictor.transform(testing)

In [37]:
test_results.show(10)

+-----+--------------------+--------------------+-----------+----------+
|label|            features|       rawPrediction|probability|prediction|
+-----+--------------------+--------------------+-----------+----------+
|  1.0|(262145,[0,1,2,3,...|[-32146.980307036...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-28534.798244553...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-26562.906324341...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-28457.565959218...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-23902.471395510...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-26165.082756125...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-37710.906580411...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-22665.097681395...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-15866.967958237...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-16232.740012683...|  [0.0,1.0]|       1.0|
+-----+--------------------+--------------------+--

In [38]:
test_results.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 6332|
|  1.0|       0.0|   49|
|  0.0|       1.0|  171|
|  0.0|       0.0| 6939|
+-----+----------+-----+



In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [40]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting fake news was: {}".format(acc))

Accuracy of model at predicting fake news was: 0.9836994806845225


In [41]:
## Mô hình dự đoán có kết quả cao ~ 98%

## Logistic Regression 

In [42]:
from pyspark.ml.classification import  LogisticRegression

In [43]:
lg = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

In [44]:
predictor_1 = lg.fit(training)

In [45]:
test_results_1 = predictor_1.transform(testing)

In [46]:
test_results_1.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 6367|
|  1.0|       0.0|   14|
|  0.0|       1.0|   83|
|  0.0|       0.0| 7027|
+-----+----------+-----+



In [47]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results_1)
print("Accuracy of model at predicting fake news was: {}".format(acc))

Accuracy of model at predicting fake news was: 0.9928118248165718


In [48]:
## Mô hình dự đoán có kết quả cao ~ 99%