In [1]:
!apt update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar -xvf spark-3.3.0-bin-hadoop3.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"
import findspark
findspark.init()

[33m0% [Working][0m            Hit:1 http://security.ubuntu.com/ubuntu focal-security InRelease
[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Connected to cloud.r-pr[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu focal InRelease
                                                                               Hit:3 http://archive.ubuntu.com/ubuntu focal-updates InRelease
                                                                               Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
                                                                               Hit:5 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
[33m0% [Waiting for headers] [Connected to cloud.r-project.org (65.9.86.12)] [Conne[0m                                                                               Hit:6 http://archive.ubuntu.com/ubuntu focal-backports I

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
%cd '/content/gdrive/My Drive/LDS9/'

/content/gdrive/My Drive/LDS9


In [4]:
import findspark
findspark.init()

In [5]:
import pyspark

In [6]:
from pyspark.sql import SparkSession

In [7]:
spark = SparkSession.builder.appName('cau2').getOrCreate()

In [8]:
fake = spark.read.csv("fake-and-real-news-dataset/Fake.csv",inferSchema=True, header=True)

In [9]:
from pyspark.sql.functions import lit
fake = fake.withColumn("class",lit('fake'))

In [10]:
true = spark.read.csv("fake-and-real-news-dataset/True.csv",inferSchema=True, header=True)

In [11]:
from pyspark.sql.functions import lit
true = true.withColumn("class",lit('true'))

In [12]:
df = true.union(fake)
df.show(truncate=False)

+------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
#select 2 cột cần
df = df.select(['class','text'])

### clean data

In [14]:
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import isnan, when, count, col

In [15]:
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).toPandas().T

Unnamed: 0,0
class,0
text,0


In [16]:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).toPandas().T

Unnamed: 0,0
class,0
text,8


In [17]:
#số lượng dòng null ít nên ta drop luôn
df = df.dropna(how="any", subset=["text"])

In [18]:
from pyspark.sql.functions import length
df = df.withColumn('length',length(df['text']))

In [19]:
df.groupby('class').mean().show()

+-----+------------------+
|class|       avg(length)|
+-----+------------------+
| true| 2372.866507914274|
| fake|2476.0307482645544|
+-----+------------------+



### Feature Transformations

In [20]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
fake_true_to_num = StringIndexer(inputCol='class',outputCol='label')

In [21]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [22]:
clean_up = VectorAssembler(inputCols=['tf_idf','length'],
 outputCol='features')

### thuật toán

In [23]:
from pyspark.ml.classification import NaiveBayes

In [24]:
nb = NaiveBayes()

### pipeline

In [25]:
from pyspark.ml import Pipeline

In [26]:
data_prep_pipe = Pipeline(stages=[fake_true_to_num,
 tokenizer,
 stopremove,
 count_vec,
 idf,
 clean_up])

In [27]:
cleaner = data_prep_pipe.fit(df)

In [28]:
clean_data = cleaner.transform(df)


### Training and Evaluation!

In [29]:
clean_data = clean_data.select(['label','features'])

In [30]:
clean_data.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(262145,[0,1,2,3,...|
|  1.0|(262145,[1,2,3,4,...|
|  1.0|(262145,[0,1,2,3,...|
|  1.0|(262145,[1,2,3,4,...|
|  1.0|(262145,[0,1,2,3,...|
|  1.0|(262145,[0,1,2,3,...|
|  1.0|(262145,[1,2,3,4,...|
|  1.0|(262145,[0,1,3,4,...|
|  1.0|(262145,[0,3,4,7,...|
|  1.0|(262145,[2,3,7,11...|
+-----+--------------------+
only showing top 10 rows



In [31]:
(training,testing) = clean_data.randomSplit([0.7,0.3])

In [32]:
fake_predictor = nb.fit(training)

In [33]:
test_results = fake_predictor.transform(testing)

In [34]:
test_results.show(10)

+-----+--------------------+--------------------+-----------+----------+
|label|            features|       rawPrediction|probability|prediction|
+-----+--------------------+--------------------+-----------+----------+
|  1.0|(262145,[0,1,2,3,...|[-28601.139384559...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-45395.329228463...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-33501.941258961...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-34502.162250968...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-29563.726578407...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-25836.502210834...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-22927.274778115...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-25741.586223776...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-26242.228394680...|  [0.0,1.0]|       1.0|
|  1.0|(262145,[0,1,2,3,...|[-34961.549325950...|  [0.0,1.0]|       1.0|
+-----+--------------------+--------------------+--

In [35]:
test_results.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 6319|
|  1.0|       0.0|   45|
|  0.0|       1.0|  164|
|  0.0|       0.0| 7011|
+-----+----------+-----+



In [36]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [37]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting fake was: {}".format(acc))

Accuracy of model at predicting fake was: 0.9845700670211968


thuật toán naive bayes đoán khá chính xác

### sử dụng random forest

In [54]:
from pyspark.ml.classification import LinearSVC
LinearSVC_classifier = LinearSVC(labelCol="label", maxIter=50)

In [55]:
fake_predictor = LinearSVC_classifier.fit(training)

In [56]:
test_results = fake_predictor.transform(testing)

In [57]:
test_results.show(10)

+-----+--------------------+--------------------+----------+
|label|            features|       rawPrediction|prediction|
+-----+--------------------+--------------------+----------+
|  1.0|(262145,[0,1,2,3,...|[6.21094190306376...|       0.0|
|  1.0|(262145,[0,1,2,3,...|[1.06525454119501...|       0.0|
|  1.0|(262145,[0,1,2,3,...|[1.47649427896008...|       0.0|
|  1.0|(262145,[0,1,2,3,...|[4.65248243134752...|       0.0|
|  1.0|(262145,[0,1,2,3,...|[3.49845185026153...|       0.0|
|  1.0|(262145,[0,1,2,3,...|[5.22877763998388...|       0.0|
|  1.0|(262145,[0,1,2,3,...|[-5606036.1218384...|       1.0|
|  1.0|(262145,[0,1,2,3,...|[2.31473820466824...|       0.0|
|  1.0|(262145,[0,1,2,3,...|[4.27825186582687...|       0.0|
|  1.0|(262145,[0,1,2,3,...|[6.09289824271324...|       0.0|
+-----+--------------------+--------------------+----------+
only showing top 10 rows



In [58]:
test_results.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 5259|
|  1.0|       0.0| 1105|
|  0.0|       1.0| 2726|
|  0.0|       0.0| 4449|
+-----+----------+-----+



In [59]:
acc_eval = MulticlassClassificationEvaluator()
test_results = test_results.withColumn("prediction", test_results["prediction"].cast('double'))
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting fake was: {}".format(acc))

Accuracy of model at predicting fake was: 0.7150055380174876


thuật toán linearSVC tỷ lệ chính xác cũng tương đối

