# <center> **502 Project** </center>
### <center> Team member: Chenxi Liu, Nuo Tian, Mary Yu, Yuan Liu </center>

### Data Selection: 
#### Yahoo News Data Set (Part 1 and 2of 35) 

### STEP 1 : Data Extracting and Processing 

In [12]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder \
    .appName("project")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.4")\
    .config("spark.sql.broadcastTimeout", "36000")\
    .getOrCreate()

sc = spark.sparkContext
sc

In [13]:
spark

#### 1. Build Schema Using the README Instruction of the Data Set 
#### 

In [14]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline

In [15]:
import nltk
nltk.download('stopwords')
nltk.download('words')

from nltk.corpus import stopwords

eng_stopwords = stopwords.words('english')
eng_stopwords.append('xxxx')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [16]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("article_type", StringType(), True),
    StructField("np1", StringType(), True),
    StructField("np2", StringType(), True),
    StructField("context", StringType(), True),
    StructField("source", StringType(), True),
    StructField("category", StringType(), True),
    StructField("location", StringType(), True),
    StructField("time", StringType(), True),])

#### 2. Readin the Data Set 


In [17]:
df = spark.read.csv("s3://anly502project/data/part-r-00000",sep = "\t",header=False,schema=schema)
df_2 = spark.read.csv("s3://anly502project/data/part-r-00001",sep = "\t",header=False,schema=schema)

#### 3. Verify the Schema 
#### 

In [18]:
#### Data Schema
df.printSchema()
df_2.printSchema()

root
 |-- article_type: string (nullable = true)
 |-- np1: string (nullable = true)
 |-- np2: string (nullable = true)
 |-- context: string (nullable = true)
 |-- source: string (nullable = true)
 |-- category: string (nullable = true)
 |-- location: string (nullable = true)
 |-- time: string (nullable = true)

root
 |-- article_type: string (nullable = true)
 |-- np1: string (nullable = true)
 |-- np2: string (nullable = true)
 |-- context: string (nullable = true)
 |-- source: string (nullable = true)
 |-- category: string (nullable = true)
 |-- location: string (nullable = true)
 |-- time: string (nullable = true)



#### 4. Combine the Data Set
#### 

In [19]:
import functools 

def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs) 

In [20]:
unioned_df = unionAll([df, df_2])

In [21]:
#### show combined
unioned_df.show(10)

+------------+---------------+---+--------------------+-------------+--------------------+--------------------+-----+
|article_type|            np1|np2|             context|       source|            category|            location| time|
+------------+---------------+---+--------------------+-------------+--------------------+--------------------+-----+
|     article|    Dark Knight|  E|  arg1 and Wall arg2|             |intlnews topstor ...|      , kerala india|14299|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |topstor,health,sc...|                   ,|14660|
|     article|    Communities|  E|arg1 mobilised in...|             |    politics topstor|                   ,|14026|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |topstor,health,sc...|                   ,|14660|
|            |     Coast bias|  E|arg2 is for East ...|             |      sports topstor| columbus, ohio u...|13956|
|     article|Commerce office|  E|arg1 at DDD Linco...| 

#### 5. Select the very top category by using multiple split 
#### 

In [22]:
from pyspark.sql.functions import split
split_col = split(unioned_df['category'], ',')
unioned_df = unioned_df.withColumn('category', split_col.getItem(0))

split_col_2 = split(unioned_df['category'], ' ')
unioned_df = unioned_df.withColumn('category', split_col_2.getItem(0))

split_col_3 = split(unioned_df['category'], '_')
unioned_df = unioned_df.withColumn('category', split_col_3.getItem(0))

split_col_4 = split(unioned_df['category'], '-')
unioned_df = unioned_df.withColumn('category', split_col_4.getItem(0))

In [23]:
#### Show data frame after filtering the category
unioned_df.show(10)

+------------+---------------+---+--------------------+-------------+---------+--------------------+-----+
|article_type|            np1|np2|             context|       source| category|            location| time|
+------------+---------------+---+--------------------+-------------+---------+--------------------+-----+
|     article|    Dark Knight|  E|  arg1 and Wall arg2|             | intlnews|      , kerala india|14299|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |  topstor|                   ,|14660|
|     article|    Communities|  E|arg1 mobilised in...|             | politics|                   ,|14026|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |  topstor|                   ,|14660|
|            |     Coast bias|  E|arg2 is for East ...|             |   sports| columbus, ohio u...|13956|
|     article|Commerce office|  E|arg1 at DDD Linco...|             |  topstor| canton, ohio uni...|14363|
|     article| 75-minute mark|  E|arg

In [24]:
from pyspark.sql.functions import concat, col, lit

unioned_df1 = unioned_df.select(concat(col("np1"), lit(' '),col("context")).alias('context1'))
unioned_df1.show()

+--------------------+
|            context1|
+--------------------+
|Dark Knight arg1 ...|
|Carotenoids arg1 ...|
|Communities arg1 ...|
|Carotenoids arg1 ...|
|Coast bias arg2 i...|
|Commerce office a...|
|75-minute mark ar...|
|Brigham Circle ar...|
|Brigham Circle ar...|
|Brigham Circle ar...|
|Drill Sergeant ar...|
|Brigham Circle ar...|
|Brigham Circle ar...|
|Brigham Circle ar...|
|Brigham Circle ar...|
|Cold arg2 Cape co...|
|Cook arg1 Off is ...|
|Boob arg1 How my ...|
|2002 arg2 Street ...|
|Boob arg1 How my ...|
+--------------------+
only showing top 20 rows



In [25]:
from pyspark.sql.functions import monotonically_increasing_id

df1 = unioned_df.withColumn("id", monotonically_increasing_id())

df2 = unioned_df1.withColumn("id", monotonically_increasing_id())


df3 = df2.join(df1, "id", "outer").drop("id")

df3.show()

+--------------------+------------+--------------------+---+--------------------+--------------------+-------------+--------------------+-----+
|            context1|article_type|                 np1|np2|             context|              source|     category|            location| time|
+--------------------+------------+--------------------+---+--------------------+--------------------+-------------+--------------------+-----+
|Dane arg1 gain In...|     article|                Dane|  E|arg1 gain In Grou...|                    |     intlnews|                   ,|14223|
|Dubai arg2 arlier...|            |               Dubai|  E|arg2 arlier in th...|                    |    localnews| nampa, idaho uni...|13892|
|Circle arg2 past ...|     article|              Circle|  E|arg2 past Brigham...|                    |      topstor| boston, massachu...|14434|
|Champions League ...|     article|Champions League ...|  E|  arg1 in Group arg2|Yahoo! UK & Irela...| nationalnews|                   ,

#### 6. Category (label) insight 
#### 

In [26]:
#### Filter out the null and empty category 
df3 =df3.filter(df3.category.isNotNull())
df3 = df3.filter(df3.category != '')

In [27]:
import pyspark.sql as sql
count_df = df3.groupBy("category").count()
count_df.createOrReplaceTempView("count_df")
count_rank_df = spark.sql("SELECT category, count FROM count_df ORDER BY count DESC LIMIT 15")

#### 7. Using the SQL to filter the category (label)
#### 

In [28]:
count_rank_df.show(10)

+-------------+--------+
|     category|   count|
+-------------+--------+
|      topstor|85027590|
|    localnews|40174779|
|       sports|22925153|
|     business|20017107|
| nationalnews| 5073191|
|     intlnews| 4252106|
|    technolog| 2546612|
|entertainment| 2359067|
|     politics| 2092784|
|     lifestle| 1785076|
+-------------+--------+
only showing top 10 rows



In [29]:
df3.createOrReplaceTempView("unioned_df")
count_rank_df.createOrReplaceTempView("count_rank_df")
df_final = spark.sql("SELECT * FROM unioned_df WHERE unioned_df.category IN (SELECT category FROM count_rank_df)")

In [30]:
df_final.show(10)

+--------------------+------------+--------------------+---+--------------------+--------------------+------------+--------------------+-----+
|            context1|article_type|                 np1|np2|             context|              source|    category|            location| time|
+--------------------+------------+--------------------+---+--------------------+--------------------+------------+--------------------+-----+
|Dane arg1 gain In...|     article|                Dane|  E|arg1 gain In Grou...|                    |    intlnews|                   ,|14223|
|Dubai arg2 arlier...|            |               Dubai|  E|arg2 arlier in th...|                    |   localnews| nampa, idaho uni...|13892|
|Circle arg2 past ...|     article|              Circle|  E|arg2 past Brigham...|                    |     topstor| boston, massachu...|14434|
|Champions League ...|     article|Champions League ...|  E|  arg1 in Group arg2|Yahoo! UK & Irela...|nationalnews|                   ,|14119|

#### 7. Filter the dataset by categories 
#### 

In [31]:
categories = [
 'politics',
'science',
'health',
'technolog',
'entertainment']

In [32]:
df_final = df_final.filter(col('category').isin(categories))

In [33]:
df_final.show(10)

+--------------------+------------+--------------------+---+--------------------+--------------------+-------------+--------------------+-----+
|            context1|article_type|                 np1|np2|             context|              source|     category|            location| time|
+--------------------+------------+--------------------+---+--------------------+--------------------+-------------+--------------------+-----+
|Canada arg1 repre...|     article|              Canada|  E|arg1 represents D...|New Orleans Times...|     politics| new orleans, lou...|14288|
|Barrel arg1 Blast...|            |              Barrel|  E|arg1 Blast is rat...|                    |entertainment|    ,  united states|13794|
| 0157 arg2 coli arg1|            |                0157|  E|      arg2 coli arg1|                    |       health|                   ,|13840|
|ASH arg1 was rate...|     article|                 ASH|  E| arg1 was rated arg2|                    |entertainment|    ,  united states

In [34]:
df_final.createOrReplaceTempView("df4")
data = spark.sql('select category,concat(np1,"\n",context1) as text from df4')
print(type(data))
data.printSchema()
#data.show(10)

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- category: string (nullable = true)
 |-- text: string (nullable = true)



In [35]:
from pyspark.ml.feature import StringIndexer, IndexToString

indexer = StringIndexer(inputCol="category", outputCol="label")

In [36]:
si_label_fit = StringIndexer(inputCol ='category', outputCol ="label").fit(data).transform(data)

In [37]:
si_label_fit.show()

+-------------+--------------------+-----+
|     category|                text|label|
+-------------+--------------------+-----+
|     politics|Canada
Canada arg...|  2.0|
|entertainment|Barrel
Barrel arg...|  1.0|
|       health|0157
0157 arg2 co...|  3.0|
|entertainment|ASH
ASH arg1 was ...|  1.0|
|entertainment|Comedy
Comedy arg...|  1.0|
|       health|30
30 arg1 times ...|  3.0|
|entertainment|Coroner
Coroner a...|  1.0|
|entertainment|Alan Black
Alan B...|  1.0|
|entertainment|Beach
Beach arg2 ...|  1.0|
|    technolog|Chief Operating O...|  0.0|
|     politics|Conference
Confer...|  2.0|
|     politics|Cabinet Office
Ca...|  2.0|
|entertainment|Blender
Blender a...|  1.0|
|     politics|Commons
Commons a...|  2.0|
|entertainment|Beverly Hills hom...|  1.0|
|entertainment|Christmas gift
Ch...|  1.0|
|entertainment|Brandeis
Brandeis...|  1.0|
|entertainment|Chemical Brothers...|  1.0|
|entertainment|Hunter
Hunter arg...|  1.0|
|entertainment|News
News arg2 to...|  1.0|
+----------

#### 8. Build the pipeline
#### 

In [38]:
#labelConverter = IndexToString(inputCol='category', outputCol='label', labels = si_label_fit.labels)

In [39]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline


document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")
    
sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence") \
    .setUseAbbreviations(True)
    
tokenizer = Tokenizer() \
  .setInputCols(["sentence"]) \
  .setOutputCol("token")

stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")
    
normalizer = Normalizer() \
    .setInputCols(["stem"]) \
    .setOutputCol("normalized")

finisher = Finisher() \
    .setInputCols(["normalized"]) \
    .setOutputCols(["ntokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(True)

nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, stemmer, normalizer, finisher])

In [40]:
processed = nlp_pipeline.fit(si_label_fit).transform(si_label_fit)

In [41]:
train, test = processed.randomSplit(weights=[0.7, 0.3], seed=123)

In [42]:
from pyspark.ml import feature as spark_ft

stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')
sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens', outputCol='clean_tokens', stopWords=stopWords)
tf = spark_ft.CountVectorizer(vocabSize=500, inputCol='clean_tokens', outputCol='tf')
idf = spark_ft.IDF(minDocFreq=5, inputCol='tf', outputCol='idf')

feature_pipeline = Pipeline(stages=[sw_remover, tf, idf])
feature_model = feature_pipeline.fit(train)

train_featurized = feature_model.transform(train).persist()
#train_featurized.count()
train_featurized.show()

+-------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|     category|                text|label|             ntokens|        clean_tokens|                  tf|                 idf|
+-------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|entertainment|%
% arg1 of Oscar...|  1.0|[arg, of, oscar, ...|   [arg, oscar, arg]|     (500,[0],[2.0])|     (500,[0],[0.0])|
|entertainment|*Results
*Results...|  1.0|[result, result, ...|[result, result, ...|(500,[0,183,336],...|(500,[0,183,336],...|
|entertainment|---
--- arg1 with...|  1.0|[arg, with, it, arg]|          [arg, arg]|     (500,[0],[2.0])|     (500,[0],[0.0])|
|entertainment|.... Former Presi...|  1.0|[former, presid, ...|[former, presid, ...|(500,[0,5,26,107,...|(500,[0,5,26,107,...|
|entertainment|.com
.com arg2 on...|  1.0|[com, com, arg, o...|[com, com, arg, p...|     (500,[0],[2.0])|     (

In [43]:
train_featurized.printSchema()

root
 |-- category: string (nullable = true)
 |-- text: string (nullable = true)
 |-- label: double (nullable = false)
 |-- ntokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- clean_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- idf: vector (nullable = true)



#### 9. Fit the model
#### 

In [44]:
from pyspark.ml import classification as spark_cls

rf = spark_cls.RandomForestClassifier(labelCol="label", featuresCol="idf", numTrees=100)

model = rf.fit(train_featurized)

In [45]:
test_featurized = feature_model.transform(test)
preds = model.transform(test_featurized)
preds.show(10)

+-------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|     category|                text|label|             ntokens|        clean_tokens|                  tf|                 idf|       rawPrediction|         probability|prediction|
+-------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|entertainment|% guarantee
% gua...|  1.0|[guarante, guaran...|[guarante, guaran...|     (500,[0],[2.0])|     (500,[0],[0.0])|[29.6923665101603...|[0.29692366510160...|       0.0|
|entertainment|(909
(909 arg2 Ca...|  1.0|[arg, carolipio, ...|[arg, carolipio, ...|(500,[0,298],[2.0...|(500,[0,298],[0.0...|[29.6923665101603...|[0.29692366510160...|       0.0|
|entertainment|(916
(916 arg2 Mo...|  1.0|[arg, moor, at, arg]|    [arg, moor, arg]|     (500,[0],[2

In [46]:
pred_df = preds.select('text', 'label', 'prediction').toPandas()

In [47]:
pred_df

Unnamed: 0,text,label,prediction
0,% guarantee\n% guarantee arg1 a specific depar...,1.0,0.0
1,(909\n(909 arg2 Carolipio can be reached at arg1,1.0,0.0
2,(916\n(916 arg2 Moore at arg1,1.0,0.0
3,.com\n.com arg2 tells Usmagazine arg1,1.0,0.0
4,.net Cheap Africa phone card\n.net Cheap Afric...,1.0,0.0
...,...,...,...
2552316,users\nusers arg1 who test the fixes on arg2,0.0,0.0
2552317,users\nusers arg2 which may seem confusing to ...,0.0,0.0
2552318,victim\nvictim arg1 of domestic arg2,0.0,0.0
2552319,viewers\nviewers arg2 of options to our arg1,0.0,0.0


In [48]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluatorRF = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(preds)

print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.342433
Test Error = 0.657567
