# <center> **502 Project** </center>
### <center> Team member: Chenxi Liu, Nuo Tian, Mengyu Liu, Yuan Liu </center>

### Data Selection: 
#### Yahoo News Data Set (Parts 1 and 2 of 35)
#### This part of the code extracts the data from S3 and cleans it for future modeling



In [2]:
# Spark environment setup.
# findspark.init() runs before the pyspark imports so that they can be resolved.
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session when one exists,
# otherwise it starts a new one under the given application name.
spark = (
    SparkSession.builder
    .appName("SparkSession")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and display it as the cell output.
sc = spark.sparkContext
sc

In [118]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Explicit schema for the tab-separated input: eight columns, in file order,
# each read as a nullable string.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "article_type",
        "np1",
        "np2",
        "context",
        "source",
        "category",
        "location",
        "time",
    )
])

In [119]:
# Load one tab-separated, header-less part file from S3 using the explicit schema.
df = spark.read.schema(schema).csv(
    "s3://anly502project/data/part-r-00080",
    sep="\t",
    header=False,
)

In [124]:
# Data schema: print the column names, types and nullability of the loaded DataFrame
df.printSchema()

root
 |-- article_type: string (nullable = true)
 |-- np1: string (nullable = true)
 |-- np2: string (nullable = true)
 |-- context: string (nullable = true)
 |-- source: string (nullable = true)
 |-- category: string (nullable = true)
 |-- location: string (nullable = true)
 |-- time: string (nullable = true)



In [123]:
# Preview the first 10 rows of the raw data
df.show(n=10)

+------------+------------+---+--------------------+------+--------------------+--------------------+-----+
|article_type|         np1|np2|             context|source|            category|            location| time|
+------------+------------+---+--------------------+------+--------------------+--------------------+-----+
|     article|0.86 percent|  1|arg2 basis point ...|      |             topstor|                   ,|14628|
|            |0.51 seconds|  1|arg2 minute and arg1|      |   localnews topstor| , rhode island u...|13901|
|     article|0.02 percent|  1|   arg2 cent or arg1|      |topstor,business,...| lakeland, florid...|14677|
|            |0.51 seconds|  1|arg2 minute and arg1|      |   localnews topstor| , rhode island u...|13902|
|     article|0.57 percent|  1|arg2 basis point ...|      |             topstor|                   ,|14746|
|     article|0.86 percent|  1|arg2 basis point ...|      |topstor,business,...|                   ,|14777|
|            |0.51 seconds| 

In [129]:
# Drop the columns that are not needed for modeling;
# np1, np2, context and category remain.
df = df.drop("source", "article_type", "location", "time")

In [130]:
# Preview the first 10 rows of the four remaining columns
df.show(n=10)

+------------+---+--------------------+--------------------+
|         np1|np2|             context|            category|
+------------+---+--------------------+--------------------+
|0.86 percent|  1|arg2 basis point ...|             topstor|
|0.51 seconds|  1|arg2 minute and arg1|   localnews topstor|
|0.02 percent|  1|   arg2 cent or arg1|topstor,business,...|
|0.51 seconds|  1|arg2 minute and arg1|   localnews topstor|
|0.57 percent|  1|arg2 basis point ...|             topstor|
|0.86 percent|  1|arg2 basis point ...|topstor,business,...|
|0.51 seconds|  1|arg2 minute and arg1|   localnews topstor|
|0.02 percent|  1|   arg2 cent or arg1|             topstor|
|0.02 percent|  1|   arg2 cent or arg1|             topstor|
|0.86 percent|  1|arg2 basis point ...|topstor,business,...|
+------------+---+--------------------+--------------------+
only showing top 10 rows



In [134]:
from pyspark.sql.functions import split

# A category value can be a comma-separated list; keep only its first entry.
split_col = split(df.category, ',')
df = df.withColumn(
    'category',
    split_col.getItem(0),
)

In [136]:
# Preview the first 10 rows after keeping only the first comma-separated category
df.show(n=10)

+------------+---+--------------------+-----------------+
|         np1|np2|             context|         category|
+------------+---+--------------------+-----------------+
|0.86 percent|  1|arg2 basis point ...|          topstor|
|0.51 seconds|  1|arg2 minute and arg1|localnews topstor|
|0.02 percent|  1|   arg2 cent or arg1|          topstor|
|0.51 seconds|  1|arg2 minute and arg1|localnews topstor|
|0.57 percent|  1|arg2 basis point ...|          topstor|
|0.86 percent|  1|arg2 basis point ...|          topstor|
|0.51 seconds|  1|arg2 minute and arg1|localnews topstor|
|0.02 percent|  1|   arg2 cent or arg1|          topstor|
|0.02 percent|  1|   arg2 cent or arg1|          topstor|
|0.86 percent|  1|arg2 basis point ...|          topstor|
+------------+---+--------------------+-----------------+
only showing top 10 rows



In [142]:
# Some category values still hold several space-separated labels
# (e.g. "localnews topstor"); keep only the first one.
split_col_2 = split(df.category, ' ')
df = df.withColumn(
    'category',
    split_col_2.getItem(0),
)

In [143]:
# Preview the first 10 rows after keeping only the first space-separated label
df.show(n=10)

+------------+---+--------------------+---------+
|         np1|np2|             context| category|
+------------+---+--------------------+---------+
|0.86 percent|  1|arg2 basis point ...|  topstor|
|0.51 seconds|  1|arg2 minute and arg1|localnews|
|0.02 percent|  1|   arg2 cent or arg1|  topstor|
|0.51 seconds|  1|arg2 minute and arg1|localnews|
|0.57 percent|  1|arg2 basis point ...|  topstor|
|0.86 percent|  1|arg2 basis point ...|  topstor|
|0.51 seconds|  1|arg2 minute and arg1|localnews|
|0.02 percent|  1|   arg2 cent or arg1|  topstor|
|0.02 percent|  1|   arg2 cent or arg1|  topstor|
|0.86 percent|  1|arg2 basis point ...|  topstor|
+------------+---+--------------------+---------+
only showing top 10 rows



In [145]:
# Keep only the part of the category that precedes the first underscore.
split_col_3 = split(df.category, '_')
df = df.withColumn(
    'category',
    split_col_3.getItem(0),
)

In [146]:
# Keep only the part of the category that precedes the first hyphen.
split_col_4 = split(df.category, '-')
df = df.withColumn(
    'category',
    split_col_4.getItem(0),
)

In [147]:
# Preview the first 10 rows once all delimiter-based trimming has been applied
df.show(n=10)

+------------+---+--------------------+---------+
|         np1|np2|             context| category|
+------------+---+--------------------+---------+
|0.86 percent|  1|arg2 basis point ...|  topstor|
|0.51 seconds|  1|arg2 minute and arg1|localnews|
|0.02 percent|  1|   arg2 cent or arg1|  topstor|
|0.51 seconds|  1|arg2 minute and arg1|localnews|
|0.57 percent|  1|arg2 basis point ...|  topstor|
|0.86 percent|  1|arg2 basis point ...|  topstor|
|0.51 seconds|  1|arg2 minute and arg1|localnews|
|0.02 percent|  1|   arg2 cent or arg1|  topstor|
|0.02 percent|  1|   arg2 cent or arg1|  topstor|
|0.86 percent|  1|arg2 basis point ...|  topstor|
+------------+---+--------------------+---------+
only showing top 10 rows



In [None]:
# Number of rows per cleaned category, ordered by that number (ascending).
# groupBy().count() names its result column "count", so the sort key is passed
# as a column-name string; a bare `count` is an undefined Python name in this
# notebook and would raise NameError.
df.groupBy("category").count().orderBy("count").show()