In [1]:
# heavy-lifting functions are abstracted and modularized in the utils package
from utils.fetch_tweet import TweetFetcher
from utils.analyze_sentiment import SentimentAnalyzer
from utils.spark_initializer import SparkInitializer
SparkInitializer.init_spark()
from pyspark.sql.functions import from_json, col, udf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DateType, MapType, StringType, FloatType

In [2]:
# Obtain the Spark session for this notebook: getOrCreate() returns the active
# session if one exists, otherwise it starts a new one named 'TradeWar'.
spark = (
    SparkSession.builder
    .appName('TradeWar')
    .getOrCreate()
)

#### retrieve data source

In [21]:
# Search inputs handed to TweetFetcher in the next cell.
# `topic` holds the keywords; `media` holds the outlet identifiers
# (presumably Twitter screen names — confirm against TweetFetcher).
topic = [
    'china',
    'trade',
]
media = [
    'marketwatch',
    'wsj',
    'ft',
    'business',
    'theeconomist',
    'cnbc',
    'cnn',
]

In [22]:
# Configure the fetcher with the media list and topic keywords, then pull the tweets.
tweet_fetcher = TweetFetcher(media, topic)
media_tweets = tweet_fetcher.generate_tweets()

In [25]:
# Inspect the first retrieved tweet. It displays as a nested, JSON-style dict
# with top-level keys such as 'created_at', 'text' and a nested 'user' dict.
media_tweets[0]

{'created_at': 'Sun Dec 15 21:39:37 +0000 2019',
 'id': 1206327979296612353,
 'id_str': '1206327979296612353',
 'text': 'China says it will put off tariff hike on U.S. autos, other goods following trade deal https://t.co/DrJ6G3o8TB',
 'truncated': False,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [{'url': 'https://t.co/DrJ6G3o8TB',
    'expanded_url': 'https://on.mktw.net/2Po956z',
    'display_url': 'on.mktw.net/2Po956z',
    'indices': [87, 110]}]},
 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},
 'source': '<a href="http://www.socialflow.com" rel="nofollow">SocialFlow</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 624413,
  'id_str': '624413',
  'name': 'MarketWatch',
  'screen_name': 'MarketWatch',
  'location': '',
  'description': 'News, personal finance & commentary from MarketWatch.',

#### manipulate data using Spark

In [41]:
# As the sample tweet shows, 'user' is a nested JSON object. It is modelled here
# as a string-to-string map so that individual user attributes can be looked up
# by key; 'created_at' and 'text' are kept as plain nullable strings.
tweet_schema = (
    StructType()
    .add('created_at', StringType(), True)
    .add('user', MapType(StringType(), StringType()), True)
    .add('text', StringType(), True)
)

In [28]:
# Build a Spark DataFrame from the fetched tweets, applying the explicit
# tweet_schema rather than letting Spark infer one.
sdf = spark.createDataFrame(media_tweets, schema=tweet_schema)

In [29]:
# Preview the first 10 rows (long cell values are truncated in the printout).
sdf.show(10)

+--------------------+--------------------+--------------------+
|          created_at|                user|                text|
+--------------------+--------------------+--------------------+
|Sun Dec 15 21:39:...|[utc_offset ->, f...|China says it wil...|
|Sun Dec 15 20:40:...|[utc_offset ->, f...|So many of the br...|
|Sun Dec 15 18:44:...|[utc_offset ->, f...|China's exports t...|
|Sun Dec 15 18:29:...|[utc_offset ->, f...|That a deal exist...|
|Sun Dec 15 17:24:...|[utc_offset ->, f...|U.S. Trade Repres...|
|Sun Dec 15 17:15:...|[utc_offset ->, f...|China calls off a...|
|Sun Dec 15 17:12:...|[utc_offset ->, f...|U.S.T.R Lighthize...|
|Sun Dec 15 16:52:...|[utc_offset ->, f...|RT @emma_newburge...|
|Sun Dec 15 15:23:...|[utc_offset ->, f...|Even though it wa...|
|Sun Dec 15 06:12:...|[utc_offset ->, f...|China's share of ...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



In [42]:
# Struct describing the user attributes of interest: id_str, name, screen_name.
# NOTE(review): no cell visible in this notebook references user_schema — the
# user's name is extracted from the `user` map by key instead. Confirm whether
# this schema is still needed.
user_schema = (
    StructType()
    .add('id_str', StringType(), True)
    .add('name', StringType(), True)
    .add('screen_name', StringType(), True)
)

In [43]:
# Check the resulting column types; `user` is expected to be a string-to-string map.
sdf.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- user: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- text: string (nullable = true)



In [32]:
# Look up the 'name' entry of the user map for each tweet and print the first
# 10 values without truncation.
sdf.select(col('user')['name']).show(10, truncate=False)

+-------------+
|user[name]   |
+-------------+
|MarketWatch  |
|CNBC         |
|The Economist|
|The Economist|
|Bloomberg    |
|CNN          |
|CNBC         |
|CNBC         |
|Bloomberg    |
|The Economist|
+-------------+
only showing top 10 rows



In [33]:
# Copy the 'name' entry of the user map into its own top-level column.
sdf1 = sdf.withColumn('name', col('user')['name'])

In [34]:
# Keep only the columns needed downstream (this drops the `user` map).
sdf2 = sdf1.select('name', 'text', 'created_at')

In [35]:
# Preview the flattened frame: outlet name, tweet text, creation timestamp.
sdf2.show(10)

+-------------+--------------------+--------------------+
|         name|                text|          created_at|
+-------------+--------------------+--------------------+
|  MarketWatch|China says it wil...|Sun Dec 15 21:39:...|
|         CNBC|So many of the br...|Sun Dec 15 20:40:...|
|The Economist|China's exports t...|Sun Dec 15 18:44:...|
|The Economist|That a deal exist...|Sun Dec 15 18:29:...|
|    Bloomberg|U.S. Trade Repres...|Sun Dec 15 17:24:...|
|          CNN|China calls off a...|Sun Dec 15 17:15:...|
|         CNBC|U.S.T.R Lighthize...|Sun Dec 15 17:12:...|
|         CNBC|RT @emma_newburge...|Sun Dec 15 16:52:...|
|    Bloomberg|Even though it wa...|Sun Dec 15 15:23:...|
|The Economist|China's share of ...|Sun Dec 15 06:12:...|
+-------------+--------------------+--------------------+
only showing top 10 rows



In [36]:
# Instantiate the project's custom SentimentAnalyzer (utils.analyze_sentiment);
# its sentiment_score method is what gets applied to each tweet's text.
sa = SentimentAnalyzer()

In [37]:
# Wrap the analyzer's scoring method as a Spark user-defined function so it can
# be applied to a DataFrame column.
# NOTE(review): FloatType is single precision. The per-outlet averages printed at
# the end of the notebook (e.g. -0.05000000115897921) look like float32 rounding;
# consider DoubleType if exact scores matter.
udf_sentscore = udf(f=sa.sentiment_score, returnType=FloatType())

In [44]:
# Score every tweet: run the sentiment UDF over the `text` column and store the
# result in a new `sentiment_score` column.
sdf3 = sdf2.withColumn(
    'sentiment_score',
    udf_sentscore(col('text')),
)

In [45]:
# Preview the scored tweets (first 10 rows).
sdf3.show(10)

+-------------+--------------------+--------------------+---------------+
|         name|                text|          created_at|sentiment_score|
+-------------+--------------------+--------------------+---------------+
|  MarketWatch|China says it wil...|Sun Dec 15 21:39:...|            0.0|
|         CNBC|So many of the br...|Sun Dec 15 20:40:...|            0.0|
|The Economist|China's exports t...|Sun Dec 15 18:44:...|            0.0|
|The Economist|That a deal exist...|Sun Dec 15 18:29:...|         0.7003|
|    Bloomberg|U.S. Trade Repres...|Sun Dec 15 17:24:...|            0.0|
|          CNN|China calls off a...|Sun Dec 15 17:15:...|            0.0|
|         CNBC|U.S.T.R Lighthize...|Sun Dec 15 17:12:...|            0.0|
|         CNBC|RT @emma_newburge...|Sun Dec 15 16:52:...|         0.1027|
|    Bloomberg|Even though it wa...|Sun Dec 15 15:23:...|         0.3818|
|The Economist|China's share of ...|Sun Dec 15 06:12:...|          0.296|
+-------------+--------------------+--

In [46]:
# Average sentiment score per media outlet.
# DataFrame.alias() names the DataFrame itself, not its columns, so the previous
# `.alias('avg_sentiment')` had no visible effect: the column was still headed
# `avg(sentiment_score)`. Rename the aggregated column explicitly instead.
(
    sdf3.groupby('name')
    .mean('sentiment_score')
    .withColumnRenamed('avg(sentiment_score)', 'avg_sentiment')
    .show()
)

+--------------------+--------------------+
|                name|avg(sentiment_score)|
+--------------------+--------------------+
|The Wall Street J...|-0.05000000115897921|
|         MarketWatch|-0.01311578660419...|
|     Financial Times| 0.07606666535139084|
|           Bloomberg|0.031371830739605595|
|                 CNN| 0.16906923055648804|
|                CNBC|0.059080262129244054|
|       The Economist|-0.12496363574808295|
+--------------------+--------------------+

