In [1]:
# heavylifting functions are abstracted and modularized in utils package
from utils.fetch_tweet import TweetFetcher
from utils.analyze_sentiment import SentimentAnalyzer
from utils.spark_initializer import SparkInitializer
# NOTE(review): presumably prepares the environment so that `pyspark` can be imported
# (which would explain why it runs before the pyspark imports below) -- confirm in
# utils.spark_initializer before reordering these lines.
SparkInitializer.init_spark()
from pyspark.sql.functions import from_json, col, udf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DateType, MapType, StringType, FloatType
from pprint import pprint

In [2]:
# Obtain the Spark session for this notebook (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName('TradeWar')
    .getOrCreate()
)

#### retrieve data source

In [3]:
# Keywords and media account handles passed to TweetFetcher below.
topic = [
    'china',
    'trade',
]
media = [
    'marketwatch',
    'wsj',
    'ft',
    'business',
    'theeconomist',
    'cnbc',
    'cnn',
]

In [4]:
# Fetch tweets for the selected media accounts and topic keywords.
media_tweets = TweetFetcher(
    media,
    topic,
).generate_tweets()

In [5]:
# Pretty-print the first fetched tweet to inspect its structure
# (the output below shows a nested, JSON-like dict).
pprint(media_tweets[0])

{'contributors': None,
 'coordinates': None,
 'created_at': 'Mon Dec 16 04:48:11 +0000 2019',
 'entities': {'hashtags': [],
              'symbols': [],
              'urls': [{'display_url': 'cnb.cx/2PNdcbn',
                        'expanded_url': 'https://cnb.cx/2PNdcbn',
                        'indices': [79, 102],
                        'url': 'https://t.co/VdK3hHi1wZ'}],
              'user_mentions': []},
 'favorite_count': 34,
 'favorited': False,
 'geo': None,
 'id': 1206435831914909697,
 'id_str': '1206435831914909697',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'en',
 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},
 'place': None,
 'possibly_sensitive': False,
 'retweet_count': 12,
 'retweeted': False,
 'source': '<a href="http://www.socialflow.com" rel="nofollow">SocialFlow</a>',
 'text': 'US and C

#### manipulate data using Spark

In [6]:
# Only three tweet fields are kept. `user` is a nested JSON object in the raw tweet,
# so it is modelled as a string-to-string map and individual keys are looked up later.
tweet_schema = (
    StructType()
    .add('created_at', StringType(), True)
    .add('user', MapType(StringType(), StringType()), True)
    .add('text', StringType(), True)
)

In [7]:
# Load the fetched tweets into a Spark DataFrame using the explicit schema.
sdf = spark.createDataFrame(
    data=media_tweets,
    schema=tweet_schema,
)

In [8]:
# preview the first 10 rows
sdf.show(n=10)

+--------------------+--------------------+--------------------+
|          created_at|                user|                text|
+--------------------+--------------------+--------------------+
|Mon Dec 16 04:48:...|[utc_offset ->, f...|US and China reac...|
|Mon Dec 16 04:32:...|[utc_offset ->, f...|The US-China trad...|
|Mon Dec 16 04:22:...|[utc_offset ->, f...|Asian markets lit...|
|Mon Dec 16 03:16:...|[utc_offset ->, f...|-U.S. and China a...|
|Mon Dec 16 03:09:...|[utc_offset ->, f...|-China’s offshore...|
|Sun Dec 15 21:39:...|[utc_offset ->, f...|China says it wil...|
|Sun Dec 15 20:40:...|[utc_offset ->, f...|So many of the br...|
|Sun Dec 15 18:44:...|[utc_offset ->, f...|China's exports t...|
|Sun Dec 15 18:29:...|[utc_offset ->, f...|That a deal exist...|
|Sun Dec 15 17:24:...|[utc_offset ->, f...|U.S. Trade Repres...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



In [9]:
# Schema describing the user fields of interest (id_str, name, screen_name).
# NOTE(review): this schema is not referenced by any later cell visible in this
# notebook -- the user name is extracted by map-key lookup instead. Presumably it was
# meant for a from_json-based flattening; confirm whether it is still needed.
user_schema = (
    StructType()
    .add('id_str', StringType(), True)
    .add('name', StringType(), True)
    .add('screen_name', StringType(), True)
)

In [10]:
# Check the column types; `user` should appear as a string-to-string map.
sdf.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- user: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- text: string (nullable = true)



In [11]:
# Look up the `name` key inside the `user` map and show it untruncated.
sdf.select(col('user')['name']).show(n=10, truncate=False)

+-------------+
|user[name]   |
+-------------+
|CNBC         |
|CNBC         |
|MarketWatch  |
|Bloomberg    |
|Bloomberg    |
|MarketWatch  |
|CNBC         |
|The Economist|
|The Economist|
|Bloomberg    |
+-------------+
only showing top 10 rows



In [12]:
# Promote the user's display name from the `user` map to its own `name` column.
sdf1 = sdf.withColumn(
    'name',
    col('user').getItem('name'),
)

In [13]:
# Keep only the columns needed for the sentiment analysis, in display order.
sdf2 = sdf1.select('name', 'text', 'created_at')

In [14]:
# preview the flattened frame
sdf2.show(n=10)

+-------------+--------------------+--------------------+
|         name|                text|          created_at|
+-------------+--------------------+--------------------+
|         CNBC|US and China reac...|Mon Dec 16 04:48:...|
|         CNBC|The US-China trad...|Mon Dec 16 04:32:...|
|  MarketWatch|Asian markets lit...|Mon Dec 16 04:22:...|
|    Bloomberg|-U.S. and China a...|Mon Dec 16 03:16:...|
|    Bloomberg|-China’s offshore...|Mon Dec 16 03:09:...|
|  MarketWatch|China says it wil...|Sun Dec 15 21:39:...|
|         CNBC|So many of the br...|Sun Dec 15 20:40:...|
|The Economist|China's exports t...|Sun Dec 15 18:44:...|
|The Economist|That a deal exist...|Sun Dec 15 18:29:...|
|    Bloomberg|U.S. Trade Repres...|Sun Dec 15 17:24:...|
+-------------+--------------------+--------------------+
only showing top 10 rows



In [15]:
# Instantiate the project's SentimentAnalyzer (imported from utils.analyze_sentiment);
# its `sentiment_score` method is wrapped as a Spark UDF in the next cell.
sa = SentimentAnalyzer()

In [16]:
# Wrap the analyzer's scoring method as a Spark user-defined function.
# NOTE(review): FloatType is 32-bit single precision, which is likely why the
# averages further down print with trailing digits (e.g. -0.05000000115897921);
# consider DoubleType if exact-looking values matter.
udf_sentscore = udf(
    f=sa.sentiment_score,
    returnType=FloatType(),
)

In [17]:
# Score every tweet's text and store the result in a new `sentiment_score` column.
sdf3 = sdf2.withColumn(
    'sentiment_score',
    udf_sentscore(sdf2['text']),
)

In [18]:
# preview the scored tweets
sdf3.show(n=10)

+-------------+--------------------+--------------------+---------------+
|         name|                text|          created_at|sentiment_score|
+-------------+--------------------+--------------------+---------------+
|         CNBC|US and China reac...|Mon Dec 16 04:48:...|         0.3818|
|         CNBC|The US-China trad...|Mon Dec 16 04:32:...|        -0.6369|
|  MarketWatch|Asian markets lit...|Mon Dec 16 04:22:...|            0.0|
|    Bloomberg|-U.S. and China a...|Mon Dec 16 03:16:...|         0.3612|
|    Bloomberg|-China’s offshore...|Mon Dec 16 03:09:...|         0.4767|
|  MarketWatch|China says it wil...|Sun Dec 15 21:39:...|            0.0|
|         CNBC|So many of the br...|Sun Dec 15 20:40:...|            0.0|
|The Economist|China's exports t...|Sun Dec 15 18:44:...|            0.0|
|The Economist|That a deal exist...|Sun Dec 15 18:29:...|         0.7003|
|    Bloomberg|U.S. Trade Repres...|Sun Dec 15 17:24:...|            0.0|
+-------------+--------------------+--

In [19]:
# Average sentiment score per media outlet.
# DataFrame.alias() only names the DataFrame itself (useful for self-joins); it does
# not rename the aggregate column, which stays `avg(sentiment_score)`. Rename that
# column explicitly so the result actually carries the intended `avg_sentiment` name.
(
    sdf3.groupby('name')
    .mean('sentiment_score')
    .withColumnRenamed('avg(sentiment_score)', 'avg_sentiment')
    .show()
)

+--------------------+--------------------+
|                name|avg(sentiment_score)|
+--------------------+--------------------+
|The Wall Street J...|-0.05000000115897921|
|         MarketWatch|-0.01277948438357...|
|     Financial Times| 0.07606666535139084|
|           Bloomberg|0.042573611045049295|
|                 CNN| 0.16906923055648804|
|                CNBC|   0.054294870640987|
|       The Economist|-0.12496363574808295|
+--------------------+--------------------+

