## Big Data Platforms - Winter 2023
## Final Project Education

### Twitter Analysis (Data Filtering)

#### Minh Vo

In [1]:
import os
import re
import time
import warnings
from itertools import compress
from itertools import islice

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# import sh
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Notebook-wide display and warning configuration.
# Do not truncate cell contents when pandas renders a frame.
pd.set_option('display.max_colwidth', None)
# Put the pandas row-display limit back to its default value.
pd.reset_option('display.max_rows')
# Suppress Python warnings for the rest of the session. This needs the
# `warnings` module to be imported explicitly rather than relying on a
# wildcard import to supply the name.
warnings.filterwarnings(action='ignore')

In [2]:
from google.cloud import storage

In [3]:
# GCS prefix that holds the raw tweet JSON part files
path = 'gs://msca-bdp-tweets/final_project'

In [4]:
!hadoop fs -ls "gs://msca-bdp-tweets/final_project" | tail -10

-rwx------   3 root root    9410904 2023-02-08 13:57 gs://msca-bdp-tweets/final_project/part-50685-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json
-rwx------   3 root root   13046317 2023-02-08 13:57 gs://msca-bdp-tweets/final_project/part-50686-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json
-rwx------   3 root root   10826130 2023-02-08 13:57 gs://msca-bdp-tweets/final_project/part-50687-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json
-rwx------   3 root root    9099590 2023-02-08 13:57 gs://msca-bdp-tweets/final_project/part-50688-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json
-rwx------   3 root root    9860829 2023-02-08 13:57 gs://msca-bdp-tweets/final_project/part-50689-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json
-rwx------   3 root root   11562361 2023-02-08 13:57 gs://msca-bdp-tweets/final_project/part-50690-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json
-rwx------   3 root root    9132693 2023-02-08 13:57 gs://msca-bdp-tweets/final_project/part-50691-aa6d3cb4-7022-4df2-9921-2

### Data Loading

In [5]:
# Turn on Spark's REPL eager evaluation setting; later cells end with a bare
# DataFrame expression (e.g. `.limit(5)`) and rely on it being rendered.
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

Load the collection of Twitter data, which consists of around 100 million Tweets (~500GB)

In [None]:
%%time
# Read every JSON file under `path` into one DataFrame. No schema is passed,
# so Spark derives it from the data.
twitter_df = spark.read.json(path)
# Count the rows of the full collection (a full pass over the data; see the
# wall time reported below).
twitter_df.count()

23/03/09 06:43:10 WARN org.apache.spark.scheduler.cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/03/09 06:43:25 WARN org.apache.spark.scheduler.cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/03/09 06:44:35 WARN org.apache.spark.sql.execution.datasources.SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
23/03/09 06:50:56 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

CPU times: user 2.84 s, sys: 525 ms, total: 3.36 s
Wall time: 12min 6s


                                                                                

99994342

In [None]:
#twitter_df.describe()

In [None]:
# Print the nested schema Spark derived for the full tweet collection
twitter_df.printSchema()

root
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |   

Because the full dataset is so large, I first load a single file as a sample to understand the data structure and conduct EDA.

### Sample Data

In [14]:
# Read a single JSON part file as a small sample, so the structure can be
# explored without touching the whole collection.
twitter_sample = spark.read.json(
    'gs://msca-bdp-tweets/final_project/part-50692-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json'
)
# Pull the first five rows into pandas for display.
twitter_sample.limit(5).toPandas()

Unnamed: 0,coordinates,created_at,display_text_range,entities,extended_entities,extended_tweet,favorite_count,favorited,filter_level,geo,...,retweet_count,retweeted,retweeted_from,retweeted_status,source,text,timestamp_ms,truncated,tweet_text,user
0,,Sun Jan 01 05:05:44 +0000 2023,,"([([19, 43], धरती_को_स्वर्ग_बनाना_है)], None, [], [], [(1393144504924839937, 1393144504924839937, [3, 17], Abhisek Jantar Mantar, AbhisekJantar), (91851084, 91851084, [44, 59], Sant Rampal Ji Maharaj, SaintRampalJiM)])",,,0,False,low,,...,0,RT,AbhisekJantar,"(Sun Jan 01 01:53:30 +0000 2023, [0, 140], ([Row(indices=[0, 24], text='धरती_को_स्वर्ग_बनाना_है')], None, [], [Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1609367179950174208', indices=[116, 139], url='https://t.co/mlNRpTNypS')], [Row(id=91851084, id_str='91851084', indices=[25, 40], name='Sant Rampal Ji Maharaj', screen_name='SaintRampalJiM')]), None, ([0, 262], ([Row(indices=[0, 24], text='धरती_को_स्वर्ग_बनाना_है')], [Row(additional_media_info=None, description=None, display_url='pic.twitter.com/dgY54f1Y6s', expanded_url='https://twitter.com/AbhisekJantar/status/1609367179950174208/photo/1', id=1609367170705940482, id_str='1609367170705940482', indices=[263, 286], media_url='http://pbs.twimg.com/media/FlWe6jsaYAIlV-r.jpg', media_url_https='https://pbs.twimg.com/media/FlWe6jsaYAIlV-r.jpg', sizes=Row(large=Row(h=1250, resize='fit', w=1000), medium=Row(h=1200, resize='fit', w=960), small=Row(h=680, resize='fit', w=544), thumb=Row(h=150, resize='crop', w=150)), source_status_id=None, source_status_id_str=None, source_user_id=None, source_user_id_str=None, type='photo', url='https://t.co/dgY54f1Y6s', video_info=None)], [], [], [Row(id=91851084, id_str='91851084', indices=[25, 40], name='Sant Rampal Ji Maharaj', screen_name='SaintRampalJiM')]), ([Row(additional_media_info=None, description=None, display_url='pic.twitter.com/dgY54f1Y6s', expanded_url='https://twitter.com/AbhisekJantar/status/1609367179950174208/photo/1', id=1609367170705940482, id_str='1609367170705940482', indices=[263, 286], media_url='http://pbs.twimg.com/media/FlWe6jsaYAIlV-r.jpg', media_url_https='https://pbs.twimg.com/media/FlWe6jsaYAIlV-r.jpg', 
sizes=Row(large=Row(h=1250, resize='fit', w=1000), medium=Row(h=1200, resize='fit', w=960), small=Row(h=680, resize='fit', w=544), thumb=Row(h=150, resize='crop', w=150)), source_status_id=None, source_status_id_str=None, source_user_id=None, source_user_id_str=None, type='photo', url='https://t.co/dgY54f1Y6s', video_info=None)],), #धरती_को_स्वर्ग_बनाना_है\n@SaintRampalJiM says, I can guarantee that if the school and college going youth are sent to my ashram for satsang even for one Sunday each month, then in a year's time they will give up on the western influences and dressing, and follow https://t.co/dgY54f1Y6s), 178, False, low, 1609367179950174208, 1609367179950174208, None, None, None, None, None, False, en, (([[[72.436739, 22.923256], [72.436739, 23.104662], [72.703725, 23.104662], [72.703725, 22.923256]]], Polygon), India, IN, Ahmadabad City, India, 272983f6b52c196e, Ahmadabad City, city, https://api.twitter.com/1.1/geo/id/272983f6b52c196e.json), False, 0, None, None, None, None, 7, 181, False, None, <a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>, #धरती_को_स्वर्ग_बनाना_है\n@SaintRampalJiM says, I can guarantee that if the school and college going youth are sent… https://t.co/mlNRpTNypS, True, (False, Fri May 14 10:02:20 +0000 2021, True, False, पूरब पश्चिम उत्तर दक्षिण, फ़िरता दाने दाने नूँ।\nसर्व कला सतगुरु साहेब की, हरि आये हरियाणे नूँ।।, 6635, 13236, 2565, True, 1393144504924839937, 1393144504924839937, False, 4, India, Abhisek Jantar Mantar, F5F8FA, , , False, https://pbs.twimg.com/profile_banners/1393144504924839937/1620987851, http://pbs.twimg.com/profile_images/1393549114269306884/UYPyPNVl_normal.jpg, https://pbs.twimg.com/profile_images/1393549114269306884/UYPyPNVl_normal.jpg, 1DA1F2, C0DEED, DDEEF6, 333333, True, False, AbhisekJantar, 5677, none, http://supremegod.org, False, []))","<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","RT @AbhisekJantar: 
#धरती_को_स्वर्ग_बनाना_है\n@SaintRampalJiM says, I can guarantee that if the school and college going youth are sent to my…",1672549544870,False,"#धरती_को_स्वर्ग_बनाना_है\n@SaintRampalJiM says, I can guarantee that if the school and college going youth are sent to my ashram for satsang even for one Sunday each month, then in a year's time they will give up on the western influences and dressing, and follow https://t.co/dgY54f1Y6s","(False, Thu Oct 21 03:26:33 +0000 2021, True, False, गरीब, सेवक होय कर उतरे, इस पृथ्वी के मांहि। जीव उधारण जगतगुरु, बार-बार बलि जांहि।।\n'LORD SANT RAMPAL JI MAHARAJ', 178045, 2671, 575, False, 1451026985963048966, 1451026985963048966, False, 0, Sonipat, Haryana, MANJEET ROHILLA //@ Follow Me, F5F8FA, , , False, https://pbs.twimg.com/profile_banners/1451026985963048966/1662986586, http://pbs.twimg.com/profile_images/1564178018897371136/vXOgHXrw_normal.jpg, https://pbs.twimg.com/profile_images/1564178018897371136/vXOgHXrw_normal.jpg, 1DA1F2, C0DEED, DDEEF6, 333333, True, False, MANJEET95715329, 184004, none, None, False, [])"
1,,Sun Jan 01 05:05:44 +0000 2023,"[17, 38]","([], [(None, None, pic.twitter.com/jlKoje7cFG, https://twitter.com/ko_kayi/status/1609415556373041152/photo/1, 1609415533123997696, 1609415533123997696, [39, 62], http://pbs.twimg.com/tweet_video_thumb/FlXK5nvaEAANufR.jpg, https://pbs.twimg.com/tweet_video_thumb/FlXK5nvaEAANufR.jpg, Row(large=Row(h=288, resize='fit', w=500), medium=Row(h=288, resize='fit', w=500), small=Row(h=288, resize='fit', w=500), thumb=Row(h=150, resize='crop', w=150)), None, None, None, None, photo, https://t.co/jlKoje7cFG)], [], [], [(3247181346, 3247181346, [0, 16], SoFloMan, Iamtherealpiman)])","([(None, None, pic.twitter.com/jlKoje7cFG, https://twitter.com/ko_kayi/status/1609415556373041152/photo/1, 1609415533123997696, 1609415533123997696, [39, 62], http://pbs.twimg.com/tweet_video_thumb/FlXK5nvaEAANufR.jpg, https://pbs.twimg.com/tweet_video_thumb/FlXK5nvaEAANufR.jpg, Row(large=Row(h=288, resize='fit', w=500), medium=Row(h=288, resize='fit', w=500), small=Row(h=288, resize='fit', w=500), thumb=Row(h=150, resize='crop', w=150)), None, None, None, None, animated_gif, https://t.co/jlKoje7cFG, Row(aspect_ratio=[125, 72], duration_millis=None, variants=[Row(bitrate=0, content_type='video/mp4', url='https://video.twimg.com/tweet_video/FlXK5nvaEAANufR.mp4')]))],)",,0,False,low,,...,0,,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@Iamtherealpiman Let’s go old school 🤭 https://t.co/jlKoje7cFG,1672549544791,False,@Iamtherealpiman Let’s go old school 🤭 https://t.co/jlKoje7cFG,"(False, Tue May 03 13:22:16 +0000 2022, True, False, None, 117, 262, 114, False, 1521478992506761216, 1521478992506761216, False, 0, None, Kokai, F5F8FA, , , False, https://pbs.twimg.com/profile_banners/1521478992506761216/1670420415, http://pbs.twimg.com/profile_images/1600485333073727488/IzYIUnH9_normal.jpg, https://pbs.twimg.com/profile_images/1600485333073727488/IzYIUnH9_normal.jpg, 1DA1F2, C0DEED, DDEEF6, 333333, True, False, 
ko_kayi, 209, none, None, False, [])"
2,,Sun Jan 01 05:05:45 +0000 2023,,"([], None, [], [], [])",,,0,False,low,,...,0,,,,"<a href=""https://cheapbotsdonequick.com"" rel=""nofollow"">Cheap Bots, Done Quick!</a>","What shall we talk about, professor? I've no opinion on tea so long as it's potable, but expensive leaves do pique my interest.",1672549545261,False,"What shall we talk about, professor? I've no opinion on tea so long as it's potable, but expensive leaves do pique my interest.","(False, Sun Feb 06 07:41:06 +0000 2022, True, False, A Linhardt quote bot, tweets every 30 minutes. Not spoiler-free. Does not respond., 0, 74, 6, False, 1490228974793871361, 1490228974793871361, False, 2, lovingly made by Rex♛✎ more →, Linhardt von Hevring, F5F8FA, , , False, https://pbs.twimg.com/profile_banners/1490228974793871361/1644133387, http://pbs.twimg.com/profile_images/1542691909725958144/AQc1_70v_normal.jpg, https://pbs.twimg.com/profile_images/1542691909725958144/AQc1_70v_normal.jpg, 1DA1F2, C0DEED, DDEEF6, 333333, True, False, Linhardtbot_, 15637, none, https://twitter.com/BylethBot/status/1542687049257365504?s=20&t=Ce9-SQoF3WJUq4bZv-8Zmw, False, [])"
3,,Sun Jan 01 05:05:45 +0000 2023,"[0, 140]","([], None, [], [(twitter.com/i/web/status/1…, https://twitter.com/i/web/status/1609415558050775040, [116, 139], https://t.co/GVXLlpDqw0)], [])",,"([0, 163], ([], [Row(additional_media_info=None, description=None, display_url='pic.twitter.com/65L9ODITNy', expanded_url='https://twitter.com/GeoffRhymer/status/1609415558050775040/photo/1', id=1609415553042505728, id_str='1609415553042505728', indices=[164, 187], media_url='http://pbs.twimg.com/media/FlXK6x8WYAACOgk.jpg', media_url_https='https://pbs.twimg.com/media/FlXK6x8WYAACOgk.jpg', sizes=Row(large=Row(h=900, resize='fit', w=1200), medium=Row(h=900, resize='fit', w=1200), small=Row(h=510, resize='fit', w=680), thumb=Row(h=150, resize='crop', w=150)), source_status_id=None, source_status_id_str=None, source_user_id=None, source_user_id_str=None, type='photo', url='https://t.co/65L9ODITNy', video_info=None)], [], [], []), ([Row(additional_media_info=None, description=None, display_url='pic.twitter.com/65L9ODITNy', expanded_url='https://twitter.com/GeoffRhymer/status/1609415558050775040/photo/1', id=1609415553042505728, id_str='1609415553042505728', indices=[164, 187], media_url='http://pbs.twimg.com/media/FlXK6x8WYAACOgk.jpg', media_url_https='https://pbs.twimg.com/media/FlXK6x8WYAACOgk.jpg', sizes=Row(large=Row(h=900, resize='fit', w=1200), medium=Row(h=900, resize='fit', w=1200), small=Row(h=510, resize='fit', w=680), thumb=Row(h=150, resize='crop', w=150)), source_status_id=None, source_status_id_str=None, source_user_id=None, source_user_id_str=None, type='photo', url='https://t.co/65L9ODITNy', video_info=None)],), Happy New Year 2023 YALL I Hope Everything You Inspire To Be Will Be FullFilled . Also This Is The Year I Will Be Graduating College!! \n\nI’m Claiming It!!! 
🙏🏾🙏🏾😆💯 https://t.co/65L9ODITNy)",0,False,low,,...,0,,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Happy New Year 2023 YALL I Hope Everything You Inspire To Be Will Be FullFilled . Also This Is The Year I Will Be… https://t.co/GVXLlpDqw0,1672549545191,True,Happy New Year 2023 YALL I Hope Everything You Inspire To Be Will Be FullFilled . Also This Is The Year I Will Be Graduating College!! \n\nI’m Claiming It!!! 🙏🏾🙏🏾😆💯 https://t.co/65L9ODITNy,"(False, Fri Jul 20 16:06:47 +0000 2012, False, False, #VI 🇻🇮 #BVI 🇻🇬 #VirginIslands #BritishVirginIslands #WWE #Scorpio #BlackLivesMatter ✊🏾, 206967, 7280, 8008, True, 707389129, 707389129, False, 87, British Virgin Islands | ATL, 𝕲𝖊𝖔𝖋𝖋 💯, 131516, http://abs.twimg.com/images/themes/theme14/bg.gif, https://abs.twimg.com/images/themes/theme14/bg.gif, False, https://pbs.twimg.com/profile_banners/707389129/1577747732, http://pbs.twimg.com/profile_images/1588035133449314305/GtFCBkyo_normal.jpg, https://pbs.twimg.com/profile_images/1588035133449314305/GtFCBkyo_normal.jpg, 2196F3, EEEEEE, EFEFEF, 333333, True, False, GeoffRhymer, 222200, none, None, False, [])"
4,,Sun Jan 01 05:05:46 +0000 2023,,"([], None, [], [], [])",,,0,False,low,,...,0,,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",how do you even show up to school after missing such a huge kick that badly,1672549546335,False,how do you even show up to school after missing such a huge kick that badly,"(False, Wed Mar 11 17:06:45 +0000 2020, True, False, #ElTri | #LevelUp | #WeAreTexans | #Rockets | i love the houston astros, 59998, 454, 530, False, 1237787021000802305, 1237787021000802305, False, 1, Houston, TX, ral💫, F5F8FA, , , False, https://pbs.twimg.com/profile_banners/1237787021000802305/1646957116, http://pbs.twimg.com/profile_images/1539843884443901952/Tl520Dta_normal.jpg, https://pbs.twimg.com/profile_images/1539843884443901952/Tl520Dta_normal.jpg, 1DA1F2, C0DEED, DDEEF6, 333333, True, False, MillsMissed, 9116, none, None, False, [])"


In [None]:
# Number of tweets in the single-file sample
twitter_sample.count()

3891

In [None]:
# Reference: https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet
#twitter_sample.columns

In [None]:
# Print summary statistics for the sample's columns
twitter_sample.describe().show()

                                                                                

+-------+--------------------+--------------+------------+--------------------+--------------------+-----------------------+---------------------+-------------------------+--------------------+-----------------------+----+-----------+--------------------+--------------------+--------------------+-----------+-------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|          created_at|favorite_count|filter_level|                  id|              id_str|in_reply_to_screen_name|in_reply_to_status_id|in_reply_to_status_id_str| in_reply_to_user_id|in_reply_to_user_id_str|lang|quote_count|    quoted_status_id|quoted_status_id_str|         quoted_text|reply_count|retweet_count|retweeted|      retweeted_from|              source|                text|        timestamp_ms|          tweet_text|
+-------+--------------------+--------------+------------+--------------------+--------------------+----------------------

#### Filtering Sample

In [39]:
# Reduce the sample to the columns that matter for the analysis.
# `user` and `place` carry many child fields, so only the relevant children
# are pulled out through dotted paths.
# NOTE(review): retweet_count is taken from inside `retweeted_status` here,
# whereas the full-data filter further down selects the top-level
# `retweet_count` -- confirm which one is intended.
filtered_sample = twitter_sample.select([
    'coordinates', 'favorite_count', 'filter_level', 'in_reply_to_screen_name',
    'retweeted', 'retweeted_from', 'retweeted_status', 'retweeted_status.retweet_count',
    'text',
    'place.country', 'place.full_name', 'place.place_type',
    'timestamp_ms',
    'user.id_str', 'user.name', 'user.screen_name', 'user.location',
    'user.description', 'user.followers_count', 'user.statuses_count',
    'user.created_at', 'user.verified',
    'lang',
])
filtered_sample.limit(5)

coordinates,favorite_count,filter_level,in_reply_to_screen_name,retweeted,retweeted_from,retweeted_status,retweet_count,text,country,full_name,place_type,timestamp_ms,id_str,name,screen_name,location,description,followers_count,statuses_count,created_at,verified,lang
,0,low,,RT,AbhisekJantar,{Sun Jan 01 01:53...,181.0,RT @AbhisekJantar...,,,,1672549544870,1451026985963048966,MANJEET ROHILLA /...,MANJEET95715329,"Sonipat, Haryana","गरीब, सेवक होय कर...",2671,184004,Thu Oct 21 03:26:...,False,en
,0,low,Iamtherealpiman,,,,,@Iamtherealpiman ...,,,,1672549544791,1521478992506761216,Kokai,ko_kayi,,,262,209,Tue May 03 13:22:...,False,en
,0,low,,,,,,What shall we tal...,,,,1672549545261,1490228974793871361,Linhardt von Hevring,Linhardtbot_,lovingly made by ...,A Linhardt quote ...,74,15637,Sun Feb 06 07:41:...,False,en
,0,low,,,,,,Happy New Year 20...,,,,1672549545191,707389129,𝕲𝖊𝖔𝖋𝖋 💯,GeoffRhymer,British Virgin Is...,#VI 🇻🇮 #BVI 🇻?...,7280,222200,Fri Jul 20 16:06:...,False,en
,0,low,,,,,,how do you even s...,,,,1672549546335,1237787021000802305,ral💫,MillsMissed,"Houston, TX",#ElTri | #LevelUp...,454,9116,Wed Mar 11 17:06:...,False,en


In [9]:
# Keywords used to flag a tweet as education-related
edu_words = [
    'education',
    'K-12',
    'teachers',
    'professors',
    'students',
    'university',
    'universities',
    'college',
    'colleges',
    'schools',
    'curriculum',
]

In [33]:
# Narrow the column-pruned sample to English tweets whose text matches at
# least one education keyword. The keywords are OR-ed into one regex.
# NOTE(review): no case-insensitive flag is set and the match is on
# substrings, so capitalised forms such as 'Education' are presumably
# skipped while 'college' already covers 'colleges' -- confirm this is intended.
filtered_sample2 = filtered_sample.filter(
    filtered_sample['text'].rlike('|'.join(edu_words))
    & (filtered_sample['lang'] == 'en')
)
filtered_sample2.limit(10)

                                                                                

coordinates,favorite_count,filter_level,in_reply_to_screen_name,retweeted,retweeted_from,retweeted_status,retweet_count,text,country,full_name,place_type,timestamp_ms,id_str,name,screen_name,location,description,followers_count,statuses_count,created_at,verified,lang
,0,low,,,,,,This is why I say...,,,,1662347999477,427393794,New Dawn,imageofanewdawn,"Buffalo, NY",making you look b...,644,56512,Sat Dec 03 13:34:...,False,en
,0,low,SheriShannon27,,,,,I didn’t even kno...,,,,1662347999786,1927596638,It’s ‘Shuh-ree’,SheriShannon27,"Chesterfield, VA",Director: @Shanno...,3147,19570,Wed Oct 02 17:27:...,False,en
,0,low,,,,,,Fire all the coll...,United States,"Rock Hill, SC",city,1662347999983,1341804844340965376,Zachary thompson,OGZACHO,"Rock Hill, SC","CLT living, gamec...",45,637,Wed Dec 23 17:56:...,False,en
,0,low,,,,,,I now love colleg...,,,,1662348000070,762004178977849344,Brandon,Brandon_Basniak,Cleveland,CSU grad 17’,346,26822,Sat Aug 06 19:15:...,False,en
,0,low,,RT,3YearLetterman,{Mon Sep 05 03:18...,52.0,RT @3YearLetterma...,,,,1662348000087,1433247287136997376,jakob gross,jakobgross01,,Follower of Chris...,31,384,Thu Sep 02 01:56:...,False,en
,0,low,,RT,DarenStoltzfus,{Mon Sep 05 03:19...,1.0,RT @DarenStoltzfu...,,,,1662348000126,1315274419,Jacob Morrison,jmorrison48,CHS ➡️ HTX,TWU DPT ‘24 UNCP ...,1960,18316,Fri Mar 29 21:24:...,False,en
,0,low,,,,,,How can you not l...,United States,"San Francisco, CA",city,1662348000259,304747760,Luis Wright,lululemonade0,"San Francisco, CA",San Francisco ‘18...,202,50379,Wed May 25 01:45:...,False,en
,0,low,,,,,,Gotta love colleg...,,,,1662348000767,3138639521,T.W. Arrighi,twarrighi,"Washington, DC",National Press Se...,1452,480,Sun Apr 05 22:27:...,False,en
,0,low,,,,,,Good ole goofy as...,,,,1662348000986,264533038,Nelson,NellyD15,"Mississippi, America",You can’t be play...,1439,49336,Sat Mar 12 02:08:...,False,en
,0,low,,,,,,I missed college ...,,,,1662348000998,528976747,Bob Vance,samdem95,Vance Refrigeration,I’m a five star man.,290,9027,Mon Mar 19 00:44:...,False,en


In [28]:
# Number of sample tweets left after the keyword and language filter
filtered_sample2.count()

1533

In [17]:
# Print summary statistics for the filtered sample's columns
filtered_sample2.describe().show()



+-------+--------------+------------+-----------------------+---------+-------------+--------------------+---------------------------+-------------+-------------+----------+--------------------+------------+-----------------+--------------------+-----------------+-----------------+--------------------+----+
|summary|favorite_count|filter_level|in_reply_to_screen_name|retweeted|retweet_count|      retweeted_from|                       text|      country|         name|place_type|              id_str| screen_name|         location|         description|  followers_count|   statuses_count|          created_at|lang|
+-------+--------------+------------+-----------------------+---------+-------------+--------------------+---------------------------+-------------+-------------+----------+--------------------+------------+-----------------+--------------------+-----------------+-----------------+--------------------+----+
|  count|          1533|        1533|                    230|     1533|  

                                                                                

### Filtering the Original Data and Saving It

In [10]:
%%time
filtered_df = twitter_df.filter((twitter_df.text.rlike('|'.join(edu_words))) & (twitter_df.lang == 'en'))\
                        .select(['coordinates','favorite_count','filter_level','in_reply_to_screen_name',\
                                 'retweeted','retweet_count','retweeted_from','retweeted_status','text',\
                                 'place.country','place.country_code','place.full_name','place.place_type','place.bounding_box',\
                                 'timestamp_ms',\
                                 'user.id_str','user.name','user.screen_name','user.location','user.description','user.followers_count','user.statuses_count','user.created_at','user.verified',\
                                 'lang'])
filtered_df.limit(5)

CPU times: user 18.2 ms, sys: 2.82 ms, total: 21 ms
Wall time: 382 ms


                                                                                

coordinates,favorite_count,filter_level,in_reply_to_screen_name,retweeted,retweet_count,retweeted_from,retweeted_status,text,country,country_code,full_name,place_type,bounding_box,timestamp_ms,id_str,name,screen_name,location,description,followers_count,statuses_count,created_at,verified,lang
,0,low,,RT,0,ShehanJeyarajah,"{null, Sun Nov 06...",RT @ShehanJeyaraj...,,,,,,1667839183155,1230250705484685313,Crash Davis,Danny_Bigmac13,"Durham, NC",I believe in soul...,1116,19016,Wed Feb 19 22:00:...,False,en
,0,low,thotmommyshit,,0,,,@thotmommyshit Ek...,,,,,,1667839183215,1118777709616947200,fountain pen user...,aakashav333,hehehehehe,,133,2031,Thu Apr 18 07:26:...,False,en
,0,low,,RT,0,abuga_makori,"{null, Sun Nov 06...",RT @abuga_makori:...,,,,,,1667839183665,1553285294077579265,BVM,Miriti_BVM,,I am a Branded Me...,91,1017,Sat Jul 30 07:44:...,False,en
,0,low,,RT,0,AliefISD,"{null, Sun Nov 06...",RT @AliefISD: All...,,,,,,1667839186746,784772184283582465,Stephanie Hooks,artcaptainhooks,,I am an artist an...,37,93,Sat Oct 08 15:07:...,False,en
,0,low,,RT,0,kaydubblu,"{null, Mon Nov 07...",RT @kaydubblu: @k...,,,,,,1667839186954,758403501470822400,N B Beamer🇺🇦Pro...,shareitarie10,"Iowa, USA",Blue in a red sta...,12599,602267,Wed Jul 27 20:47:...,False,en


In [None]:
# Number of tweets left in the full dataset after applying the filters

filtered_df.count()

                                                                                

24721729

In [None]:
# Persist the filtered full dataset as parquet so later work can read it back
# instead of re-filtering the raw JSON. Existing output at this location is
# replaced ('overwrite' mode).

save_path = 'gs://msca-bdp-students-bucket/shared_data/mdvo/BDP-Final-Project'
(
    filtered_df.write
    .format('parquet')
    .mode('overwrite')
    .save(save_path)
)

                                                                                

In [None]:
!hadoop fs -ls "gs://msca-bdp-students-bucket/shared_data/mdvo/BDP-Final-Project" | head -7

Found 5742 items
-rwx------   3 root root          0 2023-03-09 07:30 gs://msca-bdp-students-bucket/shared_data/mdvo/BDP-Final-Project/_SUCCESS
-rwx------   3 root root    2605946 2023-03-09 07:14 gs://msca-bdp-students-bucket/shared_data/mdvo/BDP-Final-Project/part-00000-59df6c97-c835-4c6d-8159-1dfe42cf74f5-c000.snappy.parquet
-rwx------   3 root root    2855860 2023-03-09 07:14 gs://msca-bdp-students-bucket/shared_data/mdvo/BDP-Final-Project/part-00001-59df6c97-c835-4c6d-8159-1dfe42cf74f5-c000.snappy.parquet
-rwx------   3 root root    2960462 2023-03-09 07:14 gs://msca-bdp-students-bucket/shared_data/mdvo/BDP-Final-Project/part-00002-59df6c97-c835-4c6d-8159-1dfe42cf74f5-c000.snappy.parquet
-rwx------   3 root root    2707638 2023-03-09 07:14 gs://msca-bdp-students-bucket/shared_data/mdvo/BDP-Final-Project/part-00003-59df6c97-c835-4c6d-8159-1dfe42cf74f5-c000.snappy.parquet
-rwx------   3 root root    3077704 2023-03-09 07:14 gs://msca-bdp-students-bucket/shared_data/mdvo/BDP-Final-Pr