In [1]:
import os
import subprocess
import shutil
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import time
import pyspark
import re
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.feature import CountVectorizer,  IDF, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
pd.reset_option('display.max_rows')
from itertools import compress 
from pyspark.sql.functions import *
from pyspark.sql.types import *
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings(action='ignore')

In [2]:
spark = SparkSession.builder.appName("ReadJSONFile").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)
spark.conf.set("spark.sql.repl.eagerEval.maxNumRows", 10000)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/03/10 14:22:12 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
23/03/10 14:22:12 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
23/03/10 14:22:12 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
23/03/10 14:22:13 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [3]:
# check bucket to save files
!hadoop fs -ls 'gs://msca-bdp-students-bucket/shared_data/danilm/' 

Found 1 items
drwx------   - root root          0 2023-02-09 02:39 gs://msca-bdp-students-bucket/shared_data/danilm/temp


In [4]:
# investigated how many files are in the directory, and return first 5 files
!hadoop fs -ls 'gs://msca-bdp-tweets/final_project/' | head -n 6

Found 50696 items
-rwx------   3 root root          0 2023-02-08 13:58 gs://msca-bdp-tweets/final_project/_SUCCESS
-rwx------   3 root root    4500466 2023-02-08 13:44 gs://msca-bdp-tweets/final_project/part-00000-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json
-rwx------   3 root root    4107431 2023-02-08 13:44 gs://msca-bdp-tweets/final_project/part-00001-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json
-rwx------   3 root root    4672123 2023-02-08 13:44 gs://msca-bdp-tweets/final_project/part-00002-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json
-rwx------   3 root root    5186684 2023-02-08 13:44 gs://msca-bdp-tweets/final_project/part-00003-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json


In [5]:
# file size
!hadoop fs -du -s -h 'gs://msca-bdp-tweets/final_project/'

498.7 G  498.7 G  gs://msca-bdp-tweets/final_project


In [None]:
%%time
full_twtr_df = spark.read.json('gs://msca-bdp-tweets/final_project/')

23/03/10 14:23:28 WARN org.apache.spark.sql.execution.datasources.SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.

In [None]:
%%time
full_twtr_df.count()

[Stage 4:>                                                          (0 + 1) / 1]

CPU times: user 710 ms, sys: 173 ms, total: 883 ms
Wall time: 3min 5s


                                                                                

99994342

In [8]:
# printing the full schema
full_twtr_df.printSchema()

root
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |   

In [9]:
# identifying structures
full_twtr_df.columns

['coordinates',
 'created_at',
 'display_text_range',
 'entities',
 'extended_entities',
 'extended_tweet',
 'favorite_count',
 'favorited',
 'filter_level',
 'geo',
 'id',
 'id_str',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'is_quote_status',
 'lang',
 'place',
 'possibly_sensitive',
 'quote_count',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'quoted_status_permalink',
 'quoted_text',
 'reply_count',
 'retweet_count',
 'retweeted',
 'retweeted_from',
 'retweeted_status',
 'source',
 'text',
 'timestamp_ms',
 'truncated',
 'tweet_text',
 'user',
 'withheld_in_countries']

# Column Filtering Test

In [10]:
# Testing to find necessary columns for the analysis
limit_test_df = full_twtr_df.limit(3)

In [11]:
# Column List for Analysis
limit_test_df = limit_test_df.select(col('user.name'), col('user.screen_name'), col('user.verified'), col('user.friends_count'), col('user.followers_count'),col('user.location'), col('user.description'), col('user.favourites_count'),col('retweeted_status.reply_count'),col('retweeted_status.retweet_count'),col('user.created_at'),'tweet_text')

In [12]:
limit_test_df 

                                                                                

name,screen_name,verified,friends_count,followers_count,location,description,favourites_count,reply_count,retweet_count,created_at,tweet_text
Jenna🕊,Lovelykopf8,False,289,126,"Delaware, USA",#BLM ||| we accep...,19864,32,3596,Mon Aug 15 02:14:...,14 students kille...
Colton Potter,PotterColton,False,296,570,,BYU | BYUtv Sport...,5141,224,2814,Thu Jan 12 01:01:...,"""14 kids dead in ..."
22.06.10,BTSARMY60965931,False,108,15,Seoul,22/06/10,10667,1,12,Mon Jun 07 18:37:...,@ScottishSuzee @C...


# Filtering Rows Test

In [13]:
# Small testing sample
limit_test_df2 = full_twtr_df.limit(1000)

In [60]:
limit_test_df2.count()

                                                                                

1000

In [16]:
limit_test_df2.show(10)

[Stage 19:>                                                         (0 + 1) / 1]

+-----------+--------------------+------------------+--------------------+-----------------+--------------------+--------------+---------+------------+----+-------------------+-------------------+-----------------------+---------------------+-------------------------+-------------------+-----------------------+---------------+----+-----+------------------+-----------+-------------+----------------+--------------------+-----------------------+-----------+-----------+-------------+---------+--------------------+--------------------+--------------------+--------------------+-------------+---------+--------------------+--------------------+---------------------+
|coordinates|          created_at|display_text_range|            entities|extended_entities|      extended_tweet|favorite_count|favorited|filter_level| geo|                 id|             id_str|in_reply_to_screen_name|in_reply_to_status_id|in_reply_to_status_id_str|in_reply_to_user_id|in_reply_to_user_id_str|is_quote_status|lang|

                                                                                

In [19]:
# casting columns as appropriate data types
full_twtr_df = full_twtr_df.withColumn("tweet_text", full_twtr_df["tweet_text"].cast("string"))
full_twtr_df = full_twtr_df.withColumn('reply_count', col('retweeted_status.reply_count').cast(IntegerType()))
full_twtr_df = full_twtr_df.withColumn('retweet_count', col('retweeted_status.retweet_count').cast(IntegerType()))

In [55]:
# # Filtering by keywords in the tweet
# filt_key_words = ['education','school','learn','student','professor','college','university','curriculum','teach','classroom','e-learning','MOOC','academic','edtech','lifelong learning',
#                   'budget cuts','dropout','illiteracy','achievement gap','standardized tests','bullying','special needs', 'academia',
#                   'STEAM' ,'STEM','Science','Technology','Engineering','Arts','Mathematics','flipped classroom',
#                 'graduation','college readiness','job readiness','career advancement','workforce development','homeschooling','polytechnic']
# condition = "|".join(filt_key_words)

# # Filtering by one word strings and a combination of several strings for contextual relevancy
# test_filtered_df = limit_test_df2.filter(lower(col('tweet_text')).rlike(condition) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('civics')) |\
#                                          (lower(col('tweet_text')).rlike('school') & lower(col('tweet_text')).rlike('underfunded')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('inequality')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('innovation')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('modernization')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('technology integration')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('digital divide')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('mental health')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('financial literacy')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('substance abuse')) |\
#                                          (lower(col('tweet_text')).rlike('school') & lower(col('tweet_text')).rlike('substance abuse')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('engagement')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('parent involvement')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('achievement')) |\
#                                          (lower(col('tweet_text')).rlike('scholar') & lower(col('tweet_text')).rlike('achievement')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('progress')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('growth')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('proficiency')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('mastery')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('excellence')) |\
#                                          (lower(col('tweet_text')).rlike('student') & lower(col('tweet_text')).rlike('excellence')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('high performance')) |\
#                                          (lower(col('tweet_text')).rlike('student') & lower(col('tweet_text')).rlike('engagement')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('success rate')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('affordability')))

In [None]:
# filtering all tweets on the premise of the trend of climate change
filt_df = full_twtr_df.filter((lower(col('tweet_text')).contains('climate change')))

In [56]:
# count the number of filtered rows
test_filtered_df.count()

                                                                                

986

In [57]:
# # percent filtered out
# filt_percent = test_filtered_df.count()/limit_test_df2.count()
# print(f'Filtered Percent = {filt_percent}%')
# print(f'Filtered Out Percent = {1-filt_percent}%')



Filtered Percent = 0.991%
Filtered Out Percent = 0.009000000000000008%


                                                                                

In [58]:
# preview the querry output
test_filtered_df.select('tweet_text').limit(10).toPandas()

                                                                                

Unnamed: 0,tweet_text
0,"Democratic Socialism is best form of Govt. All western countries but US, provide healthcare, college, daycare, senior care. Yes maybe 5-10% more taxes but everyone goes to sleep at night w/o worrying about the basics. We've been brainwashed to worship unchecked capitalism."
1,I’m ready for school but I’m not ready for school … y’all know what I mean
2,"Nearly 400 “good guys” responded to the Uvalde shooting: 5 school officers, 25 Uvalde officers, 16 sheriff's deputies, neighboring law enforcement, 149 Border Patrol officers, 91 state police officers, 13 US Marshals and 8 federal DEA officers.\n\nThey couldn’t stop one bad guy."
3,"With the 17th pick in the #MLBDraft, the Phillies have selected Justin Crawford, an 18-year-old outfielder from Bishop Gorman High School (NV). https://t.co/MdbamfCV6y"
4,here's a thread of missing children from robb elementary school in texas\n\nif you see this thread you're obligated to retweet it.
5,LONDONERS will swelter through their hottest day on record this week as catastrophic climate change wreaks havoc across Europe. Schools will close as the UK grinds to a halt to cope with the heat as fires burn through Europe. Australian cities will soon experience 50 degree days.
6,"@TheSizzleReport DeVos shouldn’t be allowed near any school\n\nThe people DeVos was inciting, er, speaking to are minority but they’re a force\n\nDo yourself &amp; all the children in your municipality a solid &amp; know who you want &amp; really DON’T want to win local elections including school boards"
7,forgot to tell y’all but my mom told me she’s gonna get condoms for in my first aid box at school for “just in case”
8,@AlfonsoforMI4 @NathanOHaraPhD And I should have been popular in high school. \nI'm with you wishing this. https://t.co/C6EJA9ToMg
9,"@tedcruz @michaeljknowles Wow, that’s all you got, maybe discuss how Texas has a killing kids problem or grid problem or border problem or it’s the second worst place to in in the country! Womens rights gone, schools defunded, but no you want to talk about tacos! What the hell man 🤦‍♀️"


# Filtered Final DF

#### Filtering by rows

In [26]:
# casting columns as appropriate data types
full_twtr_df = full_twtr_df.withColumn("tweet_text", full_twtr_df["tweet_text"].cast("string"))
full_twtr_df = full_twtr_df.withColumn('reply_count', col('retweeted_status.reply_count').cast(IntegerType()))
full_twtr_df = full_twtr_df.withColumn('retweet_count', col('retweeted_status.retweet_count').cast(IntegerType()))

In [11]:
# # Filtering by keywords in the tweet
# filt_key_words = ['education','school','learn','student','professor','college','university','curriculum','teach','classroom','e-learning','MOOC','academic','edtech','lifelong learning',
#                   'budget cuts','dropout','illiteracy','achievement gap','standardized tests','bullying','special needs', 'academia',
#                   'STEAM' ,'STEM','Science','Technology','Engineering','Arts','Mathematics','flipped classroom',
#                 'graduation','college readiness','job readiness','career advancement','workforce development','homeschooling','polytechnic']
# condition = "|".join(filt_key_words)

# # Filtering by one word strings and a combination of several strings for contextual relevancy
# filt_df = full_twtr_df.filter(lower(col('tweet_text')).rlike(condition) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('civics')) |\
#                                          (lower(col('tweet_text')).rlike('school') & lower(col('tweet_text')).rlike('underfunded')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('inequality')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('innovation')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('modernization')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('technology integration')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('digital divide')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('mental health')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('financial literacy')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('substance abuse')) |\
#                                          (lower(col('tweet_text')).rlike('school') & lower(col('tweet_text')).rlike('substance abuse')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('engagement')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('parent involvement')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('achievement')) |\
#                                          (lower(col('tweet_text')).rlike('scholar') & lower(col('tweet_text')).rlike('achievement')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('progress')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('growth')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('proficiency')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('mastery')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('excellence')) |\
#                                          (lower(col('tweet_text')).rlike('student') & lower(col('tweet_text')).rlike('excellence')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('high performance')) |\
#                                          (lower(col('tweet_text')).rlike('student') & lower(col('tweet_text')).rlike('engagement')) |\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('success rate')) |\
#                                          (~col('user.location').rlike('None'))|\
#                                          (lower(col('tweet_text')).rlike('education') & lower(col('tweet_text')).rlike('affordability')))



In [None]:
# # Filtering by keywords in the tweet
# filt_key_words = ['education civics','school underfunded','modernizating education','education innovation','technology integration','university tuition','e-learning','academic advisor','edtech','lifelong learning',
#                   'school budget cuts', 'child literacy','dropout rate','illiteracy among children','achievement gap','standardized tests','school bullying','special needs education', 'scholorship',
#                   'steam' ,'stem','flipped classroom','graduation rate','college readiness','job readiness','career advising','homeschooling']
# condition = "|".join(filt_key_words)

# # Filtering by one word strings and a combination of several strings for contextual relevancy
# filt_df = full_twtr_df.filter(lower(col('tweet_text')).rlike(condition) | (~col('user.location').rlike('None')))

In [14]:
# # Filtering by one word strings and a combination of several strings for contextual relevancy
# filt_df = full_twtr_df.filter((lower(col('tweet_text')).contains('climate change'))&(lower(col('tweet_text')).contains('education')))

In [27]:
# filtering all tweets on the premise of the trend of climate change
filt_df = full_twtr_df.filter((lower(col('tweet_text')).contains('climate change')))

In [None]:
%%time
# count the number of filtered rows
filt_df.count()



CPU times: user 3.92 s, sys: 687 ms, total: 4.61 s
Wall time: 31min 25s


                                                                                

87313

In [None]:
# percent filtered out
filt_percent = filt_df.count()/full_twtr_df.count()
print(f'Filtered Percent = {filt_percent}%')
print(f'Filtered Out Percent = {1-filt_percent}%')

[Stage 39:>                                                         (0 + 1) / 1]

Filtered Percent = 0.0008731794044907061%
Filtered Out Percent = 0.9991268205955093%


                                                                                

In [None]:
# preview the querry output
filt_df.select('tweet_text').limit(10).toPandas()

                                                                                

Unnamed: 0,tweet_text
0,"If the WEF leaders cared about climate change, they would just meet over zoom, like they forced millions of children over the world to do when they pushed for school closures and locked them out of their classrooms during covid."
1,Teachers: climate change is real kids….also teachers: all schools should have air conditioning! 🤦‍♂️
2,"@DaniCar3782 @mariewalsh18 @RonniSalt He scares me to think he could get into power one day. He is only for spying, big business/small business but disabled, hospitals, One Nations people, schools, immigration, aged care, climate change, etc etc are never mentioned. Little moustache may help his profile."
3,"So Republicans are willing to spend billions of $$ to increase “security” at schools, but nothing to stop inflation, help those suffering from Covid, fight climate change, expand Medicare, improve education, etc. \n\nToday’s Republicans are way out of step with the American people."
4,"Not that we need further evidence of Biden’s incompetence, but if the Pres’ primary responsibility is the security of his citizens, his failure to secure the border and our schools is dereliction of duty. Climate change, free child care, forgiveness of student debt is all bull."
5,"So Republicans are willing to spend billions of $$ to increase “security” at schools, but nothing to stop inflation, help those suffering from Covid, fight climate change, expand Medicare, improve education, etc. \n\nToday’s Republicans are way out of step with the American people."
6,"So Republicans are willing to spend billions of $$ to increase “security” at schools, but nothing to stop inflation, help those suffering from Covid, fight climate change, expand Medicare, improve education, etc. \n\nToday’s Republicans are way out of step with the American people."
7,"U.N. Secretary-General Antonio Guterres said he was deeply concerned about the impacts of global climate change, during his Seton Hall University commencement address in South Orange, New Jersey https://t.co/1YoqFq4QsN"
8,The do nothing party. Tells you ignore the science. Climate change is a hoax. Vaccines are more dangerous to your liberty than COVID. And your children will be safe in school if we arm teachers to face shooters with assault weapons wearing body armor. Vote different.
9,Being schooled on climate change by a high school drop out. She arrived in America in a half million dollar sailboat. Those sailboats have diesel engines for flat calm seas and so they can cruise up to the dock under control. Hypocrisy.


#### Filtering by columns 

In [None]:
final_df = filt_df.select(col('user.name'), col('user.screen_name'), col('user.verified'), col('user.friends_count'), col('user.followers_count'),col('user.location'), col('user.description'), col('user.favourites_count'),col('retweeted_status.reply_count'),col('retweeted_status.retweet_count'),col('user.created_at'),'tweet_text')

In [None]:
final_df.limit(10).toPandas()

                                                                                

Unnamed: 0,name,screen_name,verified,friends_count,followers_count,location,description,favourites_count,reply_count,retweet_count,created_at,tweet_text
0,Joanne D,JoanneD66624776,False,428,135,,"Country girl raised in Ontario. Love the outdoors, campfires and fishing. Bookkeeper for 40 years. Conservative",14533,9.0,41.0,Mon Feb 15 19:51:44 +0000 2021,"If the WEF leaders cared about climate change, they would just meet over zoom, like they forced millions of children over the world to do when they pushed for school closures and locked them out of their classrooms during covid."
1,Ray Belfour,ray_belfour,False,615,405,,grumpy old conservative sick of these lefty do gooders,61986,5.0,13.0,Fri Jan 19 02:41:17 +0000 2018,Teachers: climate change is real kids….also teachers: all schools should have air conditioning! 🤦‍♂️
2,Veronica,wisteriameadow,False,3650,784,,,109805,0.0,1.0,Sun Dec 07 00:13:32 +0000 2014,"@DaniCar3782 @mariewalsh18 @RonniSalt He scares me to think he could get into power one day. He is only for spying, big business/small business but disabled, hospitals, One Nations people, schools, immigration, aged care, climate change, etc etc are never mentioned. Little moustache may help his profile."
3,SoCalCouple222,SCouple222,False,951,118,,Lookin’ in all the wrong places,16937,89.0,428.0,Sat Oct 23 17:11:07 +0000 2021,"So Republicans are willing to spend billions of $$ to increase “security” at schools, but nothing to stop inflation, help those suffering from Covid, fight climate change, expand Medicare, improve education, etc. \n\nToday’s Republicans are way out of step with the American people."
4,TheAnonymousSource,mgreenberg512,False,116,119,"The Drip, TX",New Media Maven - Blogger - Progressive Who Got Woke - Musicologist - Zionista,916,,,Fri May 01 18:15:50 +0000 2015,"Not that we need further evidence of Biden’s incompetence, but if the Pres’ primary responsibility is the security of his citizens, his failure to secure the border and our schools is dereliction of duty. Climate change, free child care, forgiveness of student debt is all bull."
5,Marie Van Pelt-Grier,weepelt,False,689,411,New Mexico,Progressive ideals. Tax the http://billionaires.Healthcare for all. Less money on wars..more on education. Against Trumpism/GOP. Pro Biden\nMy views are my own,72784,91.0,430.0,Mon Jun 15 05:53:34 +0000 2009,"So Republicans are willing to spend billions of $$ to increase “security” at schools, but nothing to stop inflation, help those suffering from Covid, fight climate change, expand Medicare, improve education, etc. \n\nToday’s Republicans are way out of step with the American people."
6,Media is compromised. GOP = Fascists 🇺🇦🌻,NanniMagee,False,8134,7399,,"🌊💙 BIDEN / HARRIS! ! An action a day, every single day to stop fascism. Bay Area, 👍🏼 please no DM. 🙏🏼 3rd party voters suck!! GOTV! dammit.",297094,91.0,431.0,Sat Jan 28 19:26:08 +0000 2017,"So Republicans are willing to spend billions of $$ to increase “security” at schools, but nothing to stop inflation, help those suffering from Covid, fight climate change, expand Medicare, improve education, etc. \n\nToday’s Republicans are way out of step with the American people."
7,MrGumede...,MrGumede91,False,411,8,,...,1,16.0,8.0,Thu May 05 17:51:29 +0000 2022,"U.N. Secretary-General Antonio Guterres said he was deeply concerned about the impacts of global climate change, during his Seton Hall University commencement address in South Orange, New Jersey https://t.co/1YoqFq4QsN"
8,Patrick Mauro,PMIA7,False,1693,1895,"Oak Brook, IL","Democrat. Vehemently oppose GOP of insurrection. trump & his mob must be held accountable for Jan 6 terror at Capitol. ProENVIR, gun ctl,&vote rts. USArmyvet.",38390,3.0,19.0,Fri Aug 01 12:15:30 +0000 2014,The do nothing party. Tells you ignore the science. Climate change is a hoax. Vaccines are more dangerous to your liberty than COVID. And your children will be safe in school if we arm teachers to face shooters with assault weapons wearing body armor. Vote different.
9,Micah,micah40732315,False,3771,6253,"Massachusetts, USA",Interested in everything,19207,,,Fri Jun 18 17:37:51 +0000 2021,Being schooled on climate change by a high school drop out. She arrived in America in a half million dollar sailboat. Those sailboats have diesel engines for flat calm seas and so they can cruise up to the dock under control. Hypocrisy.


#### Retriving a small to be used for analysis

In [None]:
# taking a sample of the full data to be used for analysis
final_df_sample = final_df.sample(False, 0.11, seed = 1)

In [35]:
%%time
final_df_sample.count()

[Stage 47:>                                                         (0 + 1) / 1]

CPU times: user 713 ms, sys: 198 ms, total: 911 ms
Wall time: 3min 46s


                                                                                

9567

# Saving the DFs

#### Saving the final df into a parque file

In [36]:
%%time
final_df.write.mode('overwrite').format('parquet').option("compression", "snappy").save('gs://msca-bdp-students-bucket/shared_data/danilm/final_df')

                                                                                

#### Saving the sample df into a parque file

In [None]:
%%time
final_df_sample.write.mode('overwrite').format('parquet').option("compression", "snappy").save('gs://msca-bdp-students-bucket/shared_data/danilm/final_df_sample')

                                                                                