In [0]:
import pyspark
from pyspark.sql.functions import col, avg, count

In [0]:
# Fetch the FiveThirtyEight COVID-19 concern-polls CSV from GitHub and read it
# into a Spark DataFrame. `sc` and `spark` are the SparkContext / SparkSession
# provided by the notebook environment.
datasource_url = "https://raw.githubusercontent.com/fivethirtyeight/covid-19-polls/refs/heads/master/covid_concern_polls.csv"

# addFile downloads the URL; SparkFiles.get resolves the local path of the
# downloaded copy by file name.
sc.addFile(datasource_url)
local_csv_path = pyspark.SparkFiles.get("covid_concern_polls.csv")
file_path = f"file://{local_csv_path}"

# The first row is the header; column types are inferred from the data.
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
    sep=",",
)

In [0]:
# View the first rows in a more readable (pandas) format.
# Limit on the Spark side first so that only five rows are collected to the
# driver, instead of converting the whole dataset just to display its head.
df.limit(5).toPandas()

Unnamed: 0,start_date,end_date,pollster,sponsor,sample_size,population,party,subject,tracking,text,very,somewhat,not_very,not_at_all,url
0,2020-01-27,2020-01-29,Morning Consult,,2202.0,a,all,concern-economy,False,How concerned are you that the coronavirus wil...,19.0,33.0,23.0,11.0,https://morningconsult.com/wp-content/uploads/...
1,2020-01-31,2020-02-02,Morning Consult,,2202.0,a,all,concern-economy,False,How concerned are you that the coronavirus wil...,26.0,32.0,25.0,7.0,https://morningconsult.com/wp-content/uploads/...
2,2020-02-02,2020-02-04,YouGov,Economist,1500.0,a,all,concern-infected,False,Taking into consideration both your risk of co...,13.0,26.0,43.0,18.0,https://d25d2506sfb94s.cloudfront.net/cumulus_...
3,2020-02-07,2020-02-09,Morning Consult,,2200.0,a,all,concern-economy,False,How concerned are you that the coronavirus wil...,23.0,32.0,24.0,9.0,https://morningconsult.com/wp-content/uploads/...
4,2020-02-07,2020-02-09,YouGov,Huffington Post,1000.0,a,all,concern-infected,False,How concerned are you that you or someone in y...,11.0,24.0,33.0,20.0,https://projects.fivethirtyeight.com/polls/202...


### Data Transformation

In [0]:
# Collect the full Spark DataFrame to the driver as a pandas DataFrame; the
# pandas-based transformations in the following cells operate on this copy.
pandas_df = df.toPandas()
# Display (rows, columns) as the cell output.
pandas_df.shape

Out[31]: (686, 15)

In [0]:
# Keep only the polls whose surveyed population is registered voters ("rv"),
# then display the (rows, columns) of the filtered frame.
is_registered_voter_poll = pandas_df["population"].eq("rv")
registered_voter_polls = pandas_df.loc[is_registered_voter_poll]
registered_voter_polls.shape

Out[32]: (214, 15)

In [0]:
# Preview the first five registered-voter polls; the original row index from
# pandas_df is preserved by the filter.
registered_voter_polls.head(5)

Unnamed: 0,start_date,end_date,pollster,sponsor,sample_size,population,party,subject,tracking,text,very,somewhat,not_very,not_at_all,url
12,2020-02-24,2020-02-26,Morning Consult,,1994.0,rv,all,concern-economy,False,How concerned are you that the coronavirus wil...,32.0,39.0,17.0,5.0,https://morningconsult.com/wp-content/uploads/...
16,2020-03-02,2020-03-03,PPP,Protect Our Care,866.0,rv,all,concern-economy,False,How concerned are you about the impact the cor...,36.0,36.0,21.0,6.0,https://www.protectourcare.org/wp-content/uplo...
17,2020-03-02,2020-03-03,PPP,Protect Our Care,866.0,rv,all,concern-infected,False,How concerned are you that you or someone in y...,24.0,33.0,30.0,12.0,https://www.protectourcare.org/wp-content/uplo...
19,2020-03-03,2020-03-05,Morning Consult,,1990.0,rv,all,concern-economy,False,How concerned are you that the coronavirus wil...,38.0,37.0,12.0,5.0,https://morningconsult.com/wp-content/uploads/...
22,2020-03-06,2020-03-08,Global Strategy Group/GBAO/Navigator Research,,1000.0,rv,all,concern-economy,False,"Below is a list of possible events. For each, ...",36.0,37.0,16.0,11.0,https://navigatorresearch.org/wp-content/uploa...


### PySpark SQL Queries

Let's first examine how many distinct pollsters have at least one poll with a sponsor (the filter `sponsor != ""` drops rows whose sponsor is missing):

In [0]:
# Number of distinct pollsters that appear in at least one row with a sponsor.
# Rows whose sponsor is null do not satisfy the `!= ""` comparison, so they are
# excluded along with empty-string sponsors.
(
    df.where(col("sponsor") != "")
    .select("pollster")
    .distinct()
    .count()
)

Out[16]: 22

In [0]:
# List, alphabetically, every distinct pollster that has at least one row with
# a sponsor.
sponsored_pollsters = (
    df.where(col("sponsor") != "")
    .select("pollster")
    .distinct()
    .orderBy("pollster")
)
# Use the total row count as the display limit so no pollster is cut off, and
# disable truncation so long pollster names are printed in full.
sponsored_pollsters.show(n=df.count(), truncate=False)

+----------------------------------+
|pollster                          |
+----------------------------------+
|ABC                               |
|AP-NORC                           |
|Change Research                   |
|Civiqs                            |
|Emerson College Polling Society   |
|Greenberg Quinlan Rosner          |
|Harris Poll                       |
|Hart Research Associates          |
|Ipsos                             |
|Langer Research Associates        |
|Léger                             |
|Morning Consult                   |
|NORC                              |
|PPP                               |
|Public Religion Research Institute|
|PureSpectrum                      |
|RMG Research                      |
|SSRS                              |
|Survey 160                        |
|SurveyMonkey                      |
|SurveyMonkey Audience             |
|YouGov                            |
+----------------------------------+



Next, I would like to examine how many records have a 'very concerned' share that is at least as large as the 'somewhat concerned' share (`very >= somewhat`). Note that the query below counts polls on every subject; it is not restricted to economy-related questions.

In [0]:
# Expose the Spark DataFrame to Spark SQL under the name `covid_polls`.
df.createOrReplaceTempView("covid_polls")

# Count the polls (all subjects) whose `very` value is at least as large as
# their `somewhat` value.
very_ge_somewhat_sql = """SELECT count(*) FROM covid_polls WHERE very >= somewhat;"""
result = spark.sql(very_ge_somewhat_sql)
result.show()

+--------+
|count(1)|
+--------+
|     332|
+--------+



Lastly, we can look at how the polls in which the 'very concerned' share is at least as large as the 'somewhat concerned' share are distributed across the surveyed population groups (which in this dataset are adults, registered voters, and likely voters):

In [0]:
# Per surveyed population, count the polls whose `very` value is at least as
# large as their `somewhat` value. The CASE expression maps the population
# codes to readable labels; the query reads the `covid_polls` temp view
# registered in an earlier cell.
population_breakdown_sql = """SELECT
    CASE population
        WHEN 'lv' THEN 'likely_voters'
        WHEN 'a' THEN 'adults'
        WHEN 'rv' THEN 'registered_voters'
    END AS population_category,
    COUNT(*) AS num_of_polls
    FROM covid_polls
    WHERE very >= somewhat GROUP BY population;"""
result = spark.sql(population_breakdown_sql)
result.show()

+-------------------+------------+
|population_category|num_of_polls|
+-------------------+------------+
|      likely_voters|          13|
|             adults|         183|
|  registered_voters|         136|
+-------------------+------------+



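It looks like, among polls in which at least as many respondents are very concerned as somewhat concerned, most surveyed adults (183), followed by registered voters (136) and then likely voters (13). Note that these are raw counts and the groups were not polled equally often — for example, 136 of the 214 registered-voter polls meet the condition — so the gap may largely reflect how many polls targeted each population rather than how concerned each group is. Whether any remaining difference between likely voters and the other two categories comes from life experience or social exposure would be a much more complicated issue to look into.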