In [1]:
# Install the RealClearPolitics scraper that provides the `rcp` command used below.
# The %pip magic (rather than !pip) installs into the environment of the running
# kernel; the version is pinned so that re-running the notebook gets the same tool.
%pip install realclearpolitics==1.4.0



In [2]:
# Run the `rcp` command-line tool against the RealClearPolitics page for the
# 2020 Texas Trump-vs-Biden polls. Per the cell output it writes
# texas_trump_vs_biden-6818.csv, which the Spark load further down reads.
!rcp https://www.realclearpolitics.com/epolls/2020/president/tx/texas_trump_vs_biden-6818.html

Downloading: texas_trump_vs_biden-6818.csv
CSV created.


In [3]:
# List the working directory (long format, human-readable sizes) to confirm
# that the CSV produced by the previous cell is present.
!ls -lh

total 88K
-rw-r--r--. 1 jovyan  1000  34K Nov 15 18:05 polling_evaluation.ipynb
-rw-r--r--. 1 jovyan  1000  30K Nov 15 17:35 polling_evaluation.ipynb.orig
-rw-r--r--. 1 jovyan  1000 4.0K Nov 15 17:43 polling_evaluation.py
-rw-r--r--. 1 jovyan  1000 5.9K Nov 15 17:31 polling_evaluation.py.orig
-rw-r--r--. 1 jovyan  1000  340 Nov 12 06:23 README.md
drwxr-xr-x. 2 jovyan users    6 Nov 12 04:21 spark-warehouse
-rw-r--r--. 1 jovyan users 2.5K Nov 15 18:13 texas_trump_vs_biden-6818.csv


In [4]:
from pyspark.sql import SparkSession

In [5]:
# Create the Spark session for this notebook (or reuse one that already exists),
# running on the "local" master under the application name "polling_error".
spark = SparkSession.builder.master("local").appName("polling_error").getOrCreate()

In [6]:
# Read the scraped CSV. The first row supplies the column names, column types
# are inferred from the data, and FAILFAST makes the read raise on a malformed
# record instead of silently nulling it.
df = (
    spark.read
    .options(header=True, inferSchema=True, enforceSchema=False, mode='FAILFAST')
    .csv('texas_trump_vs_biden-6818.csv')
)

# Expose the raw rows to Spark SQL; every later stage is built from this view.
df.createOrReplaceTempView('tx_pres_polls_raw')

# Inferred schema of the raw view.
print('schema', '======', sep='\n')
raw_schema = spark.sql('''
    DESCRIBE tx_pres_polls_raw
''')
raw_schema.show(truncate=False)

# First few raw rows, untruncated, to see what still needs cleaning.
print('peek', '====', sep='\n')
raw_peek = spark.sql('''
    SELECT *
    FROM tx_pres_polls_raw
    LIMIT 10
''')
raw_peek.show(truncate=False)

schema
+---------+---------+-------+
|col_name |data_type|comment|
+---------+---------+-------+
|Poll     |string   |null   |
|Date     |string   |null   |
|Sample   |string   |null   |
|MoE      |string   |null   |
|Trump (R)|double   |null   |
|Biden (D)|double   |null   |
|Spread   |string   |null   |
+---------+---------+-------+

peek
====
+----------------------------------+-------------+-------+---+---------+---------+----------+
|Poll                              |Date         |Sample |MoE|Trump (R)|Biden (D)|Spread    |
+----------------------------------+-------------+-------+---+---------+---------+----------+
|Final Results                     |--           |--     |-- |52.2     |46.4     |Trump +5.8|
|RCP Average                       |10/20 - 10/31|--     |-- |47.8     |46.5     |Trump +1.3|
|EmersonEmerson                    |10/29 - 10/31|763 LV |3.5|49.0     |49.0     |Tie       |
|UMass Lowell*UMass Lowell*        |10/20 - 10/26|873 LV |4.2|48.0     |47.0     |Trump 

In [7]:
# Stage 1: rename every raw column to a snake_case name so later queries do
# not need backtick quoting. No values or types are changed here.
stage1 = spark.sql('''
    SELECT `Poll` as poll,
           `Date` as date_range,
           `Sample` as sample_size,
           `MoE` as margin_of_error,
           `Trump (R)` as trump,
           `Biden (D)` as biden,
           `Spread` as spread
    FROM tx_pres_polls_raw
''')
stage1.createOrReplaceTempView('tx_pres_polls_stage1')

# Schema of the renamed view.
print('schema', '======', sep='\n')
stage1_schema = spark.sql('''
    DESCRIBE tx_pres_polls_stage1
''')
stage1_schema.show(truncate=False)

# Sample rows of the renamed view.
print('peek', '====', sep='\n')
stage1_peek = spark.sql('''
    SELECT *
    FROM tx_pres_polls_stage1
    LIMIT 10
''')
stage1_peek.show(truncate=False)

schema
+---------------+---------+-------+
|col_name       |data_type|comment|
+---------------+---------+-------+
|poll           |string   |null   |
|date_range     |string   |null   |
|sample_size    |string   |null   |
|margin_of_error|string   |null   |
|trump          |double   |null   |
|biden          |double   |null   |
|spread         |string   |null   |
+---------------+---------+-------+

peek
====
+----------------------------------+-------------+-----------+---------------+-----+-----+----------+
|poll                              |date_range   |sample_size|margin_of_error|trump|biden|spread    |
+----------------------------------+-------------+-----------+---------------+-----+-----+----------+
|Final Results                     |--           |--         |--             |52.2 |46.4 |Trump +5.8|
|RCP Average                       |10/20 - 10/31|--         |--             |47.8 |46.5 |Trump +1.3|
|EmersonEmerson                    |10/29 - 10/31|763 LV     |3.5           

In [8]:
# Pull the 'Final Results' row out into its own view as the actual outcome.
# The spread text (e.g. 'Trump +5.8') is split on spaces: the first token is
# the winner (lower-cased) and the second is the margin, cast to double.
# A spread of 'Tie' has no numeric token, so it is mapped to 0.0.
results = spark.sql('''
    SELECT poll,
           '2020-11-03' as election_date,
           trump,
           biden,
           lower(split(spread, ' ', -1)[0]) as winner,
           CASE WHEN split(spread, ' ', -1)[0] = 'Tie'
                THEN 0.0
                ELSE cast(split(spread, ' ', -1)[1] as double) END AS spread
    FROM tx_pres_polls_stage1
    WHERE poll = 'Final Results'
''')
results.createOrReplaceTempView('tx_pres_results')

# Schema of the results view.
print('schema', '======', sep='\n')
results_schema = spark.sql('''
    DESCRIBE tx_pres_results
''')
results_schema.show(truncate=False)

# Contents of the results view.
print('peek', '====', sep='\n')
results_peek = spark.sql('''
    SELECT *
    FROM tx_pres_results
    LIMIT 10
''')
results_peek.show(truncate=False)

schema
+-------------+---------+-------+
|col_name     |data_type|comment|
+-------------+---------+-------+
|poll         |string   |null   |
|election_date|string   |null   |
|trump        |double   |null   |
|biden        |double   |null   |
|winner       |string   |null   |
|spread       |double   |null   |
+-------------+---------+-------+

peek
====
+-------------+-------------+-----+-----+------+------+
|poll         |election_date|trump|biden|winner|spread|
+-------------+-------------+-----+-----+------+------+
|Final Results|2020-11-03   |52.2 |46.4 |trump |5.8   |
+-------------+-------------+-----+-----+------+------+



In [9]:
# Stage 2: clean every row except 'Final Results'.
#   * sample_size: keep the leading number of values such as '763 LV'.
#   * margin_of_error: cast to double.
#   * winner / spread: split the spread text (e.g. 'Trump +1.3') into a
#     lower-cased winner and a numeric lead; 'Tie' has no number and becomes 0.0.
#   * is_winner_correct: compare the poll's winner with the winner stored in
#     tx_pres_results rather than a hard-coded candidate name, so the actual
#     outcome is defined in one place only.
spark.sql('''
    SELECT poll,
           date_range,
           cast(split(sample_size, ' ', -1)[0] as int) as sample_size,
           cast(margin_of_error as double) as margin_of_error,
           trump,
           biden,
           lower(split(spread, ' ', -1)[0]) as winner,
           lower(split(spread, ' ', -1)[0]) = (SELECT winner FROM tx_pres_results) as is_winner_correct,
           CASE WHEN split(spread, ' ', -1)[0] = 'Tie'
                THEN 0.0
                ELSE cast(split(spread, ' ', -1)[1] as double) END AS spread
    FROM tx_pres_polls_stage1
    WHERE poll != 'Final Results'
''').createOrReplaceTempView('tx_pres_polls_stage2')

# Schema of the cleaned view.
print('schema\n'
      '======')
spark.sql('''
    DESCRIBE tx_pres_polls_stage2
''').show(truncate=False)

# Sample rows of the cleaned view (default column truncation).
print('peek\n'
      '====')
spark.sql('''
    SELECT *
    FROM tx_pres_polls_stage2
    LIMIT 10
''').show()

schema
+-----------------+---------+-------+
|col_name         |data_type|comment|
+-----------------+---------+-------+
|poll             |string   |null   |
|date_range       |string   |null   |
|sample_size      |int      |null   |
|margin_of_error  |double   |null   |
|trump            |double   |null   |
|biden            |double   |null   |
|winner           |string   |null   |
|is_winner_correct|boolean  |null   |
|spread           |double   |null   |
+-----------------+---------+-------+

peek
====
+--------------------+-------------+-----------+---------------+-----+-----+------+-----------------+------+
|                poll|   date_range|sample_size|margin_of_error|trump|biden|winner|is_winner_correct|spread|
+--------------------+-------------+-----------+---------------+-----+-----+------+-----------------+------+
|         RCP Average|10/20 - 10/31|       null|           null| 47.8| 46.5| trump|             true|   1.3|
|      EmersonEmerson|10/29 - 10/31|        763|    

In [10]:
# Stage 3: for each candidate, add the absolute gap (rounded to one decimal)
# between the poll's number and the actual result. The actual values are read
# from tx_pres_results through scalar subqueries.
stage3 = spark.sql('''
    SELECT *,
           round(abs((SELECT trump from tx_pres_results) - trump), 1) as trump_distance,
           round(abs((SELECT biden from tx_pres_results) - biden), 1) as biden_distance
    FROM tx_pres_polls_stage2
''')
stage3.createOrReplaceTempView('tx_pres_polls_stage3')

# Schema including the two new distance columns.
print('schema', '======', sep='\n')
stage3_schema = spark.sql('''
    DESCRIBE tx_pres_polls_stage3
''')
stage3_schema.show(truncate=False)

# Sample rows (default column truncation).
print('peek', '====', sep='\n')
stage3_peek = spark.sql('''
    SELECT *
    FROM tx_pres_polls_stage3
    LIMIT 10
''')
stage3_peek.show()

schema
+-----------------+---------+-------+
|col_name         |data_type|comment|
+-----------------+---------+-------+
|poll             |string   |null   |
|date_range       |string   |null   |
|sample_size      |int      |null   |
|margin_of_error  |double   |null   |
|trump            |double   |null   |
|biden            |double   |null   |
|winner           |string   |null   |
|is_winner_correct|boolean  |null   |
|spread           |double   |null   |
|trump_distance   |double   |null   |
|biden_distance   |double   |null   |
+-----------------+---------+-------+

peek
====
+--------------------+-------------+-----------+---------------+-----+-----+------+-----------------+------+--------------+--------------+
|                poll|   date_range|sample_size|margin_of_error|trump|biden|winner|is_winner_correct|spread|trump_distance|biden_distance|
+--------------------+-------------+-----------+---------------+-----+-----+------+-----------------+------+--------------+-----------

In [11]:
# Final view: total_distance is the sum of the two per-candidate misses.
# trump_distance and biden_distance are already absolute values (stage 3
# computes them with abs()), so the sum needs no further abs(); it is rounded
# to one decimal place like its inputs.
spark.sql('''
    SELECT *,
           round(trump_distance + biden_distance, 1) as total_distance
    FROM tx_pres_polls_stage3
''').createOrReplaceTempView('tx_pres_polls')

# Schema of the final view.
print('schema\n'
      '======')
spark.sql('''
    DESCRIBE tx_pres_polls
''').show(truncate=False)

# Sample rows of the final view (default column truncation).
print('peek\n'
      '====')
spark.sql('''
    SELECT *
    FROM tx_pres_polls
    LIMIT 10
''').show()

schema
+-----------------+---------+-------+
|col_name         |data_type|comment|
+-----------------+---------+-------+
|poll             |string   |null   |
|date_range       |string   |null   |
|sample_size      |int      |null   |
|margin_of_error  |double   |null   |
|trump            |double   |null   |
|biden            |double   |null   |
|winner           |string   |null   |
|is_winner_correct|boolean  |null   |
|spread           |double   |null   |
|trump_distance   |double   |null   |
|biden_distance   |double   |null   |
|total_distance   |double   |null   |
+-----------------+---------+-------+

peek
====
+--------------------+-------------+-----------+---------------+-----+-----+------+-----------------+------+--------------+--------------+--------------+
|                poll|   date_range|sample_size|margin_of_error|trump|biden|winner|is_winner_correct|spread|trump_distance|biden_distance|total_distance|
+--------------------+-------------+-----------+---------------+--

In [12]:
# Re-display the actual election outcome as a reference point for the
# rankings in the following cells.
print('actual results', '==============', sep='\n')
actual_results = spark.sql('''
    SELECT *
    FROM tx_pres_results
    LIMIT 10
''')
actual_results.show(truncate=False)

actual results
+-------------+-------------+-----+-----+------+------+
|poll         |election_date|trump|biden|winner|spread|
+-------------+-------------+-----+-----+------+------+
|Final Results|2020-11-03   |52.2 |46.4 |trump |5.8   |
+-------------+-------------+-----+-----+------+------+



In [13]:
# The five rows with the smallest total_distance, i.e. the closest to the
# actual result, converted to pandas so the notebook renders a rich table.
print('top 5 polls', '===========', sep='\n')
closest_polls = spark.sql('''
    SELECT *
    FROM tx_pres_polls
    ORDER BY total_distance ASC
    LIMIT 5
''')
closest_polls.toPandas()

top 5 polls


Unnamed: 0,poll,date_range,sample_size,margin_of_error,trump,biden,winner,is_winner_correct,spread,trump_distance,biden_distance,total_distance
0,Texas LyceumTexas Lyceum,1/10 - 1/19,520,4.3,51.0,46.0,trump,True,5.0,1.2,0.4,1.6
1,UMass LowellUMLowell,9/18 - 9/25,882,4.3,50.0,46.0,trump,True,4.0,2.2,0.4,2.6
2,QuinnipiacQuinnipiac,9/17 - 9/21,1078,3.0,50.0,45.0,trump,True,5.0,2.2,1.4,3.6
3,Rasmussen ReportsRasmussen,10/5 - 10/6,1000,3.0,51.0,44.0,trump,True,7.0,1.2,2.4,3.6
4,University of HoustonU. of Houston,10/13 - 10/20,1000,3.1,50.0,45.0,trump,True,5.0,2.2,1.4,3.6


In [14]:
# The five rows with the largest total_distance, i.e. the furthest from the
# actual result, converted to pandas so the notebook renders a rich table.
print('bottom 5 polls', '==============', sep='\n')
furthest_polls = spark.sql('''
    SELECT *
    FROM tx_pres_polls
    ORDER BY total_distance DESC
    LIMIT 5
''')
furthest_polls.toPandas()

bottom 5 polls


Unnamed: 0,poll,date_range,sample_size,margin_of_error,trump,biden,winner,is_winner_correct,spread,trump_distance,biden_distance,total_distance
0,UT/Texas TribuneUT/ TX Tribune,10/18 - 10/27,1200,2.8,46.0,39.0,trump,True,7.0,6.2,7.4,13.6
1,Dallas Morning NewsDMN,4/18 - 4/27,1183,2.9,43.0,43.0,tie,False,0.0,9.2,3.4,12.6
2,QuinnipiacQuinnipiac,5/28 - 6/1,1166,2.9,44.0,43.0,trump,True,1.0,8.2,3.4,11.6
3,Dallas Morning NewsDMN,6/29 - 7/7,1677,2.4,43.0,48.0,biden,False,5.0,9.2,1.6,10.8
4,EmersonEmerson,5/8 - 5/10,800,3.4,47.0,41.0,trump,True,6.0,5.2,5.4,10.6
