In [15]:
import pandas as pd
import  matplotlib.pyplot as plt
from  datetime import datetime

now = datetime.now()
current_time = now.strftime("%Y-%m-%d %H:%M:%S")
print("Current Time:", current_time)

# Imports data from csv file
initial_df= pd.read_csv('../raw_data/raw_latest_polls.csv', parse_dates=['Start Date', 'End Date'], dayfirst=True)

Current Time: 2024-06-10 16:18:19


In [16]:
# Renames columns to match old CSV
initial_df.rename(columns={'Start Date': 'startdate'}, inplace=True)
initial_df.rename(columns={'End Date': 'enddate'}, inplace=True)
initial_df.rename(columns={'Area': 'countrycode'}, inplace=True)
initial_df.rename(columns={'Pollster': 'pollster'}, inplace=True)
initial_df.rename(columns={'Client': 'client'}, inplace=True)
initial_df.rename(columns={'Party': 'partycode'}, inplace=True)
initial_df.rename(columns={'Voting Intention': 'votingintention'}, inplace=True)


In [17]:
# Converts to datetime
initial_df['startdate'] = pd.to_datetime(initial_df['startdate'])
initial_df['enddate'] = pd.to_datetime(initial_df['enddate'])

In [18]:
initial_df['samplesize'] = 0

In [19]:
# Removes spaces and ampersands in pollster names
initial_df['pollster'] = initial_df['pollster'].str.replace(' ', '').str.replace('&', '').str.replace('-', '')

# Creates unique index for each poll
df_uuid = initial_df.set_index(initial_df['enddate'].dt.strftime('%Y-%m-%d').apply(str).str.replace('-', '_') + '_' + initial_df['pollster'])

# Pivots table to create column for each party
df = df_uuid.pivot_table(values="votingintention", index=[df_uuid.index,\
                                                                        'startdate', 'enddate', 'pollster', 'samplesize', 'countrycode'], columns=['partycode'])
df.reset_index(level=['startdate', 'enddate', 'pollster', 'samplesize', 'countrycode'], inplace=True)
df.reset_index(drop=True, inplace=True)


In [20]:
# Filters to after January 1, 1988
df = df[df['enddate'] > '2024-05-22']

In [21]:
df['countrycode'] = df['countrycode'].replace('Great Britain', 'GBR')

In [22]:
df = df[df['countrycode'] == 'GBR']


In [23]:
df['pollster'].value_counts()

pollster
YouGov                6
MoreinCommon          5
SavantaComRes         3
Survation             3
Techne                3
Redfieldamp;Wilton    3
Deltapoll             3
Opinium               3
Omnisis               3
JLPartners            2
LordAshcroft          2
BMG                   2
WhitestoneInsight     2
Focaldata             2
FindOutNow            1
Verian                1
IpsosMORI             1
Norstat               1
Name: count, dtype: int64

In [24]:
df = df.rename(columns={
    'Conservative': 'CON',
    'Labour': 'LAB',
    'The Brexit Party': 'BRX',
    'Liberal Democrats': 'LIB',
    'Green': 'GRE',
    'SNP': 'SNP',
    'Plaid Cymru': 'PLC',
    'UK Independence Party (UKIP)': 'UKI',
    'Other': 'OTH'
})

In [38]:
# Adds pollster rating, drawn from Election Data Vault
pollster_ratings = {
    'Populus': 'D+',
    'ICM': 'D+',
    'IpsosMORI': 'A-',
    'YouGov': 'A-',
    'SavantaComRes': 'B+',
    'BPIX': 'F',
    'AngusReid': 'F',
    'Harris': 'C-',
    'TNSBMRB': 'D',
    'Opinium': 'A-',
    'Survation': 'A-',
    'LordAshcroft': 'D-',
    'Panelbase': 'A-',
    'BMG': 'B',
    'ORB': 'D+',
    'Kantar': 'B+',
    'Deltapoll': 'D+',
    'NumberCruncherPolitics': 'D',
    'Focaldata': 'D+',
    'RedfieldWilton': 'D',
    'JLPartners': 'D',
    'FindOutNow': 'D',
    'Omnisis': 'D',
    'Techne': 'D',
    'PeoplePolling': 'D',
    'MoreinCommon': 'F',
    'MORI': 'B',
    'Marplan': 'D-',
    'NOP': 'D-',
    'Gallup': 'D+',
    'AudienceSelection': 'D-',
    'NMR:': 'F'
}
# Adds rating column
df['pollster_rating'] = df['pollster'].map(pollster_ratings)

In [27]:
old_polls = pd.read_csv('../processed_data/gb_polls.csv')

In [37]:
old_polls

Unnamed: 0.1,Unnamed: 0,startdate,enddate,pollster,pollster_rating,next_elec_date,days_until_next_election,BRX,CON,GRE,LAB,LIB,OTH,PLC,SNP,UKI,election_year
0,1390,1988-01-11,1988-01-11,Marplan,D-,1992-04-09,1550,,42.0,,40.0,17.0,,,,,1992
1,1391,1988-01-11,1988-01-11,NOP,D-,1992-04-09,1550,,47.0,,35.0,16.0,,,,,1992
2,1392,1988-01-18,1988-01-18,Gallup,D+,1992-04-09,1543,,46.0,,37.0,15.0,,,,,1992
3,1393,1988-01-26,1988-01-26,MORI,B,1992-04-09,1535,,50.0,,36.0,12.0,,,,,1992
4,1394,1988-02-08,1988-02-08,NOP,D-,1992-04-09,1522,,45.0,,37.0,14.0,,,,,1992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5704,41,,,MoreinCommon,F,2024-07-04,29,11.0,25.0,,46.0,9.0,,0.0,3.0,,2024
5705,42,,,Omnisis,D,2024-07-04,28,15.0,20.0,,45.0,10.0,,1.0,3.0,,2024
5706,43,,,Opinium,A-,2024-07-04,29,12.0,24.0,,42.0,10.0,,,3.0,,2024
5707,44,,,SavantaComRes,B+,2024-07-04,29,11.0,26.0,,46.0,10.0,,1.0,2.0,,2024


In [35]:
new_df = pd.concat([old_polls, df])

Unnamed: 0.1,Unnamed: 0,startdate,enddate,pollster,pollster_rating,next_elec_date,days_until_next_election,BRX,CON,GRE,...,election_year,samplesize,countrycode,Alliance,DUP,Green Party,SDLP,Sinn Féin,Traditional Unionist Voice – TUV,Ulster Unionist Party
0,1390.0,1988-01-11,1988-01-11,Marplan,D-,1992-04-09,1550.0,,42.0,,...,1992.0,,,,,,,,,
1,1391.0,1988-01-11,1988-01-11,NOP,D-,1992-04-09,1550.0,,47.0,,...,1992.0,,,,,,,,,
2,1392.0,1988-01-18,1988-01-18,Gallup,D+,1992-04-09,1543.0,,46.0,,...,1992.0,,,,,,,,,
3,1393.0,1988-01-26,1988-01-26,MORI,B,1992-04-09,1535.0,,50.0,,...,1992.0,,,,,,,,,
4,1394.0,1988-02-08,1988-02-08,NOP,D-,1992-04-09,1522.0,,45.0,,...,1992.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1943,,2024-06-05 00:00:00,2024-06-07 00:00:00,MoreinCommon,,,,11.0,25.0,,...,,0.0,GBR,,,6.0,,,,
1944,,2024-06-06 00:00:00,2024-06-07 00:00:00,Omnisis,,,,15.0,20.0,,...,,0.0,GBR,,,5.0,,,,
1945,,2024-06-05 00:00:00,2024-06-07 00:00:00,Opinium,,,,12.0,24.0,,...,,0.0,GBR,,,7.0,,,,
1947,,2024-06-05 00:00:00,2024-06-07 00:00:00,SavantaComRes,,,,11.0,26.0,,...,,0.0,GBR,,,3.0,,,,
