In [94]:
import pandas as pd
import  matplotlib.pyplot as plt
from  datetime import datetime

now = datetime.now()
current_time = now.strftime("%Y-%m-%d %H:%M:%S")
print("Current Time:", current_time)

# Imports data from csv file
initial_df= pd.read_csv('../raw_data/raw_latest_polls.csv', parse_dates=['Start Date', 'End Date'], dayfirst=True)

Current Time: 2024-06-10 15:39:02


In [95]:
# Renames columns to match old CSV
initial_df.rename(columns={'Start Date': 'startdate'}, inplace=True)
initial_df.rename(columns={'End Date': 'enddate'}, inplace=True)
initial_df.rename(columns={'Area': 'countrycode'}, inplace=True)
initial_df.rename(columns={'Pollster': 'pollster'}, inplace=True)
initial_df.rename(columns={'Client': 'client'}, inplace=True)
initial_df.rename(columns={'Party': 'partycode'}, inplace=True)
initial_df.rename(columns={'Voting Intention': 'votingintention'}, inplace=True)


In [96]:
# Converts to datetime
initial_df['startdate'] = pd.to_datetime(initial_df['startdate'])
initial_df['enddate'] = pd.to_datetime(initial_df['enddate'])

In [97]:
initial_df['samplesize'] = 0

In [98]:
# Removes spaces and ampersands in pollster names
initial_df['pollster'] = initial_df['pollster'].str.replace(' ', '').str.replace('&', '').str.replace('-', '')

# Creates unique index for each poll
df_uuid = initial_df.set_index(initial_df['enddate'].dt.strftime('%Y-%m-%d').apply(str).str.replace('-', '_') + '_' + initial_df['pollster'])

# Pivots table to create column for each party
df = df_uuid.pivot_table(values="votingintention", index=[df_uuid.index,\
                                                                        'startdate', 'enddate', 'pollster', 'samplesize', 'countrycode'], columns=['partycode'])
df.reset_index(level=['startdate', 'enddate', 'pollster', 'samplesize', 'countrycode'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [99]:
# Filters to after January 1, 1988
df = df[df['enddate'] > '2024-05-22']

In [100]:
df['countrycode'] = df['countrycode'].replace('Great Britain', 'GBR')

In [101]:
df = df[df['countrycode'] == 'GBR']


In [102]:
df['pollster'].value_counts()

pollster
YouGov                6
MoreinCommon          5
SavantaComRes         3
Survation             3
Techne                3
Redfieldamp;Wilton    3
Deltapoll             3
Opinium               3
Omnisis               3
JLPartners            2
LordAshcroft          2
BMG                   2
WhitestoneInsight     2
Focaldata             2
FindOutNow            1
Verian                1
IpsosMORI             1
Norstat               1
Name: count, dtype: int64

In [87]:
df = df.rename(columns={
    'Conservative': 'CON',
    'Labour': 'LAB',
    'The Brexit Party': 'BRX',
    'Liberal Democrats': 'LIB',
    'Green': 'GRE',
    'SNP': 'SNP',
    'Plaid Cymru': 'PLC',
    'UK Independence Party (UKIP)': 'UKI',
    'Other': 'OTH'
})

In [103]:
df

partycode,startdate,enddate,pollster,samplesize,countrycode,Alliance,Conservative,DUP,Green Party,Labour,Liberal Democrats,Plaid Cymru,SDLP,SNP,Sinn Féin,The Brexit Party,Traditional Unionist Voice – TUV,UK Independence Party (UKIP),Ulster Unionist Party
1894,2024-05-22,2024-05-23,MoreinCommon,0,GBR,,27.0,,5.0,44.0,9.0,0.0,,3.0,,10.0,,,
1895,2024-05-22,2024-05-23,Techne,0,GBR,,19.0,,5.0,45.0,12.0,,,2.0,,14.0,,,
1896,2024-05-23,2024-05-24,Omnisis,0,GBR,,22.0,,6.0,47.0,8.0,,,3.0,,12.0,,,
1897,2024-05-23,2024-05-24,Opinium,0,GBR,,27.0,,7.0,41.0,10.0,,,2.0,,10.0,,,
1898,2024-05-23,2024-05-24,YouGov,0,GBR,,22.0,,6.0,44.0,9.0,0.0,,3.0,,14.0,,,
1899,2024-05-23,2024-05-25,Deltapoll,0,GBR,,23.0,,6.0,45.0,9.0,1.0,,3.0,,10.0,,,
1900,2024-05-24,2024-05-25,JLPartners,0,GBR,,28.0,,5.0,40.0,10.0,1.0,,3.0,,12.0,,,
1902,2024-05-24,2024-05-26,SavantaComRes,0,GBR,,27.0,,4.0,44.0,10.0,,,3.0,,8.0,,,
1903,2024-05-20,2024-05-27,FindOutNow,0,GBR,,19.0,,8.0,46.0,10.0,1.0,,3.0,,12.0,,,
1905,2024-05-25,2024-05-27,Redfieldamp;Wilton,0,GBR,,23.0,,5.0,46.0,9.0,1.0,,3.0,,13.0,,,


In [89]:
df.to_csv('../processed_data/processed_latest_polls.csv', index=True)