In [3]:
import hashlib

import pandas as pd
import numpy as np

from mlgear.utils import show, display_columns
from survey_dud_detector import detect_straightlining, detect_low_incidence


def is_valid_prolific_id(pid):
    """Return True if ``pid`` is a string of exactly 24 hexadecimal digits.

    Args:
        pid: Candidate ID, normally a string read from the survey export.
            Non-string values (e.g. ``None`` or a float NaN produced by
            pandas for a missing cell) are rejected instead of raising.

    Returns:
        bool: True when ``pid`` is a 24-character hexadecimal string,
        False otherwise.
    """
    if not isinstance(pid, str) or len(pid) != 24:
        return False
    # Check digit by digit rather than via int(pid, 16): int() also accepts
    # a sign, a "0x" prefix, underscores and surrounding whitespace, none of
    # which are hexadecimal digits.
    return all(ch in '0123456789abcdefABCDEF' for ch in pid)


def sjoin(x):
    """Join the non-null entries of a pandas Series into one ';'-separated string.

    Args:
        x: pandas Series (for example one row when used through
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        str: The non-null values, each cast to ``str`` and joined with ';'
        in their original order. Empty string when nothing is non-null.
    """
    present = x[x.notnull()]
    as_text = present.astype(str)
    return ';'.join(as_text)


def transform_age(age):
    """Map a numeric age to its age-bracket label.

    Args:
        age: Age in years (any value comparable with numbers).

    Returns:
        str: 'Under 18', '18-34', '35-54', '55-64' or '65 or older'.
        A value that is not below any bracket bound (including NaN, for
        which every ``<`` comparison is False) yields '65 or older'.
    """
    # (exclusive upper bound, label), checked from youngest to oldest.
    brackets = (
        (18, 'Under 18'),
        (35, '18-34'),
        (55, '35-54'),
        (65, '55-64'),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return '65 or older'


def transform_income(inc):
    """Merge the two lowest-middle income brackets into a single bracket.

    Args:
        inc: Income answer as it appears in the survey export.

    Returns:
        'Between $15,000 and $49,999' for either of the two merged brackets;
        every other value is returned unchanged.
    """
    merged_brackets = ('Between $15,000 and $29,999', 'Between $30,000 and $49,999')
    if inc in merged_brackets:
        return 'Between $15,000 and $49,999'
    return inc


def transform_education(educ):
    """Collapse detailed education answers into broader categories.

    Args:
        educ: Education answer as it appears in the survey export.

    Returns:
        'Less than high school', 'Some college, no degree' or
        'Graduated from college' for the answers listed below; any other
        value is returned unchanged.
    """
    below_high_school = (
        'Did not attend school',
        '1st grade', '2nd grade', '3rd grade', '4th grade', '5th grade', '6th grade',
        '7th grade', '8th grade', '9th grade', '10th grade', '11th grade',
    )
    partial_college = ('1 year of college', '2 years of college', '3 years of college')

    if educ in below_high_school:
        label = 'Less than high school'
    elif educ in partial_college:
        label = 'Some college, no degree'
    elif educ == 'Some graduate school':
        label = 'Graduated from college'
    else:
        label = educ
    return label


def transform_race(race):
    """Recode a race answer into one of four reporting categories.

    Args:
        race: Race answer as it appears in the survey export.

    Returns:
        str: 'White, not Hispanic', 'Black, non-Hispanic', 'Hispanic', or
        'Other' for every value that is not one of the three known answers.
    """
    recoded = {
        'White or Caucasian': 'White, not Hispanic',
        'Black or African American': 'Black, non-Hispanic',
        'Hispanic or Latino': 'Hispanic',
    }
    return recoded.get(race, 'Other')


def transform_2016_vote(vote):
    """Reduce a 2016 vote answer to a candidate name, pooling minor options.

    Args:
        vote: Answer string; only the text before the first comma is used.

    Returns:
        str: 'Other' for Gary Johnson, Jill Stein or 'Another candidate';
        otherwise the text before the first comma.
    """
    pooled_as_other = ('Gary Johnson', 'Jill Stein', 'Another candidate')
    candidate = vote.split(',', 1)[0]
    return 'Other' if candidate in pooled_as_other else candidate


def transform_2020_vote(vote):
    """Reduce a 2020 vote answer to a candidate name.

    Args:
        vote: Answer string; only the text before the first comma is used.

    Returns:
        str: 'Other' when that text is 'Another candidate'; otherwise the
        text before the first comma.
    """
    candidate = vote.split(',', 1)[0]
    return 'Other' if candidate == 'Another candidate' else candidate


def transform_gss_trust(trust):
    """Shorten a GSS trust answer to its reporting label.

    The "can't be too careful" answer is recognised whether it is spelled
    with a typographic apostrophe (’) or a straight one ('), so the same
    answer cannot end up under two different labels.

    Args:
        trust: Answer as it appears in the survey export. Non-string values
            (e.g. NaN for a missing answer) are passed through.

    Returns:
        'Can\'t be too careful' or 'Can trust' for the two known answers;
        any other value is returned exactly as it was passed in.
    """
    # Compare on an apostrophe-normalised copy; the original value is what
    # gets returned when nothing matches.
    normalized = trust.replace('’', '\'') if isinstance(trust, str) else trust
    if normalized == 'You can\'t be too careful':
        return 'Can\'t be too careful'
    if normalized == 'Most people can be trusted':
        return 'Can trust'
    return trust
    

def transform_gss_bible(bible):
    """Shorten a GSS Bible answer to its reporting label.

    Args:
        bible: Answer as it appears in the survey export.

    Returns:
        'Book of fables', 'Word of God' or 'Inspired word' for the three
        full-sentence answers; any other value is returned unchanged.
    """
    short_labels = {
        'The Bible is an ancient book of fables, legends, history, and moral precepts recorded by man':
            'Book of fables',
        'The Bible is the actual word of God and it is to be taken literally, word for word':
            'Word of God',
        'The Bible is the inspired word of God but not everything should be taken literally, word for word':
            'Inspired word',
    }
    return short_labels.get(bible, bible)


def process_vote_method(method):
    """Recode a voting-method answer into the reporting categories.

    Args:
        method: Answer as it appears in the survey export.

    Returns:
        'Absentee by mail' when the text contains 'By mail';
        'Early vote in person' for 'In person' / 'In person before election day';
        'I will vote on election day' for 'In person on election day';
        'Don\'t know' for 'Other' or 'Don’t know';
        otherwise the value unchanged.
    """
    # Substring test on the string form, so longer answers that mention
    # 'By mail' are caught as well as the exact answer.
    if 'By mail' in str(method):
        return 'Absentee by mail'
    if method in ('In person', 'In person before election day'):
        return 'Early vote in person'
    if method == 'In person on election day':
        return 'I will vote on election day'
    if method in ('Other', 'Don’t know'):
        return 'Don\'t know'
    return method

    
def simplify_likert(likert):
    """Collapse a Likert answer onto an Agree / Disagree / Don't know scale.

    Args:
        likert: Answer string. Typographic apostrophes (’) are replaced with
            straight ones before matching.

    Returns:
        str: 'Agree' for 'Strongly agree', 'Disagree' for 'Strongly disagree',
        'Don\'t know' for either "Neither agree or/nor disagree" wording;
        otherwise the apostrophe-normalised answer.
    """
    collapsed = {
        'Strongly agree': 'Agree',
        'Strongly disagree': 'Disagree',
        'Neither agree or disagree': 'Don\'t know',
        'Neither agree nor disagree': 'Don\'t know',
    }
    answer = likert.replace('’', '\'')
    return collapsed.get(answer, answer)

In [4]:
# Load the raw online-sample export.
survey_online = pd.read_csv('online_sample.csv')

# Short analysis name -> column header in the CSV.
# Headers of the form 'Unnamed: N' are what pandas assigns to columns whose header cell is blank;
# the trailing comment on those entries records the answer option / statement the column holds.
# A header ending in '.1' is pandas' de-duplicated name for a second column with the same header text.
# NOTE(review): 'Unnamed: 32' is skipped between the boycott and "another reason" columns — confirm intentional.
variable_map = {'id': 'What is your Prolific ID?',
                'lv_thought': 'There are two runoff elections on January 5th that determine control of the Senate. How much thought have you given to these races?',
                'lv_eligible': 'Are you eligible to vote in the 2021 Georgia runoff election?',
                'lv_registered': 'Are you registered to vote in the 2021 Georgia runoff election?',
                'lv_plan': 'Do you plan to vote in the 2021 Georgia runoff election?',
                'voted': 'Have you already voted in the 2021 Georgia runoff election?',
                'vote_method_plan': 'How are you planning to vote?',
                'lv_likely': 'How likely are you to vote?',
                'vote_method': 'How did you vote?',
                'vote2016': 'In the 2016 Presidential election, who did you vote for?',
                'vote2020': 'In the 2020 Presidential election, who did you vote for?',
                'vote_ossoff_perdue': 'In the 2021 January Georgia regular election, who do you intend to vote for?',
                'vote_warnock_loeffler': 'In the 2021 January Georgia regular election, who do you intend to vote for?.1',
                'vote_ossoff_perdue2': 'In the 2021 January Georgia regular election, who did you vote for?',
                'vote_warnock_loeffler2': 'In the 2021 January Georgia special election, who did you vote for?',
                'not_voting_reason_ineligible': 'Which of these reasons explains why you are not voting in the 2021 January Georgia runoff elections? Please check all that apply.', # I’m not eligible to vote
                'not_voting_reason_uninterested': 'Unnamed: 25', # I’m not interested in voting
                'not_voting_reason_no_time': 'Unnamed: 26', # I don’t have enough time to vote
                'not_voting_reason_too_hard': 'Unnamed: 27', # Voting is too hard / complicated
                'not_voting_reason_knowledge': 'Unnamed: 28', # I don’t feel like I know enough to vote
                'not_voting_reason_no_like': 'Unnamed: 29', # I don’t like any of the candidates
                'not_voting_reason_rigged': 'Unnamed: 30', # The voting process is rigged and won't count my vote fairly
                'not_voting_reason_boycott': 'Unnamed: 31', # I am intentionally boycotting the runoff election
                'not_voting_reason_other': 'Unnamed: 33', # Another reason
                'enthusiasm': 'Compared to previous elections, are you more enthusiastic than usual about voting, or less enthusiastic?',
                'rigged_who_actually_won': 'If all legal votes were fairly counted, who do you think actually won the 2020 Presidential election in the state of Georgia?',
                'rigged_vote_counts': 'Do you trust the 2021 Georgia runoff election to accurately count your vote?',
                'rigged_trump_concede': 'Should Donald Trump concede and admit that he lost the 2020 Presidential election?',
                'gss_trust': "Generally speaking, would you say that most people can be trusted or that you can't be too careful in dealing with people?",
                'gss_bible': 'Which of these statements comes closest to describing your feelings about the Bible?',
                'gss_spanking': 'How much do you agree or disagree with the following?', # It is sometimes necessary to discipline a child with a good, hard spanking
                'birth_control': 'Unnamed: 41', # Birth control is morally wrong
                'factory_farming_ban': 'Unnamed: 42', # I support a ban on factory farming
                'slaughterhouse_ban': 'Unnamed: 43', # I support a ban on slaughterhouses
                'vegetarian_imperative': 'Unnamed: 44', # It is morally important to be vegetarian
                'covid_positive': 'Have you ever tested positive for COVID?',
                'covid_know_dead': 'Unnamed: 46', # Do you know anyone who has died because of COVID?
                'covid_trouble_bills': 'Unnamed: 47', # Have you or your family had trouble paying bills or rent because of COVID?
                'covid_reduced_work': 'Unnamed: 48', # Have you or anyone in your family lost their job or reduced their work hours due to COVID?
                'social_fb': 'Which of these social media networks do you use?', # Facebook
                'gender': 'What is your gender?',
                'birth_year': 'In which year were you born? Please write your answer as 4 digits only',
                'race': 'What is your race?',
                'education': 'What is the highest level of education you have completed?',
                'income': 'What is your annual income?',
                'urban_rural': 'Which of the following best describes the area in which you live?',
                'loc_county': 'Which of these counties do you live in?',
                'loc_north_south': 'Would you say you live closer to Florida or closer to Tennessee?',
                'loc_east_west': 'Would you say you live closer to Alabama or closer to South Carolina?',
                'honesty': 'How honestly have you answered these questions? People depend on the honesty of your answers - if you admit to being dishonest, you will still be paid.'}

print('Processing columns...')
# Strip non-breaking spaces from the headers so they match the plain-text keys above.
survey_online.columns = [c.replace('\xa0', '') for c in survey_online.columns]
# Invert the map in place to CSV header -> short name. Because variable_map is overwritten,
# re-running this cell's tail without re-running the literal above would invert it back.
variable_map = {v: k for k, v in variable_map.items()}
# Keep only the mapped columns (in map order) and give them their short names.
survey_online = survey_online[variable_map.keys()].rename(variable_map, axis=1)
# Drop the row labelled 0 — presumably a sub-header row carrying the answer-option text
# for the 'Unnamed: N' columns; TODO confirm against the raw export.
survey_online = survey_online.drop(0)
# Tag every row with its collection mode so the two samples can be told apart after merging.
survey_online['survey_method'] = 'Online'

print('Processing age...')
# Cast birth year to float so it can be subtracted.
survey_online['birth_year'] = survey_online['birth_year'].astype(float)
# Age is computed relative to 2020. A missing birth year is filled with 2020, giving age 0,
# which transform_age below labels 'Under 18'.
survey_online['age'] = (2020 - survey_online['birth_year'].fillna(2020)).astype(float)
survey_online = survey_online.drop('birth_year', axis=1)
# Replace the numeric age with its bracket label.
survey_online['age'] = survey_online['age'].apply(transform_age)

print('Processing education...')
survey_online['education'] = survey_online['education'].apply(transform_education)

print('Processing ID...')
# Replace the raw Prolific ID with 'prolific_' + its SHA-224 hex digest.
# NOTE(review): x.encode() requires a string; a missing (NaN) ID would raise here.
survey_online['id'] = survey_online['id'].apply(lambda x: 'prolific_{}'.format(hashlib.sha224(x.encode()).hexdigest()))

print('Processing vote method...')
# Recode both the "how did you vote" and the "how are you planning to vote" answers.
survey_online['vote_method'] = survey_online['vote_method'].apply(process_vote_method)
survey_online['vote_method_plan'] = survey_online['vote_method_plan'].apply(process_vote_method)
# Merge the two columns into one: missing values become '', sjoin joins the pair with ';',
# and the ';' is then removed — i.e. the two answers are concatenated ('' when neither was given).
survey_online['vote_method'] = survey_online[['vote_method', 'vote_method_plan']].fillna('').apply(sjoin, axis=1).apply(lambda x: x.replace(';', ''))
survey_online = survey_online.drop(['vote_method_plan'], axis=1)

print('Processing Senate vote...')
def transform_senate_vote(pref):
    """Reduce a Senate-runoff answer to a candidate name or 'Undecided'.

    Args:
        pref: Answer string; only the text before the first comma is used.

    Returns:
        str: That text when it is one of the four candidates or
        'Do not intend to vote'; 'Undecided' for anything else
        (including an empty string).
    """
    recognised = ('Jon Ossoff', 'David Perdue', 'Raphael Warnock', 'Kelly Loeffler', 'Do not intend to vote')
    choice = pref.split(',', 1)[0]
    return choice if choice in recognised else 'Undecided'
# For each race, merge the "intend to vote for" column with the "did vote for" column
# (missing -> '', joined with ';' by sjoin, ';' removed, i.e. concatenated), then recode
# with transform_senate_vote. An empty merged answer comes out as 'Undecided'.
survey_online['vote_ossoff_perdue'] = survey_online[['vote_ossoff_perdue', 'vote_ossoff_perdue2']].fillna('').apply(sjoin, axis=1).apply(lambda x: x.replace(';', '')).apply(transform_senate_vote)
survey_online = survey_online.drop(['vote_ossoff_perdue2'], axis=1)
survey_online['vote_warnock_loeffler'] = survey_online[['vote_warnock_loeffler', 'vote_warnock_loeffler2']].fillna('').apply(sjoin, axis=1).apply(lambda x: x.replace(';', '')).apply(transform_senate_vote)
survey_online = survey_online.drop(['vote_warnock_loeffler2'], axis=1)

print('Process income...')
survey_online['income'] = survey_online['income'].apply(transform_income)

print('Process race...')
survey_online['race'] = survey_online['race'].apply(transform_race)

print('Processing 2016 vote...')
# Missing answers are counted as 'Other' before recoding (the transform calls .split, so it needs a string).
survey_online['vote2016'] = survey_online['vote2016'].fillna('Other').apply(transform_2016_vote)

print('Processing 2020 vote...')
survey_online['vote2020'] = survey_online['vote2020'].fillna('Other').apply(transform_2020_vote)
# The "who actually won" question is recoded with the same 2020-vote transform.
survey_online['rigged_who_actually_won'] = survey_online['rigged_who_actually_won'].fillna('Other').apply(transform_2020_vote)

print('Processing GSS trust...')
# Recode, treat missing as "Don't know", then replace any remaining typographic apostrophes with straight ones.
survey_online['gss_trust'] = survey_online['gss_trust'].apply(transform_gss_trust).fillna('Don\'t know').apply(lambda x: x.replace('’', '\''))

print('Processing GSS Bible...')
# Same sequence as for gss_trust: recode, fill missing, normalise apostrophes.
survey_online['gss_bible'] = survey_online['gss_bible'].apply(transform_gss_bible).fillna('Don\'t know').apply(lambda x: x.replace('’', '\''))

print('Process Likert...')
# Every agree/disagree item gets the same treatment: a missing answer counts as
# "Don't know", then the answer is collapsed with simplify_likert.
for c in ['factory_farming_ban', 'slaughterhouse_ban', 'vegetarian_imperative', 'birth_control', 'gss_spanking']:
    survey_online[c] = survey_online[c].fillna('Don\'t know').apply(simplify_likert)

# Combined two-race preference, formatted '<Warnock/Loeffler answer> & <Ossoff/Perdue answer>'.
survey_online['vote_ga_senate'] = survey_online['vote_warnock_loeffler'] + ' & ' + survey_online['vote_ossoff_perdue']

# Social-network columns: any string value becomes 'Yes', anything else (e.g. NaN) becomes 'No'.
for c in survey_online.columns:
    if c.startswith('social_'):
        print('Processing {}...'.format(c))
        survey_online[c] = survey_online[c].apply(lambda x: 'Yes' if isinstance(x, str) else 'No')

# Show the processed columns in alphabetical order for a visual check.
display_columns(survey_online[sorted(list(survey_online.columns))])

Processing columns...
Processing age...
Processing education...
Processing ID...
Processing vote method...
Processing Senate vote...
Process income...
Process race...
Processing 2016 vote...
Processing 2020 vote...
Processing GSS trust...
Processing GSS Bible...
Process Likert...
Processing social_fb...
## age ##
18-34          61.056751
35-54          29.158513
55-64           6.849315
65 or older     2.544031
Under 18        0.391389
Name: age, dtype: float64
-
-
## birth_control ##
Disagree      89.432485
Don't know     6.849315
Agree          3.718200
Name: birth_control, dtype: float64
-
-
## covid_know_dead ##
No        68.369352
Yes       29.273084
Unsure     2.357564
Name: covid_know_dead, dtype: float64
-
-
## covid_positive ##
No        91.944990
Yes        6.286837
Unsure     1.768173
Name: covid_positive, dtype: float64
-
-
## covid_reduced_work ##
Yes       54.813360
No        43.614931
Unsure     1.571709
Name: covid_reduced_work, dtype: float64
-
-
## covid_trouble_bills

In [5]:
print('Drop...')
# Restrict the online sample to respondents who belong in the analysis.
# Each filter keeps rows where the condition holds; note that `!=` keeps rows with a
# missing value while `==` drops them.

# Respondents bucketed 'Under 18' (which includes those with a missing birth year).
survey_online = survey_online[survey_online['age'] != 'Under 18']
# vote_method is '' when neither the "how did you vote" nor the "how are you planning to vote"
# question was answered.
survey_online = survey_online[survey_online['vote_method'] != '']
# Eligible, registered, and planning to vote.
survey_online = survey_online[survey_online['lv_eligible'] == 'Yes']
survey_online = survey_online[survey_online['lv_registered'] == 'Yes']
survey_online = survey_online[survey_online['lv_plan'] == 'Yes']
survey_online = survey_online[survey_online['vote_method'] != 'Don\'t know']
# Anyone who said they do not intend to vote in either race.
survey_online = survey_online[survey_online['vote_ossoff_perdue'] != 'Do not intend to vote']
survey_online = survey_online[survey_online['vote_warnock_loeffler'] != 'Do not intend to vote']
# Self-reported less-than-full honesty.
survey_online = survey_online[survey_online['honesty'] != 'Somewhat honestly']
survey_online = survey_online[survey_online['honesty'] != 'Not honestly at all']
# Respondents outside Georgia. loc_county is never apostrophe-normalised, and other answers in
# this export use the typographic apostrophe (’), so accept both spellings; comparing against
# the straight-apostrophe form alone could silently match nothing.
survey_online = survey_online[~survey_online['loc_county'].isin(['I don\'t live in Georgia', 'I don’t live in Georgia'])]
# The "reason for not voting" columns and the screening columns are no longer needed once
# the filters above have been applied.
survey_online = survey_online[[c for c in survey_online.columns if 'not_voting' not in c]]
survey_online = survey_online.drop(['lv_eligible', 'lv_plan', 'lv_registered', 'voted', 'honesty'], axis=1)

# Show the remaining columns in alphabetical order for a visual check.
display_columns(survey_online[sorted(list(survey_online.columns))])

Drop...
## age ##
18-34          57.662338
35-54          31.168831
55-64           8.051948
65 or older     3.116883
Name: age, dtype: float64
-
-
## birth_control ##
Disagree      90.649351
Don't know     5.714286
Agree          3.636364
Name: birth_control, dtype: float64
-
-
## covid_know_dead ##
No        66.493506
Yes       31.948052
Unsure     1.558442
Name: covid_know_dead, dtype: float64
-
-
## covid_positive ##
No        92.207792
Yes        7.012987
Unsure     0.779221
Name: covid_positive, dtype: float64
-
-
## covid_reduced_work ##
Yes       54.285714
No        44.415584
Unsure     1.298701
Name: covid_reduced_work, dtype: float64
-
-
## covid_trouble_bills ##
No        62.239583
Yes       34.375000
Unsure     3.385417
Name: covid_trouble_bills, dtype: float64
-
-
## education ##
Graduated from college        38.181818
Some college, no degree       37.662338
Completed graduate school     12.727273
Graduated from high school    11.168831
Less than high school          0.259

In [6]:
# Load the raw IVR (landline) export.
survey_ivr = pd.read_csv('ivr_landline_sample.csv')

# Short analysis name -> column header in the IVR CSV.
# Several headers end in a space ('Q5 ', 'Q6 ', ...); the strings here must match the CSV exactly.
# This rebinds variable_map, replacing the map used for the online sample.
variable_map = {'id': 'record id',
                'lv_registered': 'Q1',
                'vote_method': 'Q3',
                'lv_likely': 'Q2',
                'vote2016': 'Q5 ',
                'vote2020': 'Q6 ',
                'vote_ossoff_perdue': 'Q7 ',
                'vote_warnock_loeffler': 'Q8',
                'enthusiasm': 'Q4 ',
                'rigged_who_actually_won': 'Q9',
                'gss_trust': 'Q10',
                'gss_bible': 'Q11',
                'gss_spanking': 'Q12 ',
                'factory_farming_ban': 'Q13 ',
                'covid_trouble_bills': 'Q14 ',
                'social_fb': 'Q15',
                'gender': 'Q18',
                'age': 'Q19',
                'race': 'Q20',
                'education': 'Q21',
                'income': 'Q22',
                'loc_county': 'Q23'}

print('Processing columns...')
# Strip non-breaking spaces from the headers (ordinary trailing spaces are left in place).
survey_ivr.columns = [c.replace('\xa0', '') for c in survey_ivr.columns]
# Invert the map in place to CSV header -> short name.
variable_map = {v: k for k, v in variable_map.items()}
# Keep only the mapped columns (in map order) and give them their short names.
survey_ivr = survey_ivr[variable_map.keys()].rename(variable_map, axis=1)
# Tag every row with its collection mode.
survey_ivr['survey_method'] = 'IVR'

print('Processing ID...')
# Replace the record id with 'wick_' + the SHA-224 hex digest of its string form.
survey_ivr['id'] = survey_ivr['id'].apply(lambda x: 'wick_{}'.format(hashlib.sha224(str(x).encode()).hexdigest()))

print('Processing education...')
def transform_education2(educ):
    """Recode an IVR education answer into the shared education categories.

    Args:
        educ: Education answer as it appears in the IVR export.

    Returns:
        The recoded label for the five answers listed below; any other
        value is returned unchanged.
    """
    recoded = {
        'Some college': 'Some college, no degree',
        'Bachelors degree': 'Graduated from college',
        'Some graduate school': 'Graduated from college',
        'High school graduate': 'Graduated from high school',
        'Masters': 'Completed graduate school',
    }
    return recoded.get(educ, educ)
# Recode the raw IVR education answers in place with transform_education2.
survey_ivr['education'] = survey_ivr['education'].apply(transform_education2)

print('Processing GSS Bible...')
def transform_gss_bible2(bible):
    """Shorten an IVR GSS Bible answer to its reporting label.

    Args:
        bible: Answer as it appears in the IVR export.

    Returns:
        'Word of God', 'Inspired word', 'Book of fables' or 'I don\'t know'
        for the recognised answers. Any other value is returned unchanged,
        matching the other transforms in this notebook, instead of being
        silently turned into ``None``.
    """
    if bible == 'The bible is the actual word of God':
        return 'Word of God'
    elif bible == 'The Bible is the inspired word of God':
        return 'Inspired word'
    elif bible == 'The Bible is an ancient book of fables':
        return 'Book of fables'
    elif bible == 'I do not know enough' or bible == 'I don’t know':
        return 'I don\'t know'
    else:
        # Pass unrecognised answers through so they stay visible downstream.
        return bible
# Recode the raw IVR Bible answers in place with transform_gss_bible2.
survey_ivr['gss_bible'] = survey_ivr['gss_bible'].apply(transform_gss_bible2)

print('Processing GSS Trust...')
def transform_gss_trust2(trust):
    """Shorten an IVR GSS trust answer to its reporting label.

    Args:
        trust: Answer as it appears in the IVR export.

    Returns:
        'Can trust' or 'Can\'t be too careful' for the two known answers;
        any other value is returned unchanged.
    """
    short_labels = {
        'Most people can be trusted': 'Can trust',
        'You can\'t be too careful': 'Can\'t be too careful',
    }
    return short_labels.get(trust, trust)
# Recode the raw IVR trust answers in place with transform_gss_trust2.
survey_ivr['gss_trust'] = survey_ivr['gss_trust'].apply(transform_gss_trust2) 

print('Processing income...')
def transform_income2(income):
    """Recode an IVR income answer into the shared dollar-range labels.

    Args:
        income: Income answer as it appears in the IVR export.

    Returns:
        The matching dollar-range label for the recognised brackets; any
        other value is returned unchanged.
    """
    if income == 'Between 50K to 75K':
        return 'Between $50,000 and $74,999'
    # Both lower-middle brackets fold into one combined bracket.
    # 'Between 50K to 50K' is kept in case the export really contains that label, and
    # 'Between 30K to 50K' — the bracket the combined range implies — is accepted too,
    # so it cannot slip through unrecoded.
    elif income in ('Between 15K to 30K', 'Between 30K to 50K', 'Between 50K to 50K'):
        return 'Between $15,000 and $49,999'
    elif income == 'Between 75K to 99K':
        return 'Between $75,000 and $99,999'
    elif income == 'Between 100K to 150K':
        return 'Between $100,000 and $150,000'
    elif income == 'Over 150K':
        return 'Over $150,000'
    else:
        return income
# Recode the raw IVR income answers in place with transform_income2.
survey_ivr['income'] = survey_ivr['income'].apply(transform_income2)

print('Processing county...')
def transform_county(county):
    """Recode an upper-case IVR county name into the county answer labels.

    Args:
        county: County value as it appears in the IVR export.

    Returns:
        str: The '<Name> County, GA' label for FULTON, COBB, DEKALB and
        GWINNETT; 'Another county in Georgia' for every other value.
    """
    named_counties = {
        'FULTON': 'Fulton County, GA',
        'COBB': 'Cobb County, GA',
        'DEKALB': 'DeKalb County, GA',
        'GWINNETT': 'Gwinnett County, GA',
    }
    return named_counties.get(county, 'Another county in Georgia')
# Recode the raw IVR county values in place with transform_county.
survey_ivr['loc_county'] = survey_ivr['loc_county'].apply(transform_county)

print('Processing vote...')
def transform_vote2(vote):
    """Pool minor-candidate answers in an IVR presidential-vote column.

    Args:
        vote: Answer as it appears in the IVR export.

    Returns:
        'Other' for Gary Johnson, Jill Stein or 'Another candidate';
        any other value is returned unchanged.
    """
    pooled_as_other = ('Gary Johnson', 'Jill Stein', 'Another candidate')
    return 'Other' if vote in pooled_as_other else vote
# The same pooling of minor candidates is applied to both presidential-vote columns.
survey_ivr['vote2016'] = survey_ivr['vote2016'].apply(transform_vote2)
survey_ivr['vote2020'] = survey_ivr['vote2020'].apply(transform_vote2)

print('Processing FB...')
# A "Don't know" answer to the Facebook question is counted as 'No'.
survey_ivr['social_fb'] = survey_ivr['social_fb'].apply(lambda x: 'No' if x == 'Don\'t know' else x)

print('Processing Likert...')
# 'Unsure' becomes "Don't know"; every other answer is passed through str.capitalize
# (first character upper-cased, the rest lower-cased) before simplify_likert collapses it.
# NOTE(review): a missing (NaN) answer has no .capitalize and would raise here.
survey_ivr['factory_farming_ban'] = survey_ivr['factory_farming_ban'].apply(lambda x: 'Don\'t know' if x == 'Unsure' else x.capitalize()).apply(simplify_likert)
survey_ivr['gss_spanking'] = survey_ivr['gss_spanking'].apply(lambda x: 'Don\'t know' if x == 'Unsure' else x.capitalize()).apply(simplify_likert)
# This column goes the other way: "Don't know" is relabelled 'Unsure'.
survey_ivr['covid_trouble_bills'] = survey_ivr['covid_trouble_bills'].apply(lambda x: 'Unsure' if x == 'Don\'t know' else x)

# Combined two-race preference, formatted '<Warnock/Loeffler answer> & <Ossoff/Perdue answer>'.
survey_ivr['vote_ga_senate'] = survey_ivr['vote_warnock_loeffler'] + ' & ' + survey_ivr['vote_ossoff_perdue']

# Show the processed columns in alphabetical order for a visual check.
display_columns(survey_ivr[sorted(list(survey_ivr.columns))])

Processing columns...
Processing ID...
Processing education...
Processing GSS Bible...
Processing GSS Trust...
Processing income...
Processing county...
Processing vote...
Processing FB...
Processing Likert...
## age ##
65 or older    51.960784
55-64          28.627451
35-54          14.509804
18-34           4.901961
Name: age, dtype: float64
-
-
## covid_trouble_bills ##
No        66.078431
Yes       27.549020
Unsure     6.372549
Name: covid_trouble_bills, dtype: float64
-
-
## education ##
Graduated from college        28.235294
Some college, no degree       27.450980
Completed graduate school     19.607843
Graduated from high school    19.607843
Less than high school          5.098039
Name: education, dtype: float64
-
-
## enthusiasm ##
More enthusiastic    77.058824
Less enthusiastic    12.058824
About the same       10.882353
Name: enthusiasm, dtype: float64
-
-
## factory_farming_ban ##
Don't know    68.137255
Agree         19.411765
Disagree      12.450980
Name: factory_farming

In [7]:
# Columns present in both samples, and columns that exist only in the online sample.
# Columns that exist only in the IVR frame are in neither list and are not carried over.
same_cols = sorted(set(survey_ivr.columns).intersection(survey_online.columns))
diff_cols = sorted(set(survey_online.columns).difference(survey_ivr.columns))

# Give the IVR frame a placeholder for every online-only question so both frames share one layout.
for c in diff_cols:
    survey_ivr[c] = 'Not presented'

# Stack the online rows first, then the IVR rows, and renumber the result from 0.
survey = pd.concat(
    [frame[same_cols + diff_cols] for frame in (survey_online, survey_ivr)],
    ignore_index=True,
)

display_columns(survey)

## age ##
65 or older    38.576512
55-64          22.989324
18-34          19.359431
35-54          19.074733
Name: age, dtype: float64
-
-
## birth_control ##
Not presented    72.597865
Disagree         24.839858
Don't know        1.565836
Agree             0.996441
Name: birth_control, dtype: float64
-
-
## covid_know_dead ##
Not presented    72.597865
No               18.220641
Yes               8.754448
Unsure            0.427046
Name: covid_know_dead, dtype: float64
-
-
## covid_positive ##
Not presented    72.597865
No               25.266904
Yes               1.921708
Unsure            0.213523
Name: covid_positive, dtype: float64
-
-
## covid_reduced_work ##
Not presented    72.597865
Yes              14.875445
No               12.170819
Unsure            0.355872
Name: covid_reduced_work, dtype: float64
-
-
## covid_trouble_bills ##
No        65.028490
Yes       29.415954
Unsure     5.555556
Name: covid_trouble_bills, dtype: float64
-
-
## education ##
Graduated from college  

In [8]:
# Write the combined online + IVR responses to disk, without the row index.
survey.to_csv('responses_processed.csv', index=False)