In [45]:
import pandas as pd

from mlgear.utils import display_columns
from surveyweights import normalize_weights

In [46]:
survey = pd.read_csv('responses_processed_with_default_weights.csv')

## Raw Demographics

In [47]:
demographics = ['loc_county', 'gender', 'race', 'education', 'income', 'age', 'vote2016', 'vote2020']
for var in demographics:
    print('## {} ##'.format(var))
    print(survey[var].value_counts(normalize=True) * 100)
    print('-')
    print('-')

## loc_county ##
Another county in Georgia    72.455516
Fulton County, GA            10.320285
Cobb County, GA               8.327402
Gwinnett County, GA           6.548043
DeKalb County, GA             2.348754
Name: loc_county, dtype: float64
-
-
## gender ##
Female               53.950178
Male                 44.768683
Other                 1.209964
Prefer not to say     0.071174
Name: gender, dtype: float64
-
-
## race ##
White, not Hispanic    68.113879
Black, non-Hispanic    21.637011
Other                   8.469751
Hispanic                1.779359
Name: race, dtype: float64
-
-
## education ##
Graduated from college        30.960854
Some college, no degree       30.249110
Completed graduate school     17.722420
Graduated from high school    17.295374
Less than high school          3.772242
Name: education, dtype: float64
-
-
## income ##
Between $15,000 and $49,999      36.725979
Between $50,000 and $74,999      19.928826
Under $15,000                    16.441281
Between $75,0

## Demographics after weighting

In [48]:
for var in demographics:
    print('## {} ##'.format(var))
    print(survey[var].value_counts(normalize=True) * survey.groupby(var)['weight'].mean() * 100)
    print('-')
    print('-')

## loc_county ##
Another county in Georgia    66.856037
Cobb County, GA               7.199936
DeKalb County, GA             7.144560
Fulton County, GA             9.996708
Gwinnett County, GA           8.802759
dtype: float64
-
-
## gender ##
Female               51.060622
Male                 48.268777
Other                 0.599480
Prefer not to say     0.071121
dtype: float64
-
-
## race ##
Black, non-Hispanic    32.595866
Hispanic                9.897053
Other                   5.497792
White, not Hispanic    52.009290
dtype: float64
-
-
## education ##
Completed graduate school      9.102887
Graduated from college        18.304673
Graduated from high school    30.094212
Less than high school         14.096090
Some college, no degree       28.402138
dtype: float64
-
-
## income ##
Between $100,000 and $150,000    12.068808
Between $15,000 and $49,999      35.246324
Between $50,000 and $74,999      18.006331
Between $75,000 and $99,999      11.599689
Over $150,000                  

## Expected Vote Demographics (Demographics after Weighting + Likely Voter Model)

In [49]:
for var in demographics:
    print('## {} ##'.format(var))
    print(survey[var].value_counts(normalize=True) * survey.groupby(var)['lv_weight'].mean() * 100)
    print('-')
    print('-')

## loc_county ##
Another county in Georgia    69.893732
Cobb County, GA               6.830933
DeKalb County, GA             5.205877
Fulton County, GA             9.341776
Gwinnett County, GA           8.727682
dtype: float64
-
-
## gender ##
Female               50.082035
Male                 49.305544
Other                 0.552167
Prefer not to say     0.060254
dtype: float64
-
-
## race ##
Black, non-Hispanic    32.572761
Hispanic                9.147083
Other                   5.312267
White, not Hispanic    52.967888
dtype: float64
-
-
## education ##
Completed graduate school      8.956373
Graduated from college        17.569730
Graduated from high school    30.169418
Less than high school         15.581855
Some college, no degree       27.722624
dtype: float64
-
-
## income ##
Between $100,000 and $150,000    12.359966
Between $15,000 and $49,999      34.366229
Between $50,000 and $74,999      17.516048
Between $75,000 and $99,999      12.149610
Over $150,000                  

## Demographic by Vote

In [50]:
for var in demographics:
    print('## {} ##'.format(var))
    options = survey[var].unique()
    for option in options:
        print('-')
        print('---- vote choice for "{}"'.format(option))
        survey_ = survey[survey[var] == option]
        survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
        print('------ "{}" 2016 vote'.format(option))
        print(survey_['vote2016'].value_counts(normalize=True) * survey_.groupby('vote2016')['lv_weight'].mean() * 100)
        print('------ "{}" 2020 vote'.format(option))
        print(survey_['vote2020'].value_counts(normalize=True) * survey_.groupby('vote2020')['lv_weight'].mean() * 100)
        print('------ "{}" Ossoff-Perdue vote'.format(option))
        print(survey_['vote_ossoff_perdue'].value_counts(normalize=True) * survey_.groupby('vote_ossoff_perdue')['lv_weight'].mean() * 100)
        print('------ "{}" Warnock-Loeffler vote'.format(option))
        print(survey_['vote_warnock_loeffler'].value_counts(normalize=True) * survey_.groupby('vote_warnock_loeffler')['lv_weight'].mean() * 100)
    print('-')
    print('-')
    print('-')
    print('-')

## loc_county ##
-
---- vote choice for "Another county in Georgia"
------ "Another county in Georgia" 2016 vote
Did not vote        7.544310
Donald Trump       56.708254
Hillary Clinton    32.278083
Other               3.469352
dtype: float64
------ "Another county in Georgia" 2020 vote
Did not vote     1.015949
Donald Trump    59.899854
Joe Biden       37.602152
Other            1.482045
dtype: float64
------ "Another county in Georgia" Ossoff-Perdue vote
David Perdue    56.262463
Jon Ossoff      37.718859
Undecided        6.018678
dtype: float64
------ "Another county in Georgia" Warnock-Loeffler vote
Kelly Loeffler     57.772963
Raphael Warnock    38.354989
Undecided           3.872048
dtype: float64
-
---- vote choice for "Cobb County, GA"
------ "Cobb County, GA" 2016 vote
Did not vote        8.800234
Donald Trump       40.636977
Hillary Clinton    40.294460
Other              10.268329
dtype: float64
------ "Cobb County, GA" 2020 vote
Did not vote     1.167102
Donald Trump    51

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])



------ "Graduated from college" Ossoff-Perdue vote
David Perdue    44.804533
Jon Ossoff      52.357044
Undecided        2.838423
dtype: float64
------ "Graduated from college" Warnock-Loeffler vote
Kelly Loeffler     45.285964
Raphael Warnock    52.499758
Undecided           2.214278
dtype: float64
-
---- vote choice for "Some college, no degree"
------ "Some college, no degree" 2016 vote
Did not vote       13.951526
Donald Trump       47.643812
Hillary Clinton    36.514753
Other               1.889909
dtype: float64
------ "Some college, no degree" 2020 vote
Did not vote     0.681807
Donald Trump    52.541927
Joe Biden       45.740231
Other            1.036035
dtype: float64
------ "Some college, no degree" Ossoff-Perdue vote
David Perdue    48.815528
Jon Ossoff      47.093053
Undecided        4.091418
dtype: float64
------ "Some college, no degree" Warnock-Loeffler vote
Kelly Loeffler     48.578054
Raphael Warnock    47.101878
Undecided           4.320067
dtype: float64
-
---- vote 

dtype: float64
------ "Donald Trump" 2020 vote
Did not vote     0.135415
Donald Trump    96.647801
Joe Biden        3.173963
Other            0.042822
dtype: float64
------ "Donald Trump" Ossoff-Perdue vote
David Perdue    91.133935
Jon Ossoff       4.974693
Undecided        3.891373
dtype: float64
------ "Donald Trump" Warnock-Loeffler vote
Kelly Loeffler     92.067464
Raphael Warnock     6.018063
Undecided           1.914473
dtype: float64
-
---- vote choice for "Other"
------ "Other" 2016 vote
Other    100.0
dtype: float64
------ "Other" 2020 vote
Did not vote     3.932513
Donald Trump    45.242320
Joe Biden       31.933649
Other           18.891518
dtype: float64
------ "Other" Ossoff-Perdue vote
David Perdue    36.798291
Jon Ossoff      30.991104
Undecided       32.210605
dtype: float64
------ "Other" Warnock-Loeffler vote
Kelly Loeffler     35.290423
Raphael Warnock    32.331534
Undecided          32.378044
dtype: float64
-
-
-
-
## vote2020 ##
-
---- vote choice for "Joe Biden"


## Vote by Demographic

In [51]:
for vote in ['vote_ossoff_perdue', 'vote_warnock_loeffler']:
    for choice in survey[vote].unique():
        for var in demographics:
            print('{} = {}, broken down by {}'.format(vote, choice, var))
            survey_ = survey[survey[vote] == choice]
            survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
            print(survey_[var].value_counts(normalize=True) * survey_.groupby(var)['lv_weight'].mean() * 100)
            print('-')
        print('-')
        print('-')

vote_ossoff_perdue = Jon Ossoff, broken down by loc_county
Another county in Georgia    57.861352
Cobb County, GA               6.997854
DeKalb County, GA             9.861471
Fulton County, GA            14.472153
Gwinnett County, GA          10.807170
dtype: float64
-
vote_ossoff_perdue = Jon Ossoff, broken down by gender
Female               53.860782
Male                 45.607781
Other                 0.399192
Prefer not to say     0.132245
dtype: float64
-
vote_ossoff_perdue = Jon Ossoff, broken down by race
Black, non-Hispanic    58.540798
Hispanic                7.415380
Other                   6.871575
White, not Hispanic    27.172247
dtype: float64
-
vote_ossoff_perdue = Jon Ossoff, broken down by education
Completed graduate school     11.143768
Graduated from college        20.189799
Graduated from high school    25.227897
Less than high school         14.784687
Some college, no degree       28.653850
dtype: float64
-
vote_ossoff_perdue = Jon Ossoff, broken down by income
B

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])


Did not vote        4.649321
Donald Trump       86.545653
Hillary Clinton     6.162269
Other               2.642757
dtype: float64
-
vote_warnock_loeffler = Kelly Loeffler, broken down by vote2020
Did not vote     1.026240
Donald Trump    95.947970
Joe Biden        2.921221
Other            0.104569
dtype: float64
-
-
-


## Demographic by Likely Voter

In [62]:
demographics += ['vote_warnock_loeffler', 'vote_ossoff_perdue']
for var in demographics:
    print('## {} ##'.format(var))
    options = survey[var].unique()
    for option in options:
        print('---- likely voter breakdown for "{}"'.format(option))
        survey_ = survey[survey[var] == option]
        survey_['weight'] = normalize_weights(survey_['weight'])
        print(survey_['lv_likely'].value_counts(normalize=True) * survey_.groupby('lv_likely')['weight'].mean() * 100)
        print('-')

## loc_county ##
---- likely voter breakdown for "Another county in Georgia"
Already voted      56.603442
Likely              1.880269
Somewhat likely     1.251131
Very likely        41.729996
dtype: float64
-
---- likely voter breakdown for "Cobb County, GA"
Already voted      52.755148
Likely              0.571689
Somewhat likely     1.356616
Very likely        38.127883
dtype: float64
-
---- likely voter breakdown for "DeKalb County, GA"
Likely              0.692435
Somewhat likely     2.480260
Very likely        90.045648
dtype: float64
-
---- likely voter breakdown for "Fulton County, GA"
Already voted      38.123132
Likely              0.533236
Somewhat likely     0.728944
Very likely        59.080858
dtype: float64
-
---- likely voter breakdown for "Gwinnett County, GA"
Already voted      38.347387
Likely              2.377669
Somewhat likely     0.632613
Very likely        57.275604
dtype: float64
-
## gender ##
---- likely voter breakdown for "Female"
Already voted      44.325

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_['weight'] = normalize_weights(survey_['weight'])


## Likely Voter by Demographic

In [53]:
for lvx in sorted(survey['lv_index'].unique())[::-1]:
    for var in demographics:
        print('lv_index = {}, broken down by {}'.format(lvx, var))
        survey_ = survey[survey['lv_index'] == lvx]
        survey_['weight'] = normalize_weights(survey_['weight'])
        print(survey_[var].value_counts(normalize=True) * survey_.groupby(var)['weight'].mean() * 100)
        print('-')

lv_index = 0.83, broken down by loc_county
Another county in Georgia    71.115278
Cobb County, GA               6.700482
DeKalb County, GA             4.297786
Fulton County, GA             9.165652
Gwinnett County, GA           8.720802
dtype: float64
-
lv_index = 0.83, broken down by gender
Female    50.120703
Male      49.599254
Other      0.280042
dtype: float64
-
lv_index = 0.83, broken down by race


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_['weight'] = normalize_weights(survey_['weight'])


Black, non-Hispanic    32.988733
Hispanic                8.547806
Other                   5.062441
White, not Hispanic    53.401021
dtype: float64
-
lv_index = 0.83, broken down by education
Completed graduate school      8.850101
Graduated from college        17.134697
Graduated from high school    30.273969
Less than high school         16.482076
Some college, no degree       27.259156
dtype: float64
-
lv_index = 0.83, broken down by income
Between $100,000 and $150,000    12.592045
Between $15,000 and $49,999      33.840410
Between $50,000 and $74,999      17.311934
Between $75,000 and $99,999      12.293846
Over $150,000                    10.207852
Under $15,000                    13.753913
dtype: float64
-
lv_index = 0.83, broken down by age
18-34          21.628472
35-54          36.652094
55-64          19.047803
65 or older    22.671631
dtype: float64
-
lv_index = 0.83, broken down by vote2016
Did not vote        8.361546
Donald Trump       49.463087
Hillary Clinton    38.6033

dtype: float64
-
lv_index = 0.23, broken down by vote2020
Did not vote     1.081898
Donald Trump    14.958645
Joe Biden       82.977182
Other            0.982275
dtype: float64
-
lv_index = 0.23, broken down by vote_warnock_loeffler
Kelly Loeffler     12.907154
Raphael Warnock    85.553047
Undecided           1.539799
dtype: float64
-
lv_index = 0.23, broken down by vote_ossoff_perdue
David Perdue    13.088316
Jon Ossoff      85.497349
Undecided        1.414335
dtype: float64
-
lv_index = 0.11, broken down by loc_county
Another county in Georgia    90.265063
Gwinnett County, GA           9.734937
dtype: float64
-
lv_index = 0.11, broken down by gender
Female    100.0
dtype: float64
-
lv_index = 0.11, broken down by race
Black, non-Hispanic    28.632026
White, not Hispanic    71.367974
dtype: float64
-
lv_index = 0.11, broken down by education
Graduated from high school    83.645195
Some college, no degree       16.354805
dtype: float64
-
lv_index = 0.11, broken down by income
Between $

## Demographic by Enthusiasm

In [54]:
for var in demographics:
    print('## {} ##'.format(var))
    options = survey[var].unique()
    for option in options:
        print('---- enthusiasm breakdown for "{}"'.format(option))
        survey_ = survey[survey[var] == option]
        survey_['weight'] = normalize_weights(survey_['weight'])
        print(survey_['enthusiasm'].value_counts(normalize=True) * survey_.groupby('enthusiasm')['weight'].mean() * 100)
        print('-')

## loc_county ##
---- enthusiasm breakdown for "Another county in Georgia"
About the same       14.904883
Less enthusiastic    14.671138
More enthusiastic    70.423980
dtype: float64
-
---- enthusiasm breakdown for "Cobb County, GA"
About the same       23.944060
Less enthusiastic    20.520599
More enthusiastic    55.535341
dtype: float64
-
---- enthusiasm breakdown for "DeKalb County, GA"
About the same       31.455379
Less enthusiastic     1.411728
More enthusiastic    67.132892
dtype: float64
-
---- enthusiasm breakdown for "Fulton County, GA"
About the same       14.646203
Less enthusiastic     9.830627
More enthusiastic    75.523171
dtype: float64
-
---- enthusiasm breakdown for "Gwinnett County, GA"
About the same       12.258662
Less enthusiastic     9.158847
More enthusiastic    78.582491
dtype: float64
-
## gender ##
---- enthusiasm breakdown for "Female"
About the same       18.175587
Less enthusiastic    15.511048
More enthusiastic    66.313365
dtype: float64
-
---- enthusia

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_['weight'] = normalize_weights(survey_['weight'])


## Enthusiasm by Demographic

In [55]:
for enth in sorted(survey['enthusiasm'].unique())[::-1]:
    for var in demographics:
        print('enthusiasm = {}, broken down by {}'.format(enth, var))
        survey_ = survey[survey['enthusiasm'] == enth]
        survey_['weight'] = normalize_weights(survey_['weight'])
        print(survey_[var].value_counts(normalize=True) * survey_.groupby(var)['weight'].mean() * 100)
        print('-')

enthusiasm = More enthusiastic, broken down by loc_county
Another county in Georgia    66.931290
Cobb County, GA               5.684157
DeKalb County, GA             6.818343
Fulton County, GA            10.732607
Gwinnett County, GA           9.833602
dtype: float64
-
enthusiasm = More enthusiastic, broken down by gender
Female    48.134357
Male      51.056523
Other      0.809119
dtype: float64

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_['weight'] = normalize_weights(survey_['weight'])



-
enthusiasm = More enthusiastic, broken down by race
Black, non-Hispanic    33.866618
Hispanic                8.721566
Other                   5.992500
White, not Hispanic    51.419316
dtype: float64
-
enthusiasm = More enthusiastic, broken down by education
Completed graduate school      9.390656
Graduated from college        19.044358
Graduated from high school    28.612265
Less than high school         15.886583
Some college, no degree       27.066138
dtype: float64
-
enthusiasm = More enthusiastic, broken down by income
Between $100,000 and $150,000    14.099337
Between $15,000 and $49,999      34.654563
Between $50,000 and $74,999      16.649479
Between $75,000 and $99,999      10.240385
Over $150,000                    10.452739
Under $15,000                    13.903497
dtype: float64
-
enthusiasm = More enthusiastic, broken down by age
18-34          27.654260
35-54          33.338269
55-64          17.621859
65 or older    21.385612
dtype: float64
-
enthusiasm = More enthusi

## Demographic by Vote Method

In [56]:
for var in demographics:
    print('## {} ##'.format(var))
    options = survey[var].unique()
    for option in options:
        print('---- vote method breakdown for "{}"'.format(option))
        survey_ = survey[survey[var] == option]
        survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
        print(survey_['vote_method'].value_counts(normalize=True) * survey_.groupby('vote_method')['lv_weight'].mean() * 100)
        print('-')

## loc_county ##
---- vote method breakdown for "Another county in Georgia"
Absentee by mail               32.861080
Early vote in person           48.060987
I will vote on election day    19.077933
dtype: float64
-
---- vote method breakdown for "Cobb County, GA"
Absentee by mail               48.140608
Early vote in person           35.986649
I will vote on election day    15.872743
dtype: float64
-
---- vote method breakdown for "DeKalb County, GA"
Absentee by mail               54.774018
Early vote in person           28.778601
I will vote on election day    16.447381
dtype: float64
-
---- vote method breakdown for "Fulton County, GA"
Absentee by mail               31.829426
Early vote in person           48.055665
I will vote on election day    20.114909
dtype: float64
-
---- vote method breakdown for "Gwinnett County, GA"
Absentee by mail               37.415483
Early vote in person           42.350396
I will vote on election day    20.234121
dtype: float64
-
## gender ##
---- vo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])


Absentee by mail               30.860907
Early vote in person           52.614419
I will vote on election day    16.524674
dtype: float64
-
## education ##
---- vote method breakdown for "Completed graduate school"
Absentee by mail               40.803575
Early vote in person           47.556312
I will vote on election day    11.640113
dtype: float64
-
---- vote method breakdown for "Graduated from high school"
Absentee by mail               42.645985
Early vote in person           38.393593
I will vote on election day    18.960422
dtype: float64
-
---- vote method breakdown for "Graduated from college"
Absentee by mail               27.658425
Early vote in person           51.982513
I will vote on election day    20.359062
dtype: float64
-
---- vote method breakdown for "Some college, no degree"
Absentee by mail               26.291208
Early vote in person           51.076579
I will vote on election day    22.632213
dtype: float64
-
---- vote method breakdown for "Less than high schoo

## Vote Method by Demographic

In [59]:
for method in sorted(survey['vote_method'].unique())[::-1]:
    for var in demographics:
        print('vote method = "{}", broken down by {}'.format(method, var))
        survey_ = survey[survey['vote_method'] == method]
        survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
        print(survey_[var].value_counts(normalize=True) * survey_.groupby(var)['weight'].mean() * 100)
        print('-')

vote method = "I will vote on election day", broken down by loc_county
Another county in Georgia    90.736556
Cobb County, GA               7.488519
DeKalb County, GA             5.723386
Fulton County, GA            12.449200
Gwinnett County, GA          11.769604
dtype: float64
-
vote method = "I will vote on election day", broken down by gender
Female    62.718876
Male      65.448388
dtype: float64
-
vote method = "I will vote on election day", broken down by race
Black, non-Hispanic    20.831099
Hispanic               11.197176
Other                   5.948250
White, not Hispanic    90.190740
dtype: float64
-
vote method = "I will vote on election day", broken down by education
Completed graduate school      7.078958
Graduated from college        24.046423
Graduated from high school    38.270115
Less than high school         15.832429
Some college, no degree       42.939339
dtype: float64
-
vote method = "I will vote on election day", broken down by income
Between $100,000 and $150

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
