# Preprocess the dataset.

In [36]:
import pandas as pd
df = pd.read_csv('CCES_2018.csv', low_memory=False)

print(df.shape) 

(60000, 524)


In [37]:
#create a smaller dataframe

df1 = df[['CC18_300d_1', 'CC18_300d_2', 'CC18_300d_3', 'CC18_300d_4', 'CC18_300d_5', 'birthyr', 'CC18_417c', 'educ', 'ideo5', 'newsint', 'faminc_new', 'pid3', 'gender', 'sexuality', 'trans', 'votereg', 'race', 'hispanic', 'employ', 'internethome', 'internetwork', 'votereg_post', 'votereg_f_post','CC18_300_1', 'CC18_300_2', 'CC18_300_3', 'CC18_300_4', 'CC18_300_5', 'CC18_300_6', 'CC18_300b', 'CC18_300c', 'CC18_309a', 'CC18_309b', 'CC18_309c', 'CC18_309d', 'CC18_310a', 'CC18_310b', 'CC18_310c', 'CC18_310d', 'CC18_316', 'CC18_317', 'CC18_318a', 'CC18_350', 'CC18_401', 'CC18_417a_1', 'CC18_417a_2', 'CC18_417a_3', 'CC18_417a_4', 'CC18_417a_5', 'CC18_417a_6', 'CC18_417a_7', 'CC18_417a_8', 'CC18_418a', 'CC18_417b_1', 'CC18_417b_2', 'CC18_417b_3', 'CC18_417b_4', 'CC18_417b_5', 'CC18_417b_6', 'CC18_417b_7', 'CC18_417b_8', 'CC18_417b_9', 'CL_matched', 'CL_voter_status', 'CL_state', 'CL_party', 'CL_2018gvm', 'CL_2018pep', 'CL_2018pvm']]

print(df1.shape) #great. much more manageable

(60000, 69)


Target Variable:
    
    Social media:
    CC18_300d_1
    CC18_300d_2
    CC18_300d_3
    CC18_300d_4
    CC18_300d_5

Additional variables of interest:

Demographics: 
    birthyr
    gender
    sexuality
    trans
    educ
    votereg
    race
    hispanic
    employ
    internethome
    internetwork
    pid3
    ideo5
    newsint
    faminc_new
    votereg_post
    voteret_f_post (this is zip code)
    
Media use:
    CC18_300_1
    CC18_300_2
    CC18_300_3
    CC18_300_4
    CC18_300_5
    CC18_300_6
    CC18_300b
    CC18_300c
   
Political knowledge:
    CC18_309a
    CC18_309b
    CC18_309c
    CC18_309d
    CC18_310a
    CC18_310b
    CC18_310c
    CC18_310d
    
    
Voting:
    CC18_316
    CC18_317
    CC18_318a
    
Intend to vote:
    CC18_350
    CC18_401
    
Other political activity:
    CC18_417a_1
    CC18_417a_2
    CC18_417a_3
    CC18_417a_4
    CC18_417a_5
    CC18_417a_6
    CC18_417a_7
    CC18_417a_8
    
Donating Money:
    CC18_417b_1
    CC18_417b_2
    CC18_417b_3
    CC18_417b_4
    CC18_417b_5
    CC18_417b_6
    CC18_417b_7
    CC18_417b_8
    CC18_417b_9
    CC18_417c
    
Run for office:
    CC18_418a
    
Verified voting: (there will be a lot of missing data here)
    CL_matched
    CL_voter_status
    CL_state
    CL_party
    CL_2018gvm
    CL_2018pep
    CL_2018pvm
    
There are also contextual variables, listing the names of politicians respondent was asked about (see page 103 of codebook).
    
    

In [38]:
# Continuous:
#     birthyr
#     CC18_417c

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
print(scaler.fit_transform(df1[['birthyr', 'CC18_417c']]))


[[ 0.1733919          nan]
 [-0.55697888         nan]
 [-1.00643782         nan]
 ...
 [ 1.35322161         nan]
 [-0.61316125         nan]
 [ 1.29703924         nan]]


In [39]:
# Ordinal:
#     educ
#     ideo5
#     newsint
#     faminc_new

from sklearn.preprocessing import OrdinalEncoder

df1.educ.unique()
edu_cats=['No HS','High school graduate', 'Some college','2-year', '4-year', 'Post-grad']
enc_edu = OrdinalEncoder(categories = [edu_cats])
enc_edu.fit(df[['educ']])


OrdinalEncoder(categories=[['No HS', 'High school graduate', 'Some college',
                            '2-year', '4-year', 'Post-grad']],
               dtype=<class 'numpy.float64'>)

In [6]:
df1.ideo5.unique()

#have to deal with missing values-turn them into an actual category instead of NaN


array(['Not sure', 'Conservative', 'Liberal', 'Moderate',
       'Very conservative', 'Very liberal', nan], dtype=object)

In [40]:
from sklearn.impute import SimpleImputer
import numpy as np
simp_constant = SimpleImputer(fill_value='missing',
        missing_values=np.nan, strategy='constant')


In [41]:
df1['ideo5'] = simp_constant.fit_transform(df1[['ideo5']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [42]:
#categorical columns where I can replace NaN with 'missing'

cat_columns = ['CC18_300d_1', 'CC18_300d_2', 'CC18_300d_3', 'CC18_300d_4',
       'CC18_300d_5', 'educ', 'ideo5', 'newsint',
       'faminc_new', 'pid3', 'gender', 'sexuality', 'trans', 'votereg', 'race',
       'hispanic', 'employ', 'internethome', 'internetwork', 'votereg_post',
       'votereg_f_post', 'CC18_300_1', 'CC18_300_2', 'CC18_300_3',
       'CC18_300_4', 'CC18_300_5', 'CC18_300_6', 'CC18_300b', 'CC18_300c',
       'CC18_309a', 'CC18_309b', 'CC18_309c', 'CC18_309d', 'CC18_310a',
       'CC18_310b', 'CC18_310c', 'CC18_310d', 'CC18_316', 'CC18_317',
       'CC18_318a', 'CC18_350', 'CC18_401', 'CC18_417a_1', 'CC18_417a_2',
       'CC18_417a_3', 'CC18_417a_4', 'CC18_417a_5', 'CC18_417a_6',
       'CC18_417a_7', 'CC18_417a_8', 'CC18_418a', 'CC18_417b_1', 'CC18_417b_2',
       'CC18_417b_3', 'CC18_417b_4', 'CC18_417b_5', 'CC18_417b_6',
       'CC18_417b_7', 'CC18_417b_8', 'CC18_417b_9', 'CL_matched',
       'CL_voter_status', 'CL_state', 'CL_party', 'CL_2018gvm', 'CL_2018pep',
       'CL_2018pvm']

In [43]:
for column in cat_columns:
    df1[column] = simp_constant.fit_transform(df1[[column]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable

In [44]:
df1[df1[cat_columns].isnull().any(axis=1)]

Unnamed: 0,CC18_300d_1,CC18_300d_2,CC18_300d_3,CC18_300d_4,CC18_300d_5,birthyr,CC18_417c,educ,ideo5,newsint,...,CC18_417b_7,CC18_417b_8,CC18_417b_9,CL_matched,CL_voter_status,CL_state,CL_party,CL_2018gvm,CL_2018pep,CL_2018pvm


In [49]:
# Onehot encoding: 

#     pid3 
#     gender
#     sexuality
#     trans
#     votereg
#     race
#     hispanic
#     employ
#     internethome
#     internetwork
#     votereg_post
#     voteret_f_post
#     CC18_300_1
#     CC18_300_2
#     CC18_300_3
#     CC18_300_4
#     CC18_300_5
#     CC18_300_6
#     CC18_300b
#     CC18_300c
#     CC18_309a
#     CC18_309b
#     CC18_309c
#     CC18_309d
#     CC18_310a
#     CC18_310b
#     CC18_310c
#     CC18_310d
#     CC18_316
#     CC18_317
#     CC18_318a
#     CC18_350
#     CC18_401
#     CC18_417a_1
#     CC18_417a_2
#     CC18_417a_3
#     CC18_417a_4
#     CC18_417a_5
#     CC18_417a_6
#     CC18_417a_7
#     CC18_417a_8
#     CC18_417b_1
#     CC18_417b_2
#     CC18_417b_3
#     CC18_417b_4
#     CC18_417b_5
#     CC18_417b_6
#     CC18_417b_7
#     CC18_417b_8
#     CC18_417b_9
#     CC18_418a
#     CL_matched
#     CL_voter_status
#     CL_state
#     CL_party
#     CL_2018gvm
#     CL_2018pep
#     CL_2018pvm

from sklearn.preprocessing import OneHotEncoder

import numpy as np


print(df1.pid3.unique())


enc = OneHotEncoder(sparse=False,handle_unknown='ignore')

#demographics
df_oh_demo = enc.fit_transform(df1[['pid3', 'gender', 'sexuality', 'trans', 'votereg', 'race', 'hispanic', 'employ', 'internethome', 'internetwork', 'votereg_post', 'votereg_f_post']])
print(df_oh_demo)
print('feature names:',enc.get_feature_names())

#media use
df_oh_media = enc.fit_transform(df1[['CC18_300_1', 'CC18_300_2', 'CC18_300_3', 'CC18_300_4', 'CC18_300_5', 'CC18_300_6', 'CC18_300b', 'CC18_300c']])
print(df_oh_media)
print('feature names:',enc.get_feature_names())

#political knowledge
df_oh_know = enc.fit_transform(df1[['CC18_309a', 'CC18_309b', 'CC18_309c', 'CC18_309d', 'CC18_310a', 'CC18_310b', 'CC18_310c', 'CC18_310d']])
print(df_oh_know)
print('feature names:',enc.get_feature_names())

#voting

df_oh_vote = enc.fit_transform(df1[['CC18_316', 'CC18_317', 'CC18_318a', 'CC18_350', 'CC18_401']])
print(df_oh_vote)
print('feature names:',enc.get_feature_names())

#political activity

df_oh_acts = enc.fit_transform(df1[['CC18_417a_1', 'CC18_417a_2', 'CC18_417a_3', 'CC18_417a_4', 'CC18_417a_5', 'CC18_417a_6', 'CC18_417a_7', 'CC18_417a_8', 'CC18_418a']])
print(df_oh_acts)
print('feature names:',enc.get_feature_names())

#donate
df_oh_donate = enc.fit_transform(df1[['CC18_417b_1', 'CC18_417b_2', 'CC18_417b_3', 'CC18_417b_4', 'CC18_417b_5', 'CC18_417b_6', 'CC18_417b_7', 'CC18_417b_8', 'CC18_417b_9']])
print(df_oh_donate)
print('feature names:',enc.get_feature_names())

#verified voting
df_oh_CLvote = enc.fit_transform(df1[['CL_matched', 'CL_voter_status', 'CL_state', 'CL_party', 'CL_2018gvm', 'CL_2018pep', 'CL_2018pvm']])
print(df_oh_CLvote)
print('feature names:',enc.get_feature_names())




['Independent' 'Republican' 'Democrat' 'Other' 'Not sure' 'missing']
[[0. 1. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 0. 1.]
 ...
 [1. 0. 0. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 0. 1.]]
feature names: ['x0_Democrat' 'x0_Independent' 'x0_Not sure' 'x0_Other' 'x0_Republican'
 'x0_missing' 'x1_Female' 'x1_Male' 'x2_Bisexual' 'x2_Gay man'
 'x2_Heterosexual / straight' 'x2_Lesbian / gay woman' 'x2_Other'
 'x2_Prefer not to say' 'x2_missing' 'x3_No' 'x3_Prefer not to say'
 'x3_Yes' 'x3_missing' "x4_Don't know" 'x4_No' 'x4_Yes' 'x5_Asian'
 'x5_Black' 'x5_Hispanic' 'x5_Middle Eastern' 'x5_Mixed'
 'x5_Native American' 'x5_Other' 'x5_White' 'x6_No' 'x6_Yes' 'x6_missing'
 'x7_Full-time' 'x7_Homemaker' 'x7_Other' 'x7_Part-time'
 'x7_Permanently disabled' 'x7_Retired' 'x7_Student'
 'x7_Temporarily laid off' 'x7_Unemployed' 'x7_missing' 'x8_Broadband'
 'x8_Dial-up' 'x8_None' 'x8_missing' 'x9_Broadband' 'x9_Dial-up' 'x9_None'
 'x9_missing' "x10_Don't know" 'x10_