# Data Carpentry

This notebook goes through the process of ingesting the raw data, renaming columns to human-readable values, adding a couple of flags to be used later, and categorizing responses in logical groupings.

Source of data:

https://www.icpsr.umich.edu/web/ICPSR/studies/37023

# Part I - Initial data ingestion

This section covers ingesting the data, renaming the columns, dropping unnecessary/unused columns, and adding a couple of useful flags.

In [None]:
import pandas as pd
# import pwd, os

# Directory that holds the input files read by this notebook
path = '/dsa/groups/casestudy2021su/group_3'

# ColNames.csv translates the study's variable names (Q1, Q2, etc) into easier-to-understand terms
cnames = pd.read_csv(f'{path}/ColNames.csv', sep=',')

# Mapper dict: study variable name ('Var Name') -> readable column name ('DF Name')
colNames = dict(zip(cnames['Var Name'].tolist(), cnames['DF Name'].tolist()))

In [None]:
# Ingest the tab-separated raw data file
data = pd.read_csv(f'{path}/37023-0003-Data.tsv', sep='\t', engine='python')

In [None]:
# Swap the study's variable names for the readable names held in the mapper dict
data = data.rename(columns=colNames)

In [None]:
# Keep only the columns that appear as values in the colNames mapper; drop everything else
data = data.drop(columns=data.columns.difference(colNames.values()))

In [None]:
# Flag respondents who were eligible to vote (based on age): 1 when age is 18 or more, otherwise 0
data['eligibleToVote'] = (data['age'] >= 18).astype('int64')

In [None]:
# Add Voted binary variable.
# The pyVoted grouping later in this notebook treats 1 as 'neg', 2-4 as 'pos' and 99 as N/A,
# so only codes 2-4 count as having voted. The previous open-ended test (x >= 2) would also
# have flagged a 99 response as a voter.
data['Voted'] = data['pyVoted'].map(lambda x: 1 if 2 <= x <= 4 else 0)

# Part II - Grouping & Categorizing responses

In the sections below, I will be grouping numeric responses within the dataset into easier-to-understand categories, as determined by the group.

In [None]:
# Function used to group responses into named categories
def groupResponses (df, cols, mapperDict, naVal = None, naCat = 'N/A'):
    """
    Groups coded responses into named categories, adding one new column to df per source column.

    @ params:
        df                      - required    : dataframe with columns to be mapped (new columns are added in place)
        cols                    - required    : dictionary containing one or more columns that the groupings apply to along
            with the name of the new column to hold the grouping names as {old: new}.
        mapperDict              - required    : Dictionary containing mapping definition as response: categoryName.
            Each key is an integer representing the greatest value for that categoryName. A category covers every
            integer above the previous key (or from 0 for the smallest key) up to and including its own key.
            For example, {2: 'neg', 3: 'neutral', 5: 'pos'} maps 0-2 to 'neg', 3 to 'neutral' and 4-5 to 'pos'.
            The dictionary doesn't need to be in any particular order
        naVal                   - optional    : Response value(s) (as a list) that should be categorized as naCat.
            These are applied after mapperDict, so they override any range that would otherwise cover them.
        naCat                   - optional    : Label for the N/A Category.  Default is "N/A".
            Every response that is not a whole number covered by the mapping (blank or non-numeric text,
            missing values, fractional numbers, out-of-range codes) also receives this label.

    @ returns:
        None
    """

    # None stands in for "no explicit N/A codes" so the default is not a shared mutable list
    naCodes = [] if naVal is None else naVal

    # Expand the upper-bound keys into a complete code -> category lookup so every value is accounted for
    mDict = {}
    lowerBound = 0
    for upperBound in sorted(mapperDict):
        for code in range(lowerBound, upperBound + 1):
            mDict[code] = mapperDict[upperBound]
        lowerBound = upperBound + 1

    # Explicit N/A codes go in last so they take precedence over the ranges above
    for code in naCodes:
        mDict[code] = naCat

    def _toCode(x):
        """Return x as an int response code, or None when x does not represent a whole number."""
        if isinstance(x, str):
            # Same rule as before for text: only all-digit strings count as codes
            if not x.isdigit():
                return None
            try:
                return int(x)
            except ValueError:
                # str.isdigit() accepts a few characters (e.g. superscripts) that int() rejects
                return None
        try:
            asInt = int(x)
        except (TypeError, ValueError, OverflowError):
            # None / NaN / infinity: not a usable code
            return None
        # Accept 2.0 as code 2, but reject fractional values such as 2.5
        return asInt if asInt == x else None

    def _categorize(x):
        """Look up the category for one response, falling back to naCat."""
        code = _toCode(x)
        if code is None:
            return naCat
        return mDict.get(code, naCat)

    # Some columns arrive as strings (objects), some as ints and some as floats (when they hold
    # missing values); _categorize normalises all of them before the lookup.
    for oldCol, newCol in cols.items():
        df[newCol] = df[oldCol].map(_categorize)
    

In [None]:
# Columns whose responses can all be grouped using the same range of values
# Grouping logic: 1/2 = neg, 3 = neutral, 4/5 = pos, 99 = N/A
UGcols = [
    'politicsInfluenceSelf',
    'politicsInfluenceLocal',
    'politicsFunImportance',
    'politicsEngagementOccassional',
    'politicsInvovledOnlyImportant',
    'peopleTrusted',
    'peopleTakeAdvantage',
    'peopleFair',
    'newsUnbiased',
    'newsAccurate',
    'newsTrustworthy',
    'selfUnderstandPolitics',
    'selfMoreInvolvedPolitics',
    'selfPoliticsTooComplicated',
    'selfInternetFindPoliticalInfo',
    'selfInternetDiscussPolitics',
    'selfInternetExpressConcern',
    'govtHelpVulnerable',
    'govtBusinessRegulation',
    'govtPoorDependence',
    'immigrationThreatenTradValues',
    'gayMarriage',
    'abortionLegal',
    'protestNormal',
    'protestEffective',
    'protestNotRepresentative',
    'protestIllegal',
]

# Each grouped result is written to "<column>_Groups"
cols = {name: name + '_Groups' for name in UGcols}

# Keys are the highest code in each category: 1-2 = neg, 3 = neutral, 4-5 = pos
groups = {2: 'neg', 3: 'neutral', 5: 'pos'}

groupResponses(data, cols, groups, naVal=[99], naCat='N/A')

In [None]:
# Columns whose responses can all be grouped using the same range of values
# Grouping logic: 1 = neg, 2/3/4 = pos, 99 = N/A
UGcols1 = [
    'pyRaisedMoneyForCharity',
    'pyDonatedMoneyForCause',
    'pySignedUpForCauseInfo',
    'pyContacedPolitican',
    'pyCommentedNews',
    'pyVoted',
    'pyEncourageVote',
    'pyDisplayedCauseSymbol',
    'pyBoughtForCause',
    'pyBoycottedForCause',
    'pySignedPetition',
    'pyAttendedRally',
    'pyDiscussedPoliticsFriendsFamily',
]

# Each grouped result is written to "<column>_Groups"
cols1 = {name: name + '_Groups' for name in UGcols1}

# Keys are the highest code in each category: 1 = neg, 2-4 = pos
groups = {1: 'neg', 4: 'pos'}

groupResponses(data, cols1, groups, naVal=[99], naCat='N/A')

In [None]:
# Columns whose responses can all be grouped using the same range of values
# Grouping logic: 1/2/3 = pos, 4 = neg, 9 = NotMe, 99 = N/A
UGcols2 = [
    'videoGamePlayingGamer',
    'videoGamePlayingHelpOthers',
    'videoGamePlayingLearnSocietyProblems',
    'videoGamePlayingMoralEthicalIssues',
]

# Each grouped result is written to "<column>_Groups"
cols2 = {name: name + '_Groups' for name in UGcols2}

# Keys are the highest code in each category: 1-3 = pos, 4 = neg.
# Because groupResponses fills every gap below a key, codes 5-9 all receive 'NotMe'.
groups2 = {3: 'pos', 4: 'neg', 9: 'NotMe'}

groupResponses(data, cols2, groups2, naVal=[99], naCat='N/A')

In [None]:
# Columns whose responses can all be grouped using the same range of values
# Grouping logic: 1/2 = pos, 3 = neg, anything else = N/A
UGcols3 = [
    'groupPoliticalIssueInternetRole',
    'groupSocialIssueInternetRole',
    'groupLocalCommunityInternetRole',
    'groupPoliticanInternetRole',
    'groupElectionCampaignInternetRole',
    'internetUseSearchEngine',
    'internetUseVideos',
    'internetUseTVShows',
    'internetUseBoughtThings',
    'internetUseTravel',
    'internetUseHealthStandard',
    'internetUseHealthDifficult',
    'internetUseSocialNetwork',
    'internetUseShareOwnWork',
    'internetUseShareOtherWork',
    'internetUseRemixFoundWork',
    'internetUsePayFine',
    'internetUseGovernmentBenefits',
    'internetUseGovernmentForms',
]

# Each grouped result is written to "<column>_Groups"
cols3 = {name: name + '_Groups' for name in UGcols3}

# Keys are the highest code in each category: 1-2 = pos, 3 = neg
groups = {2: 'pos', 3: 'neg'}

groupResponses(data, cols3, groups, naCat='N/A')

In [None]:
# Education column: every code gets its own label (see the mapping below)
UGcols4 = ['USAEducation']

# The grouped result is written to "<column>_Groups"
cols4 = {name: name + '_Groups' for name in UGcols4}

# Code -> label as implemented here:
#   1 = InHighSchool, 2 = HighSchoolGrad, 3 = NotHighSchoolGrad, 4 = SomeCollege, 5 = 2YearCollege,
#   6 = 4YearCollege, 7 = Masters, 8 = Doctoral, 9 = Professional(JD/MD), 10 = Other, 99 = IDK
# Because groupResponses fills every gap below a key, codes 11-98 would also be labelled 'IDK'.
# NOTE(review): earlier notes on this cell listed a different order (2 = did not complete,
# 3 = SomeCollege, ... 9 = Other) - confirm these labels against the study codebook.
groups = {
    1: 'InHighSchool',
    2: 'HighSchoolGrad',
    3: 'NotHighSchoolGrad',
    4: 'SomeCollege',
    5: '2YearCollege',
    6: '4YearCollege',
    7: 'Masters',
    8: 'Doctoral',
    9: 'Professional(JD/MD)',
    10: 'Other',
    99: 'IDK',
}

groupResponses(data, cols4, groups, naCat='N/A')

In [None]:
# Political party column: every code gets its own label
# Grouping logic: 1 = democrat, 2 = republican, 3 = independent, 4 = something else, 5 = none, 99 = don't know
UGcols5 = ['USAPoliticalParty']

# The grouped result is written to "<column>_Groups"
cols5 = {name: name + '_Groups' for name in UGcols5}

# 1 = Dem, 2 = Rep, 3 = Ind, 4 = SomethingElse, 5 = None, 99 = IDK
# Because groupResponses fills every gap below a key, codes 6-98 would also be labelled 'IDK'.
groups = {
    1: 'Dem',
    2: 'Rep',
    3: 'Ind',
    4: 'SomethingElse',
    5: 'None',
    99: 'IDK',
}

groupResponses(data, cols5, groups, naCat='N/A')

In [None]:
# Columns whose responses can all be grouped using the same range of values
# Grouping logic: 1/2 = neg, 3/4 = pos, anything else = N/A
UGcols6 = [
    'localNewsInterest',
    'nationalNewsInterest',
    'politicalNewsInterest',
    'internationalNewsInterest',
    'facebookLearnEvents',
    'facebookFollowPoliticalLinks',
    'facebookLearnEventsBeforeNews',
    'facebookLearnEventsDeeper',
    'facebookLearnOpposingViewpoints',
    'facebookPostPoliticalLinks',
    'facebookPostPoliticalOpinions',
    'facebookEncouragePoliticalAction',
    'facebookEncourageVote',
    'facebookRepostPoliticalMaterial',
    'facebookLikePoliticalMaterial',
    'twitterUseReadNewsPolitics',
    'twitterUseShareNewsPolitics',
    'twitterUseDiscussNewsPolitics',
]

# Each grouped result is written to "<column>_Groups"
cols6 = {name: name + '_Groups' for name in UGcols6}

# Keys are the highest code in each category: 1-2 = neg, 3-4 = pos
groups = {2: 'neg', 4: 'pos'}

groupResponses(data, cols6, groups, naCat='N/A')

In [None]:
# Columns whose responses can all be grouped using the same range of values
# Grouping logic: 1/2 = neg, 3/4 = pos, 9 = never, anything else = N/A
UGcols7 = [
    'USAPoliticalNewsCableTV',
    'USAPoliticalNewsSourceRadio',
    'USAPoliticalNewsSourceDailyNewspapers',
    'USAPoliticalNewsSourceWeeklyMagazines',
    'USAPoliticalNewsSourceNetworkTV',
    'USAPoliticalNewsSourceLocalTV',
    'USAPoliticalNewsSourceLateNightTV',
    'USAPoliticalNewsSourceComedyTV',
    'USAPoliticalNewsSourceSocialMedia',
]

# Each grouped result is written to "<column>_Groups"
cols7 = {name: name + '_Groups' for name in UGcols7}

# Keys are the highest code in each category: 1-2 = neg, 3-4 = pos.
# Because groupResponses fills every gap below a key, codes 5-9 all receive 'never'.
groups = {2: 'neg', 4: 'pos', 9: 'never'}

groupResponses(data, cols7, groups, naCat='N/A')

In [None]:
# Political interest column
# Grouping logic: 1 = NoInterest, 2/3 = LowInterest, 4/5 = HighInterest, 99 = N/A
UGcols8 = ['politicalInterest']

# The grouped result is written to "<column>_Groups"
cols8 = {name: name + '_Groups' for name in UGcols8}

# Keys are the highest code in each category: 1 = NoInterest, 2-3 = LowInterest, 4-5 = HighInterest
groups2 = {1: 'NoInterest', 3: 'LowInterest', 5: 'HighInterest'}

groupResponses(data, cols8, groups2, naVal=[99], naCat='N/A')

In [None]:
# Most frequently played video game type
# Grouping Logic = 1- multi-player, 2- co-operative, 3- single-player, 99- N/A

UGcols9 = ['videoGameTypeMostFrequent']

cols9 = {col: col+'_Groups' for col in UGcols9}

# One key per response code, matching the grouping logic stated above.
# The keys used to be 1/3/5 (the politicalInterest scale), which labelled code 3 as
# 'co-operative' and left 'single-player' reachable only through codes 4-5.
# NOTE(review): the 1/2/3 coding is taken from the comment at the top of this cell -
# confirm against the study codebook.
groups2 = {1: 'multi-player', 2: 'co-operative', 3: 'single-player'}

groupResponses (df = data,
                cols = cols9,
                mapperDict = groups2,
                naVal = [99],
                naCat = 'N/A')

In [None]:
# Columns whose responses can all be grouped using the same range of values
# Grouping logic: 1/2 = neg, 3 = neutral, 4/5 = pos, 9 = IDK, 99 = N/A
UGcols10 = [
    'facebookLearnOthersInterests',
    'facebookUnderstandSociety',
    'facebookDailyRoutine',
    'facebookOutOfTouchIfNotUsed',
    'facebookUpsetIfShutdown',
]

# Each grouped result is written to "<column>_Groups"
cols10 = {name: name + '_Groups' for name in UGcols10}

# Keys are the highest code in each category: 1-2 = neg, 3 = neutral, 4-5 = pos.
# Because groupResponses fills every gap below a key, codes 6-9 all receive 'IDK'.
groups = {2: 'neg', 3: 'neutral', 5: 'pos', 9: 'IDK'}

groupResponses(data, cols10, groups, naVal=[99], naCat='N/A')

In [None]:
# Frequency-of-use columns: every code gets its own label
# Grouping logic: 1 = Never, 2 = <1PerMonth, etc.
UGcols11 = [
    'socialMediaUseFacebook',
    'socialMediaUseTwitter',
    'socialMediaUseLinkedIn',
    'socialMediaUseYouTube',
    'socialMediaUseInstagram',
    'socialMediaUsePinterest',
    'socialMediaUseMyspace',
    'socialMediaUseGooglePlus',
    'socialMediaUseFoursquare',
    'socialMediaUseReddit',
    'videoGameUseFrequency',
]

# Each grouped result is written to "<column>_Groups"
cols11 = {name: name + '_Groups' for name in UGcols11}

# 1 = Never, 2 = <1PerMonth, 3 = 1PerMonth, 4 = 2-3PerMonth,
# 5 = 1PerWeek, 6 = 2-3PerWeek, 7 = Daily, 8 = MultipleTimesPerDay
groups = {
    1: 'Never',
    2: '<1PerMonth',
    3: '1PerMonth',
    4: '2-3PerMonth',
    5: '1PerWeek',
    6: '2-3PerWeek',
    7: 'Daily',
    8: 'MultipleTimesPerDay',
}

groupResponses(data, cols11, groups, naCat='N/A')

In [None]:
# Amount-of-use columns: every code gets its own label
# Grouping logic: 1 = NotInLastWeek, 2 = <10Minutes, etc.
UGcols12 = [
    'facebookUseAmount',
    'twitterUseAmount',
]

# Each grouped result is written to "<column>_Groups"
cols12 = {name: name + '_Groups' for name in UGcols12}

# 1 = NotInLastWeek, 2 = <10Minutes, 3 = 10-30Minutes,
# 4 = 31-60Minutes, 5 = 61-90Minutes, 6 = >90Minutes
groups = {
    1: 'NotInLastWeek',
    2: '<10Minutes',
    3: '10-30Minutes',
    4: '31-60Minutes',
    5: '61-90Minutes',
    6: '>90Minutes',
}

groupResponses(data, cols12, groups, naCat='N/A')

In [None]:
# Columns whose responses can all be grouped using the same range of values
# Grouping logic: 1/2 = LowConfidence, 3/4 = HighConfidence, 99 = IDK
UGcols13 = [
    'politiciansDoRight',
    'politiciansCountryBestInterests',
    'politiciansTellTruth',
]

# Each grouped result is written to "<column>_Groups"
cols13 = {name: name + '_Groups' for name in UGcols13}

# Keys are the highest code in each category: 1-2 = LowConfidence, 3-4 = HighConfidence.
# Because groupResponses fills every gap below a key, codes 5-98 would also be labelled 'IDK'.
groups = {2: 'LowConfidence', 4: 'HighConfidence', 99: 'IDK'}

groupResponses(data, cols13, groups, naCat='N/A')

In [None]:
# Discussion-frequency columns: every code gets its own label
# Grouping logic: 1 = EveryDay, 2 = 3-4PerWeek, etc.
UGcols14 = [
    'discussPoliticsFriends',
    'discussPoliticsFamily',
    'discussPoliticsOthers',
]

# Each grouped result is written to "<column>_Groups"
cols14 = {name: name + '_Groups' for name in UGcols14}

# 1 = EveryDay, 2 = 3-4PerWeek, 3 = 1-2PerWeek,
# 4 = 1-2PerMonth, 5 = Rarely, 6 = Never
groups = {
    1: 'EveryDay',
    2: '3-4PerWeek',
    3: '1-2PerWeek',
    4: '1-2PerMonth',
    5: 'Rarely',
    6: 'Never',
}

groupResponses(data, cols14, groups, naCat='N/A')

In [None]:
# Area type column: every code gets its own label
# Grouping logic: 1 = City, 2 = Suburban, 3 = SmallTown, 4 = Rural
UGcols15 = ['USAAreaType']

# The grouped result is written to "<column>_Groups"
cols15 = {name: name + '_Groups' for name in UGcols15}

groups = {
    1: 'City',
    2: 'Suburban',
    3: 'SmallTown',
    4: 'Rural',
}

groupResponses(data, cols15, groups, naCat='N/A')

In [None]:
# Gender column
# Grouping logic: 1 = Male, 2 = Female, anything else = N/A
UGcols16 = ['gender']

# The grouped result is written to "<column>_Groups"
cols16 = {name: name + '_Groups' for name in UGcols16}

groups = {
    1: 'Male',
    2: 'Female',
}

groupResponses(data, cols16, groups, naCat='N/A')

In [None]:
# Language spoken at home column
# Grouping logic: 1 = Other, 2 = English, 99 = WontSay
UGcols17 = ['langSpokenHome']

# The grouped result is written to "<column>_Groups"
cols17 = {name: name + '_Groups' for name in UGcols17}

# Because groupResponses fills every gap below a key, codes 3-98 would also be labelled 'WontSay'.
groups = {
    1: 'Other',
    2: 'English',
    99: 'WontSay',
}

groupResponses(data, cols17, groups, naCat='N/A')

In [None]:
# Internet access setting column: every code gets its own label
# Grouping logic: 1 = Cafe/Kiosk, 2 = Library, 3 = Home, etc.
UGcols18 = ['internetAccessSetting']

# The grouped result is written to "<column>_Groups"
cols18 = {name: name + '_Groups' for name in UGcols18}

# 1 = Cafe/Kiosk, 2 = Library, 3 = Home, 4 = OthersHome, 5 = Work,
# 6 = School, 7 = MobilePhone, 8 = Laptop, 9 = Other
groups = {
    1: 'Cafe/Kiosk',
    2: 'Library',
    3: 'Home',
    4: 'OthersHome',
    5: 'Work',
    6: 'School',
    7: 'MobilePhone',
    8: 'Laptop',
    9: 'Other',
}

groupResponses(data, cols18, groups, naCat='N/A')

In [None]:
### SPECIAL CASE NOTE: the following columns would normally be bool columns.
### However, many of the responses in these columns are NaN, which needs to be corrected.
### This is because these questions were conditional: they were only asked if the respondent
### answered in the affirmative on the previous question; otherwise they were left blank.

# Grouping logic: 1 = Yes, 2 = No, 99 = WontSay (blank responses become N/A)
UGcols19 = [
    'videoGameTypeMultiplayerPVP',
    'videoGameTypeMultiplayerCoop',
    'videoGameTypeSingleplayer',
    'groupFocusLocal',
    'groupFocusNational',
    'groupFocusInternational',
]

# Each grouped result is written to "<column>_Groups"
cols19 = {name: name + '_Groups' for name in UGcols19}

# Because groupResponses fills every gap below a key, codes 3-98 would also be labelled 'WontSay'.
groups = {
    1: 'Yes',
    2: 'No',
    99: 'WontSay',
}

groupResponses(data, cols19, groups, naCat='N/A')

In [None]:
# Collect every ungrouped source column (for removal later), in the order the groups were defined
UnGroupedcolumns = [
    name
    for group in (UGcols, UGcols1, UGcols2, UGcols3, UGcols4, UGcols5, UGcols6,
                  UGcols7, UGcols8, UGcols9, UGcols10, UGcols11, UGcols12, UGcols13,
                  UGcols14, UGcols15, UGcols16, UGcols17, UGcols18, UGcols19)
    for name in group
]

# Matching names of the grouped columns that groupResponses created
Groupedcolumns = [name + '_Groups' for name in UnGroupedcolumns]

# Part III - Slicing data into separate data frames

Now that we have all the responses, we're going to produce a few different dataframes that slice the data in different ways.  Each one has a specific role to play in our future analysis.

In [None]:
# Build the grouped frame: the original ungrouped columns are dropped, their *_Groups versions stay
Groupeddata = data.drop(columns=UnGroupedcolumns)

# Use the eligibleToVote flag to remove respondents under 18 from the dataset
# (per the original author's note, this only removes 3 observations)
Groupeddata = Groupeddata[Groupeddata['eligibleToVote'].eq(1)]

In [None]:
!pip install joblib

In [None]:
import joblib

# Save the grouped frame to Groupeddata.pkl in the current working directory
joblib.dump(value=Groupeddata, filename='Groupeddata.pkl')

# To reload it later (kept commented out so the existing `data` object is not overwritten):
###data = joblib.load('Groupeddata.pkl')

In this section, I create lists that are used to eliminate variables that are unusable for later predictive analysis.

In [None]:
##Note expand column names

#function for determining the difference between two lists
def Diff(li1, li2):
    """Return the items found in exactly one of the two lists (li1's items first, then li2's).

    Order and duplicates are preserved; an item is dropped only when it occurs in both lists.
    """
    combined = li1 + li2
    return [item for item in combined if not (item in li1 and item in li2)]

# Step 1: list the columns currently in the grouped data set
currentcolumns = list(Groupeddata.columns)

# Step 2: rebuild the "<column>_Groups" header names from UnGroupedcolumns
# (the list that was used to remove the ungrouped columns from the original dataset)
Groupedcolumns = [f'{name}_Groups' for name in UnGroupedcolumns]

# Step 3: the current columns that are NOT grouped columns.
# NOTE(review): Diff() keeps items found in only one of its two lists, so a name that appears
# only in the second list would be kept as well - this relies on the second list being a subset.
noGroupedcolumns1 = Diff(currentcolumns, Groupedcolumns)

# Normalise every entry to str for later use
noGroupedcolumns = list(map(str, noGroupedcolumns1))

# Step 4: since we are focusing on the US respondents, remove the columns relating to the
# UK and AUS. Start from the substrings to search for within the column headings...
foreigncolumns = ['UK', 'AUS']

# ...then replace that list with the actual column names containing either substring
foreigncolumns = [name for name in noGroupedcolumns for marker in foreigncolumns if marker in name]

# Remove the UK and AUS columns
noGroupedForeigncolumns1 = Diff(noGroupedcolumns, foreigncolumns)

# Again, normalise every entry to str
noGroupedForeigncolumns = list(map(str, noGroupedForeigncolumns1))

# Step 5: columns marked for removal after manual review.
# Each of these is a manual-entry field and is not suitable for predictive analysis without
# significant transformation and assumptions about the intended response.
manualentrycolumns = [
    'politicalSocialIssue1',
    'politicalSocialIssue2',
    'politicalSocialIssue3',
    'USAPoliticalPartyOther',
    'langSpokenHomeOther',
    'USAOtherEthnicityOther',
    'USAEducationOther',
    'internetAccessOther',
    'internetAccessSettingOther',
]

# Remove the manual-entry / non-binary columns, leaving only the binary columns
binarycolumns1 = Diff(noGroupedForeigncolumns, manualentrycolumns)

# Again, normalise every entry to str
binarycolumns = list(map(str, binarycolumns1))

# Step 6: combine the remaining binary columns with the grouped columns
finalcolumns = [*binarycolumns, *Groupedcolumns]





In [None]:
# Show how many column names ended up in finalcolumns (binary columns + grouped columns)
print(len(finalcolumns))

In [None]:
# Data for one-hot encoding.
# The frame contains only binary or grouped columns, ready for one-hot/dummy encoding.
# .copy() makes it an independent frame: later cells assign converted columns back into
# OneHotdata, which would otherwise raise pandas' SettingWithCopyWarning on a slice of Groupeddata.
OneHotdata = Groupeddata[finalcolumns].copy()

In [None]:
# Cast every grouped column to the pandas 'category' dtype
OneHotdata = OneHotdata.astype({name: 'category' for name in Groupedcolumns})

In [None]:
# Binary variables that will be converted to bool.
# Age is the only non-binary variable remaining in the binarycolumns list,
# so this list names the binary columns explicitly, without it.
noAgebinarycolumns = [
    'internetAccessCafeKiosk',
    'internetAccessLibrary',
    'internetAccessOwnHome',
    'internetAccessOtherHome',
    'internetAccessWork',
    'internetAccessSchool',
    'internetAccessMobilePhone',
    'internetAccessLaptopTablet',
    'internetAccessSomewhereElse',
    'internetAccessNone',
    'groupPoliticalIssue',
    'groupSocialIssue',
    'groupLocalCommunity',
    'groupPolitican',
    'groupElectionCampaign',
    'USAAsianPacificIslander',
    'USABlack',
    'USAWhite',
    'USAHispanic',
    'USALatino',
    'USAMiddleEastern',
    'USAOtherEthnicity',
    'eligibleToVote',
    'Voted',
]


In [None]:
# Cast the binary columns to the nullable 'boolean' dtype.
# Note: depending on the version of pandas in the container, 'Bool' may be needed instead of 'boolean':
# OneHotdata[noAgebinarycolumns]=OneHotdata[noAgebinarycolumns].astype('Bool')
OneHotdata = OneHotdata.astype({name: 'boolean' for name in noAgebinarycolumns})

In [None]:
import joblib

# Save the one-hot-ready frame to OneHotdata.pkl in the current working directory
joblib.dump(value=OneHotdata, filename='OneHotdata.pkl')

# To reload it later:
#OneHotdata = joblib.load('OneHotdata.pkl')

In [None]:
#will need for one-hot encoding
import joblib

# Save the grouped column names (needed for one-hot encoding) to Groupedcolumns.pkl.
# NOTE(review): the list is wrapped in another list, so loading yields [[...names...]] -
# confirm the consumer expects that nesting.
joblib.dump(value=[Groupedcolumns], filename='Groupedcolumns.pkl')

# To reload it later:
#OHcolumns = joblib.load('Groupedcolumns.pkl')