In [None]:
import math
# Import pandas with alias
import pandas as pd

# Load the census data into a DataFrame, using the first CSV column as the row index
census = pd.read_csv(filepath_or_buffer='census_data.csv', index_col=0)

In [None]:
# Steps 1 & 2
# Preview the first five rows of the census data
print(census.head(n=5))

In [None]:
# Step 3
# Show the data type pandas assigned to each column
column_dtypes = census.dtypes
print(column_dtypes)

In [None]:
# Step 4
# Inspect birth_year: number of distinct values, the distinct values themselves,
# and a frequency table of how often each value occurs
birth_year_raw = census['birth_year']
print(birth_year_raw.nunique())
print(birth_year_raw.unique())
print(pd.crosstab(index=birth_year_raw, columns='count'))

In [None]:
# Step 5
# One birth_year entry is recorded as the string 'missing'. Research shows that
# respondent was born in 1967, so substitute that value (as a string for now;
# Step 6 converts the whole column to integers).
census['birth_year'] = census['birth_year'].replace(to_replace='missing', value='1967')
print(census['birth_year'].unique())

In [None]:
# Step 6
# Cast birth_year to an integer dtype so it can be used in arithmetic
birth_year_as_int = census['birth_year'].astype(int)
census['birth_year'] = birth_year_as_int
print(census['birth_year'].dtype)

In [None]:
# Step 7
# Average birth year, rounded down to a whole year
mean_birth_year = census['birth_year'].mean()
print(math.floor(mean_birth_year))

In [None]:
# Step 8
# Convert higher_tax to an ordered categorical; the list order defines the ranking,
# from strongest disagreement to strongest agreement
agreement_levels = ['strongly disagree', 'disagree', 'neutral', 'agree', 'strongly agree']
census['higher_tax'] = pd.Categorical(
    census['higher_tax'],
    categories=agreement_levels,
    ordered=True,
)
print(census['higher_tax'].dtype)
print(census['higher_tax'].unique())

In [None]:
# Step 9
# Label-encode higher_tax: each response is replaced by the integer position of its category
# NOTE(review): pandas uses code -1 for missing values; confirm the column has none,
# otherwise the median below is pulled downward.
census['higher_tax_codes'] = census['higher_tax'].cat.codes
higher_tax_codes = census['higher_tax_codes']
print(higher_tax_codes.median())
print(higher_tax_codes.dtype)
print(pd.crosstab(index=higher_tax_codes, columns='count'))
# With the category order set in Step 8, code 2 is 'neutral', so a median of 2.0
# means the median response to this Likert-scale question is neutral.

In [None]:
# Keep a duplicate of marital_status: one-hot encoding in Step 10 removes the original
# column, but Step 11 still needs the raw marital_status values.
census['marital_status_copy'] = census['marital_status'].copy()

In [None]:
# Step 10
# One-hot encode marital_status: the column is replaced by one indicator column per category
census = pd.get_dummies(census, columns=['marital_status'])
print(census.head(n=5))
print(census.dtypes)

In [None]:
# Step 11 census['marital_codes']
# Label-encode marital status using the copy saved before one-hot encoding.
# Frequency table of the raw values, to see which categories occur.
print(pd.crosstab(index=census['marital_status_copy'], columns='count'))
# Unordered categorical; each label's position in this list becomes its integer code
# (single=0, married=1, divorced=2, widowed=3).
census['marital_status_copy'] = pd.Categorical(census['marital_status_copy'], ['single', 'married', 'divorced', 'widowed'], ordered=False)
print(census['marital_status_copy'].dtype)
# unique must be *called*; printing the bare attribute only shows the bound-method repr
print(census['marital_status_copy'].unique())
census['marital_codes'] = census['marital_status_copy'].cat.codes
print(pd.crosstab(index=census['marital_codes'], columns='count'))
print(census.head())

In [None]:
# Step 11 census['age_group']
# Create a new variable called age_group, which groups respondents based on their birth year.
# The groups should be in five-year increments, e.g., 25-30, 31-35, etc.
#
# The dataset provides only a birth year, not an age, and the year of data collection is
# unknown. Age should really be the age at the time of collection; lacking that, age is
# computed relative to 2021. This is an assumption: change SURVEY_YEAR if the real
# collection year becomes known.
SURVEY_YEAR = 2021
census['age'] = SURVEY_YEAR - census['birth_year']
print(census.head())
print(pd.crosstab(index=census['age'], columns='count'))
# Observed range when this was written: min age = 14, max age = 81
print(census['age'].min()) # 14
print(census['age'].max()) # 81
# Create bins: edges 10, 15, ..., 85. pd.cut builds right-closed intervals by default,
# i.e. (10, 15], (15, 20], ..., (80, 85].
bins = list(range(10, 90, 5))
print(bins)
census['binned_age'] = pd.cut(census['age'], bins)
# An age outside (10, 85] would silently become NaN here and then be label-encoded as -1
# below; fail loudly instead so the bin edges get widened when the data changes.
assert census['binned_age'].notna().all(), "some ages fall outside the bin edges; widen `bins`"
print(census['binned_age'].dtype)
print(census[['binned_age', 'age']].head())
print(len(census['age']))
print(pd.crosstab(index=census['binned_age'], columns='count'))
print(census.head())
# Then label encode the age_group variable to assist the Census team in the event they would
# like to use machine learning to predict if a respondent thinks the wealthy should pay
# higher taxes based on their age group.
print(census['binned_age'].dtype)
print(census['binned_age'].unique())
# Each interval is replaced by its integer position among the bins (lowest interval = 0)
census['age_group'] = census['binned_age'].cat.codes
print(census.head())