# Demographics

This notebook contains information about the demographics of the respondents in the survey (age, gender, ethnicity, etc.).

In [None]:
# Load pre-processed data
!pip install joblib
import joblib

import plotly.express as px

import pandas as pd
import numpy as np
from math import floor, ceil, log

# Load the pre-processed survey responses; the rest of the notebook uses `data` as a pandas DataFrame.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code -- only load a
# pickle that comes from a trusted source.  The path is resolved relative to the working directory.
data = joblib.load('GroupedAndUngroupedData.pkl')

Function used to easily generate histograms

In [None]:
def ShowHistogram(data, col, sortorder='ascending', sorttype='category', **kwargs):
    """
    Display a histogram for one column of the dataframe and print its percentage breakdowns.

    @ params:
        data                        -  dataframe
        col                         -  column name within dataframe
        sortorder                   -  'ascending' or 'descending' (default is ascending).  Any value other than
                                       'ascending' sorts the rows in descending order.
        sorttype                    -  sort the x axis either by category name ('category') or totals ('total').
                                       Default is category.
        **kwargs                    -  optional additional arguments to pass into the histogram call. For example,
                                       adding facet_col = 'gender_Groups' will additionally facet the histogram
                                       by gender.
    @ returns:
        None.  The figure is rendered with .show() and the distributions are printed.
    """

    def _strip_label_prefix(text):
        # Labels may arrive as "colname=value" or as a bare "value"; keep what follows the first '='
        # (or the whole text when there is no '='), so a missing prefix no longer raises IndexError.
        return text.split('=', 1)[-1]

    ascending = sortorder == 'ascending'
    facet_col = kwargs.get('facet_col')

    # If we're faceting by something, we also sort by the facet column so the results are displayed consistently
    sort_cols = [col] if facet_col is None else [col, facet_col]
    data = data.sort_values(sort_cols, ascending=ascending)

    # Our preprocessed data has columns representing the responses and then our grouped / translated versions of
    # those responses as "{columnName}_Groups".  However, not every column was grouped (age, for example).
    # If the column we're plotting has a _Groups equivalent, those values label the x axis and the colors.
    # Otherwise, we just use the data as-is.
    group_col = f'{col}_Groups'
    has_groups = group_col in data.columns
    tickvals = data[col]
    ticktext = data[group_col] if has_groups else data[col]
    vals = sorted(set(ticktext))

    # To help keep the colors consistent between histograms, map each unique value to a color of the
    # colors.qualitative.G10 palette (in Plotly).  This only works when there are no more unique values than
    # palette entries, so with more values we skip the mapping and let Plotly choose.
    palette = px.colors.qualitative.G10
    color_args = {}
    if vals and len(vals) <= len(palette):
        color_args['color_discrete_map'] = dict(zip(vals, palette))

    p = px.histogram(data, x=col, color=ticktext, histnorm='density', **color_args, **kwargs)

    # Remove the "colname=" junk from the legends
    p = p.for_each_trace(lambda t: t.update(name=_strip_label_prefix(t.name)))

    # If the responses were grouped 1:1, then we can use the raw responses on the x-axis but display their
    # translated names.  Only one (value, label) pair per distinct response is sent to the figure rather
    # than one per row.
    if len(set(tickvals)) == len(set(ticktext)):
        first_seen = ~tickvals.duplicated().to_numpy()
        p.update_layout(xaxis=dict(tickmode='array',
                                   tickvals=tickvals.to_numpy()[first_seen].tolist(),
                                   ticktext=ticktext.to_numpy()[first_seen].tolist()))

    # Remove the "colname=" junk from the facet labels
    if facet_col is not None:
        p = p.for_each_annotation(lambda a: a.update(text=_strip_label_prefix(a.text)))

    # Sort the x axis in the specified order
    p = p.update_xaxes(type='category', categoryorder=f'{sorttype} {sortorder}')
    p.show()

    if has_groups:
        print ('Grouped Columns Distribution:\n')
        print (data[group_col].value_counts(normalize=True)*100)
        print ('\n')
    print ('Ungrouped Columns Distribution:\n')
    print (data[f'{col}'].value_counts(normalize=True)*100)

# Age distribution

In [None]:
# Raw (ungrouped) age distribution
col = 'age'
ShowHistogram(data=data, col=col)

We can see there are four roughly equal-sized groupings:
<ul>
    <li>~27% were aged 16-19</li>
    <li>~27% were aged 20-23</li>
    <li>~22% were aged 24-26</li>
    <li>~24% were aged 27-29</li>
</ul>
These groupings fortunately lend themselves to some semantic labels:
Late Teens, Early 20s, Mid 20s, and Late 20s.

In [None]:
# Here we'll create the groupings based on the above four groups we identified.
# Each interval is open on the left and closed on the right, i.e. (15, 19] covers ages 16-19.
bins = pd.IntervalIndex.from_tuples([(15,19), (19,23), (23,26), (26,29)])
binlabels = ['LateTeens', 'Early20s', 'Mid20s', 'Late20s']

# rename_categories swaps the interval categories for the labels, in order.  (Assigning to
# `.categories` directly is no longer supported by current pandas releases.)
age_Groups = pd.cut(data.age.to_list(), bins=bins).rename_categories(binlabels)
data['age_Groups'] = age_Groups

# Store the groups as plain strings; ages that fall outside every bin become the string 'nan'.
data['age_Groups'] = data['age_Groups'].astype(str)

In [None]:
# Revised age histogram showing groups.
# The column is named explicitly so this cell does not depend on whichever value `col` last held.
ShowHistogram(data, 'age')

In [None]:
# Age groups faceted by gender.
# The column is named explicitly so this cell does not depend on whichever value `col` last held.
ShowHistogram(data, 'age', facet_col='gender_Groups')

This shows us that the majority of female respondents were in their late teens, while the majority of male respondents
were in their mid-to-late 20s.  This may need to be factored into our multivariate analysis.

# Gender distribution

Roughly equal distribution of males vs females, slightly favoring females.

In [None]:
# Gender distribution of respondents: roughly 53% female / 47% male
col = 'gender'
ShowHistogram(data, col, sortorder='descending')

# Ethnicity

The vast majority of respondents (~70%) identified as White.  Black was the next most common at ~11%.

In [None]:
# The ethnicity data is broken out among a few binary columns, so first we'll combine all those to form groups
ethnicity_cols = ['USAAsianPacificIslander', 'USABlack', 'USAWhite', 'USAHispanic', 'USALatino', 'USAMiddleEastern', 'USAOtherEthnicity']

# idxmax(axis=1) gives, for each respondent, the name of the first column holding that row's largest value.
# (np.argmax on a row returns an integer position in current pandas, which has no .replace method.)
# The 'USA' prefix is then stripped from the column name to form the group label.
# NOTE(review): a respondent with none of these columns set would be assigned to the first column in the
# list -- confirm that case does not occur in the data.
data['ethnicity_Groups'] = data[ethnicity_cols].idxmax(axis=1).str.replace('USA', '', regex=False)


In [None]:
# Ethnicity groups, with the x axis ordered by group total
ShowHistogram(data, col='ethnicity_Groups', sorttype='total')

# Area Type (rural, small town, suburban, or city)

Most respondents live in suburban areas or cities.

In [None]:
# Respondents were asked which of the listed options best describes the area in which they live.
# The x axis is ordered by total.
col = 'USAAreaType'
ShowHistogram(data, col, sortorder='ascending', sorttype='total')

# Education level

Over half (~55%) of respondents either completed a 4-year college or have some college education.

In [None]:
# Education level, shown in category order
col = 'USAEducation'
ShowHistogram(data=data, col=col)

# Language spoken at home

Perhaps unsurprisingly, the vast majority of respondents (~85%) speak English at home.

In [None]:
# Language spoken at home, categories in descending order
col = 'langSpokenHome'
ShowHistogram(data, col, sortorder='descending')

# Access to the Internet

Respondents were asked in which setting they most frequently accessed the internet.

The majority (~63%) indicated that their home was the most common setting.

In [None]:
# Most frequent internet access setting, with the x axis ordered by total
col = 'internetAccessSetting'
ShowHistogram(data, col, sortorder='ascending', sorttype='total')

Respondents were additionally asked which setting(s) they accessed the internet from in the past week.  These responses
seem to echo the most frequent settings.

In [None]:
# Respondents could indicate several settings, and each setting was stored as its own binary column.
# Summing a column therefore gives the number of responses for that setting.
internetAccess_cols = [
    'internetAccessCafeKiosk',
    'internetAccessLibrary',
    'internetAccessOwnHome',
    'internetAccessOtherHome',
    'internetAccessWork',
    'internetAccessSchool',
    'internetAccessMobilePhone',
    'internetAccessLaptopTablet',
    'internetAccessSomewhereElse',
    'internetAccessNone',
]

# One total per setting, keyed by column name
dic = {setting: data[setting].sum() for setting in internetAccess_cols}

# One row per setting: the total, plus the setting name as a regular column so it can be used as a plot axis
iAccessData = pd.DataFrame.from_dict(dic, orient='index', columns=['Total'])
iAccessData = iAccessData.assign(PastWeekAccessLocation=iAccessData.index)

In [None]:
# Bar chart of the past-week totals, with the settings ordered from the smallest to the largest total
p = px.bar(iAccessData, x='PastWeekAccessLocation', y='Total').update_xaxes(
    type='category',
    categoryorder='total ascending',
)
p.show()