# Mike Babb
# babbm@uw.edu
# January 22, 2019
# CSSCR, Introduction to Python

## Jupyter Notebook Tips

In [None]:
# command mode: blue highlight
# edit mode: green highlight
# press H in command mode to bring up the keyboard shortcut menu
# press "ctrl+enter" at the same time to run the cell
# press the "down arrow" key to go to the cell below
# press the "up arrow" key to go to the cell above
# press "enter" to activate a cell


# comments begin with a "#" character.

## hello world

In [None]:
#'Hello world' example

In [None]:
print('hello world')

## working with numbers

In [None]:
# numbers: integers (whole numbers, no decimals)
x = 1

In [None]:
x

In [None]:
# numbers: floats (floating point numbers, with decimals)
x = 10.0

In [None]:
x

In [None]:
# addition
y = 3

In [None]:
z = x + y

In [None]:
z

In [None]:
# division
x / 2

In [None]:
x

## working with strings

In [None]:
# strings: (alpha-numeric characters surrounded by single or double quotation marks)
x = 'geography'

In [None]:
y = "geography"

In [None]:
# test for equality
x==y

In [None]:
# python is case sensitive for variable values and variable names
y = 'Geography'

In [None]:
x==y

In [None]:
Y='geography'

In [None]:
y==Y

In [None]:
# can we add strings?
z = x + y

In [None]:
# yes...
z

In [None]:
# strings have many properties and methods!
# please see this page for more information
# https://docs.python.org/3/library/stdtypes.html

In [None]:
# For example, determining the number of characters in the string. This is called the length of the string.
# we do this by calling the len() function.
len(x)

In [None]:
# In some sense, strings can be thought of as a list of characters
# we can create a list of characters by entering the following
string_list = list(x)

In [None]:
string_list

In [None]:
# calling the list function breaks up the string!
# we can call the same len() function to determine the length of our list.
# It's the same as above
len(string_list)

In [None]:
# this indicates we have a list nine items long

In [None]:
# an easy way to tell what type of object we are working with
type(string_list)

In [None]:
type(x)

In [None]:
# we can print each character in our list if we want
for character in string_list:
    print(character)

In [None]:
# or access individual items in our list

In [None]:
string_list[0] # the zeroeth character. notice the square brackets

In [None]:
string_list[-1] # the last character

In [None]:
string_list[8] # same as above

## working with data

In [None]:
# So far we've entered data and values directly into a Jupyter Notebook cell.
# What if we want to read data that exist elsewhere?
# let's read in data pertaining to sex by age groups for places in Washington state during the 2013-2017 time period.
# These data come from the ACS and were downloaded from American Fact Finder
# https://factfinder.census.gov/faces/nav/jsf/pages/index.xhtml

# we're working with Census Designated Places
# https://www.census.gov/geo/reference/gtc/gtc_place.html

In [None]:
# we'll need three python libraries to help with this: os, pandas, and numpy

In [None]:
import os # operating system
import pandas as pd # pd is just an alias for pandas
import numpy as np # same with np

# specify your netid

In [None]:
netid = 'babbm'

In [None]:
# First, we're going to create a string variable with the value of the directory we're working with
# note: only the first piece may start with a slash. os.path.join treats any later piece
# that starts with a slash as an absolute path and throws away the pieces before it
# (which would silently drop the 'users' and netid folders).
file_path = os.path.join('C:/users', netid, 'intro_to_python-master', 'intro_to_python', 'aff_download')

In [None]:
# let's check it
file_path

In [None]:
# specify the name of the file
file_name = 'ACS_17_5YR_S0101_with_ann.csv'

In [None]:
# join the file path and file name together
file_path_name = os.path.join(file_path, file_name)
file_path_name

### reading data using python's native file methods.

In [None]:
# we're going to read our data by creating a file object with the following command
data_file = open(file_path_name, 'r')

In [None]:
# let's print each line of our file
for line in data_file:
    print(line)

In [None]:
# lots of data.
# let's close our file, for now
data_file.close()

### reading data using pandas

In [None]:
# we know our data are rectangular and parsing that information by hand is going to take too much time
# pandas to the rescue!
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
# the dtype=str is telling pandas to read all of our data in as strings
# (we use python's built-in str; the old np.str alias has been removed from recent versions of numpy)
# this is necessary due to how AFF data are delivered to the end user.
df = pd.read_csv(filepath_or_buffer=file_path_name, sep=',', header=0, dtype=str)

In [None]:
df.head()

In [None]:
# there is a lot going on here and it looks like the first line in our data file features the
# stats-software-friendly names of the columns and the second line features descriptive names
# This information is repeated in the ACS_17_5YR_S0101_metadata.csv file as well.
# Let's view that in a separate notebook

In [None]:
# we'll access our data by integer location
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.iloc.html
# dataframes are zero-indexed. 
# we'll get lines 1 through the end
df = df.iloc[1:]

In [None]:
df.head()

### selecting and renaming columns

In [None]:
# 459 columns. that's a lot.
# because we are working with survey data, we have the estimate (est), and the margin of error (moe)
# The metadata describes each one. 

In [None]:
# it looks like we have a combination of upper and lower case column names.
# let's convert everything to lower case to make things easier to work with

In [None]:
# get our column names
col_names = df.columns.tolist()

In [None]:
col_names

In [None]:
# let's use a loop and during each pass through we'll convert the names to lower case and append items to our list
new_col_names = [] # empty list to hold our output
for cn in col_names:
    new_cn = cn.lower()
    new_col_names.append(new_cn)

In [None]:
new_col_names

In [None]:
# better.
# another way to make lower case names is through list comprehension.
# https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions
# one of python's more effective techniques
# while the outcome is equivalent, it's often more tidy (and frequently faster)
new_col_names = [cn.lower() for cn in col_names]

In [None]:
df.columns = new_col_names

In [None]:
df.head()

In [None]:
# We're producing the sex ratio.
# We probably do not need all 459 columns.
# Let's consult our metadata document to see if we can trim down our list of variables

In [None]:
# let's keep the following:
# place identification variables, total population, total males, and total females
col_names = ['geo.id', 'geo.id2', 'geo.display-label', 'hc01_est_vc01','hc03_est_vc01','hc05_est_vc01']

In [None]:
# keep only the columns we listed.
# .copy() gives us an independent dataframe, so the column assignments we make later
# change this dataframe directly instead of raising pandas' SettingWithCopyWarning
df = df[col_names].copy()

In [None]:
df.head()

In [None]:
# let's rename some of our columns to help with this
# we'll use a python dictionary to store how we will rename things
# https://docs.python.org/3/tutorial/datastructures.html#dictionaries
# other programming languages call these objects 'hash tables' or 'associative arrays'
# https://en.wikipedia.org/wiki/Associative_array
rename_dictionary = {'hc01_est_vc01':'total_pop', 'hc03_est_vc01':'males', 'hc05_est_vc01':'females'}

In [None]:
# specify the input value and the dictionary will tell you the associated value/object
rename_dictionary['hc01_est_vc01']

In [None]:
# rename
df = df.rename(columns=rename_dictionary)

In [None]:
df.head()

In [None]:
# much better. 

### computing basic statistics

In [None]:
# how many people in Washington are there in total?
df['total_pop'].sum()

In [None]:
# we just concatenated all of the strings together!
# we need to change the data type first
df['total_pop'] = df['total_pop'].astype(np.int32) # specify a 32-bit integer

In [None]:
df['total_pop'].sum()

In [None]:
# there we go...
# what are some summary statistics at the place level?
df['total_pop'].describe()

In [None]:
# how many males in total?
df['males'] = df['males'].astype(np.int32)
df['males'].sum()

In [None]:
# how many females?
df['females'] = df['females'].astype(np.int32)
df['females'].sum()

In [None]:
# more females than males.

### computing the all ages sex ratio

In [None]:
# let's compute the sex ratio: the number of males per 100 females
df['sex_ratio'] = (df['males'] / df['females']) * 100

In [None]:
df['sex_ratio'].describe()

In [None]:
# the inf and NAN values usually indicate a division by zero error. Are there places with zero females?

In [None]:
df.head()

### identifying outliers

In [None]:
# back to our descriptions
df['males'].describe()

In [None]:
df['females'].describe()

In [None]:
df['total_pop'].describe()

In [None]:
# and it looks like there are towns with zero people
df[df['total_pop']==0].head()

In [None]:
# are they the same as the towns with zero males?
df[df['males']==0].head()

In [None]:
# zero females
df[df['females']==0].head()

In [None]:
# let's check by removing the places with zero population
# .copy() gives us an independent dataframe, so adding or replacing columns later
# does not raise pandas' SettingWithCopyWarning
df = df[df['total_pop']>0].copy()

In [None]:
# you'll notice that the margin of error is quite large in all of these areas.
# so large the estimates are probably not valid
# should we remove? 
# should we flag?
# let's keep these data for now.

### applying a user-defined function

In [None]:
# but we can't compute the sex ratio as we did above.
# to compute the ratios, we'll need to write a function that examines each row in our pandas dataframe
# up until now, we've been operating on columns (vectors) of our data. We're now going to examine rows. 

# we're now going to apply our function to each row in our dataframe
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html

In [None]:
def sex_ratio(row):
    """Return the number of males per 100 females for a single row.

    row is looked up by the 'males' and 'females' keys. When the row has
    zero females, 1 is added to the female count so the division is defined.
    """
    n_males = row['males']
    n_females = row['females']
    # places with 0 females surveyed: divide by (females + 1) instead of 0
    denominator = n_females + 1 if n_females == 0 else n_females
    return (n_males / denominator) * 100

In [None]:
df['sex_ratio'] = df.apply(func=sex_ratio, axis=1)

In [None]:
df['sex_ratio'].describe()

In [None]:
# what is going on with the max value?
df.sort_values(by='sex_ratio', ascending=False).head()

In [None]:
# it looks like towns with zero females feature a very high sex ratio.
# That's not surprising, given what we've done. 

In [None]:
# let's say we want to look at the sex ratio by quintiles of population size
# maybe there is something systematic going on in smaller areas?
# https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.qcut.html
df['pop_quintile'] = pd.qcut(x=df['total_pop'], q=5, labels=False)

In [None]:
df.head()

In [None]:
# let's see our labels for our quintiles
df['pop_quintile'].unique()

In [None]:
# the quintile labels run from 0 through 4 because python is zero-indexed
for quintile_index in range(5):
    # turn the label into a percentage descriptor, e.g. 0 -> '20%'
    upper_share = (quintile_index + 1) * .2
    quintile_label = '{:.0%}'.format(upper_share)

    # keep only the places that fall in this quintile
    quintile_df = df[df['pop_quintile'] == quintile_index]

    print('****TOTAL POPULATION****')
    print('Current quintile:', quintile_label)
    print(quintile_df['sex_ratio'].describe())
    

In [None]:
# it does look like there is something odd about the smaller places based on the summary statistics
# but let's visualize these data

### plotting data

In [None]:
# let's plot this using the matplotlib library so we can see it more easily
# https://matplotlib.org/api/_as_gen/matplotlib.pyplot.boxplot.html#matplotlib.pyplot.boxplot
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
boxplot = df.boxplot(column=['sex_ratio'], by=['pop_quintile'])

In [None]:
# the sex ratio of the smaller areas is so large that it's affecting the 
# scale of the box plot.
# let's remove the areas with zero females, recompute our quintiles, and redo our box plots

In [None]:
# remove areas with zero females using .loc notation:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.loc.html
# .copy() gives us an independent dataframe, so recomputing the quantile columns
# afterwards does not raise pandas' SettingWithCopyWarning
df = df.loc[df['females']>0, :].copy()

In [None]:
# recompute quintiles
df['pop_quintile'] = pd.qcut(x=df['total_pop'], q=5, labels=False)

In [None]:
# another box plot
boxplot = df.boxplot(column=['sex_ratio'], by=['pop_quintile'])

In [None]:
# that looks more reasonable

In [None]:
# compute deciles
df['pop_decile'] = pd.qcut(x=df['total_pop'], q=10, labels=False)

In [None]:
# another box plot
boxplot = df.boxplot(column=['sex_ratio'], by=['pop_decile'])

### exporting data

In [None]:
# let's write these data out to a csv and an excel file
# first, to the csv
file_name = 'wa_places_sex_ratios.csv'
file_path_name = os.path.join(file_path, file_name)

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
# the file name ends in .csv, so we separate values with commas.
# pandas automatically quotes values that contain a comma (such as the place labels).
df.to_csv(path_or_buf=file_path_name, sep=',', index=False)

In [None]:
# now, to excel
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html
file_name = 'wa_places_sex_ratios.xlsx' # notice the .xlsx extension
file_path_name = os.path.join(file_path, file_name)
df.to_excel(excel_writer=file_path_name, index=False)

In [None]:
# With respect to the data, across all age groups, it looks like most areas in Washington
# feature a sex ratio around 100, as is usually the case.
# However, there are more areas that feature more males than females.