# First things first

It is programming tradition to start with a "Hello world!" program.

In [1]:
print('Hello world!')

Hello world!


Congratulations, you've written your first program. 

This program consists of a single **command**, `print('Hello world!')`. A command tells the computer what to do. The `print` command causes Python to display something on the screen.

In [2]:
# These lines are comments. The Python interpreter ignores them.
# Comments are just notes for humans to read to help understand the code.
# Best practice: add a comment for every couple lines of code to explain what's going on and why.
# You'd be amazed at how quickly you forget your code's logic

# Basic math

In [3]:
# addition
5 + 2

7

In [4]:
# subtraction
5 - 2

3

In [5]:
# multiplication
5 * 2

10

In [6]:
# division
5 / 2

# Note that this is one of the major differences between Python 2 and 3!
# In Python 3, this performs as we'd expect, and returns 2.5.
# But in Python 2, the / operator performs integer division and only returns 
# the whole number portion of the result when you divide two integers.
# (In Python 3, integer division uses the // operator.)

2

In [7]:
# test for equality
5 == 2

False

In [8]:
# exponentiation: raising 5 to the 2nd power
print(5 ** 2)

# the modulus operator: what is the remainder when you divide 5 by 2?
print(5 % 2)

25
1


# Variables and print commands

In [9]:
# variables, such as x here, contain values and their values can vary
x = 5

In [10]:
# what is the value of x?
print(x)

5


In [11]:
# you can perform operations on variables, just like you can on two numbers
print(x + 3)

8


In [12]:
# what is the value of x now?
print(x)

5


In [13]:
# to update the value of a variable, you need to do an assignment again
x = x + 3

In [14]:
# and now what is the value of x?
print(x)

8


In [15]:
# create a new variable y from an operation on x
x = 5
y = x * 2
print(y)

10


In [16]:
# a notebook cell only auto-displays its last bare expression, so print() each value you want to see
print(x)
print(y)

5
10


In [17]:
# you can comma-separate values to print multiple to the console on one line
print(x, y)

(5, 10)


In [18]:
# you can also print the result of an expression
print (x * y)

50


In [19]:
# Naming conventions: you can't have spaces or dashes in variable names
# Also, variable names must begin with a letter or an underscore (but after that, numbers are fine)
# Underscores _ and Mixed Case are fine
# Variable names (like the rest of Python) are case sensitive
# Stylistically, variable names should begin with a lowercase letter
# names_with_underscores are one good approach
# namesWithCapitals are another

my_var = 1
my_var2 = 2
anotherVar = 3
# this-aint-a-var = 4

# Data types

In [20]:
# integers are whole numbers
type(125)

int

In [21]:
# every variable has a data type, and they can be of any type
x = 125
type(x)

int

In [22]:
# float is a floating point (aka decimal) number
some_rate = 4.3
type(some_rate)

float

In [23]:
# strings are strings of characters
my_string = 'abc'
type(my_string)

str

In [24]:
# a list is a collection of elements denoted by square brackets
my_list = [1, 2, 3, 4]
print(my_list)
type(my_list)

[1, 2, 3, 4]


list

In [96]:
# a dictionary is a collection of key:value pairs, denoted by curly braces
person = {'first_name':'Carolina', 'last_name':'Reid'}
print(person)
type(person)

{'first_name': 'Carolina', 'last_name': 'Reid'}


dict

In [97]:
person['first_name']

'Carolina'

# String processing

In [26]:
# some of the operators we saw earlier work on strings
city = 'Berkeley'
sep = ', '
state = 'California'
zip_code = '94703'

location = city + sep + state + ' ' + zip_code
print(location)

Berkeley, California 94703


In [27]:
# multiplying a string just duplicates it
zip_code * 3

# division, exponentiation, and modulus don't work with strings.

'947039470394703'

In [28]:
# how many characters are in this string?
len(location)

26

In [29]:
# get a substring from some position up to but not including a second position
location[2:5]

'rke'

In [30]:
# get the first n characters from the string
location[:5]

'Berke'

In [31]:
# get the characters from the string after the nth position
location[5:]

'ley, California 94703'

In [32]:
# you can replace characters in a string with the replace() method
location.replace('e', 'E')

# We'll look at methods a lot more later on.
# We don't have enough time to really get into it, but Python string formatting
# is incredibly flexible and powerful and merits further investigation.

'BErkElEy, California 94703'

# Intro to Pandas

In [39]:
# To use a package, you have to import it

import pandas as pd

In [40]:
# Use the read_csv method to load a csv file into a DataFrame

places = pd.read_csv('survey_place.csv')

A little explanation stolen from Geoff Boeing's notebook at https://github.com/gboeing/urban-data-science/blob/master/07-Intro-to-Pandas/intro-to-pandas.ipynb:

> pandas is designed to make it easier to work with structured data. Most of the analyses you might perform will likely involve using tabular data, e.g., from .csv files or relational databases (e.g., SQL). The DataFrame object in pandas is "a two-dimensional tabular, column-oriented data structure with both row and column labels."

In [41]:
# In a notebook, you can run the name of the DataFrame to look at it:
places

# Try running the head() method to look at the first five rows. 
# You can also try head(10) to look at the first 10 rows
# places.head()

Unnamed: 0,sampno,perno,plano,vehno,tripno,place_name,travel_date,arr_time,dep_time,mode,...,hhmem,lon,lat,non_hh_members,route,per1,per2,per3,per4,per5
0,1031985,1,1,,,redacted,5/1/2012,3:00:00,8:00:00,,...,,redacted,redacted,,,,,,,
1,1031985,1,2,97.0,1.0,redacted,5/1/2012,9:00:00,12:00:00,6.0,...,0.0,redacted,redacted,1.0,,,,,,
2,1031985,1,3,97.0,2.0,redacted,5/1/2012,13:00:00,2:00:00,6.0,...,0.0,redacted,redacted,1.0,,,,,,
3,1031985,2,1,,,redacted,5/1/2012,3:00:00,9:00:00,,...,,redacted,redacted,,,,,,,
4,1031985,2,2,1.0,1.0,redacted,5/1/2012,9:00:00,10:00:00,5.0,...,0.0,redacted,redacted,0.0,,,,,,
5,1031985,2,3,1.0,2.0,redacted,5/1/2012,10:00:00,2:00:00,5.0,...,0.0,redacted,redacted,0.0,,,,,,
6,1032036,1,1,,,redacted,5/12/2012,3:00:00,8:00:00,,...,,redacted,redacted,,,,,,,
7,1032036,1,2,1.0,1.0,redacted,5/12/2012,8:00:00,15:00:00,5.0,...,2.0,redacted,redacted,0.0,,4.0,5.0,,,
8,1032036,1,3,1.0,2.0,redacted,5/12/2012,15:00:00,16:00:00,5.0,...,4.0,redacted,redacted,0.0,,2.0,3.0,4.0,5.0,
9,1032036,1,4,1.0,3.0,redacted,5/12/2012,16:00:00,17:00:00,5.0,...,1.0,redacted,redacted,0.0,,2.0,,,,


In [42]:
# How many rows?
# (print is written as a function call so this cell runs on Python 3 as well as Python 2)
print(len(places))

# Also try the .shape attribute to see how many rows and columns there are
# places.shape

460524


In [43]:
# There are a ton of columns here! Can we subset to ones we're interested in?

columns_of_interest = ['sampno','perno','plano','tottr','hhmem','mode','vehno','trip_distance_miles']

Subset the DataFrame by passing a list:

In [None]:
places[columns_of_interest]

# Equivalent to the below
# places[['sampno','perno','plano','tottr','hhmem','mode','vehno','trip_distance_miles']]

In [44]:
# We did not actually assign the new DataFrame to any variable name above. 

places_subset = places[columns_of_interest]

In [45]:
places_subset.head()

Unnamed: 0,sampno,perno,plano,tottr,hhmem,mode,vehno,trip_distance_miles
0,1031985,1,1,,,,,
1,1031985,1,2,2.0,0.0,6.0,97.0,13.428271
2,1031985,1,3,2.0,0.0,6.0,97.0,12.975526
3,1031985,2,1,,,,,
4,1031985,2,2,1.0,0.0,5.0,1.0,5.12596


Two ways to look at one column:

In [49]:
# Dot notation
places_subset.trip_distance_miles.head()

0          NaN
1    13.428271
2    12.975526
3          NaN
4     5.125960
Name: trip_distance_miles, dtype: float64

In [50]:
# Bracket notation
places_subset['trip_distance_miles'].head()

0          NaN
1    13.428271
2    12.975526
3          NaN
4     5.125960
Name: trip_distance_miles, dtype: float64

### Differences in trip distances

Is the average trip taken by Oakland residents longer or shorter than the average trip taken by Berkeley residents?

*Note: This is an oversimplified analysis for example purposes that doesn't take into account weights, trip purposes or modes, etc.*

In [52]:
# We have trip distance data, but not home city

places_subset.head()

Unnamed: 0,sampno,perno,plano,tottr,hhmem,mode,vehno,trip_distance_miles
0,1031985,1,1,,,,,
1,1031985,1,2,2.0,0.0,6.0,97.0,13.428271
2,1031985,1,3,2.0,0.0,6.0,97.0,12.975526
3,1031985,2,1,,,,,
4,1031985,2,2,1.0,0.0,5.0,1.0,5.12596


In [46]:
# Let's pull in the households data as well
# This time I use some of the optional parameters in the read_csv method to grab certain columns
# Also set a certain column as the index

households = pd.read_csv('survey_households.csv', 
                         index_col='sampno', 
                         usecols=['sampno','home_city','home_county_id','mpo_sample'])

In [47]:
households.head()

Unnamed: 0_level_0,home_county_id,home_city,mpo_sample
sampno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2973777,53.0,SALINAS,3.0
2973814,53.0,SALINAS,3.0
2973902,53.0,SALINAS,3.0
2974182,53.0,SALINAS,3.0
2976556,53.0,SALINAS,3.0


In [55]:
# Let's merge these two DataFrames using the sampno variable as the key to join on.
# - left_on='sampno' uses the sampno column of places_subset as the key
# - right_index=True matches that key against the index of households
#   (we set sampno as the index when we read the households CSV)
# - how='left' keeps every row of places_subset, even if no household matches
# Note that right_index must be the boolean True, not the string 'True'.

places_subset = places_subset.merge(right=households, how='left', left_on='sampno', right_index=True)
places_subset.head()

Unnamed: 0,sampno,perno,plano,tottr,hhmem,mode,vehno,trip_distance_miles,home_county_id,home_city,mpo_sample
0,1031985,1,1,,,,,,95.0,VALLEJO,22.0
1,1031985,1,2,2.0,0.0,6.0,97.0,13.428271,95.0,VALLEJO,22.0
2,1031985,1,3,2.0,0.0,6.0,97.0,12.975526,95.0,VALLEJO,22.0
3,1031985,2,1,,,,,,95.0,VALLEJO,22.0
4,1031985,2,2,1.0,0.0,5.0,1.0,5.12596,95.0,VALLEJO,22.0


In [62]:
# For the t-test later on, we need to drop rows with NaN (not a number)

places_subset = places_subset.dropna()

In [63]:
# Use the value_counts() method to look at unique values and counts:

places_subset.home_city.value_counts()

LOS ANGELES               10399
SAN JOSE                   7864
BAKERSFIELD                5795
SAN DIEGO                  4568
SAN FRANCISCO              4446
FRESNO                     4358
SACRAMENTO                 3003
RIVERSIDE                  2731
SALINAS                    2673
OAKLAND                    2397
STOCKTON                   2357
SANTA ROSA                 2246
LONG BEACH                 2152
VISALIA                    2105
SUNNYVALE                  1771
SANTA CRUZ                 1737
HOLLISTER                  1672
PALO ALTO                  1671
VENTURA                    1644
MODESTO                    1625
PASADENA                   1594
NAPA                       1538
FREMONT                    1509
CLOVIS                     1447
IRVINE                     1419
MERCED                     1417
SAN MATEO                  1415
RANCHO CUCAMONGA           1411
HUNTINGTON BEACH           1403
SIMI VALLEY                1393
                          ...  
CASPAR  

In [73]:
# Use a conditional statement within brackets to subset

places_subset[places_subset.home_city == 'LOS ANGELES'].head()

# places_subset[places_subset['mode'] == 6.0].head()

Unnamed: 0,sampno,perno,plano,tottr,hhmem,mode,vehno,trip_distance_miles,home_county_id,home_city,mpo_sample
242,1038428,1,8,3.0,1.0,6.0,97.0,2.9842,37.0,LOS ANGELES,30.0
243,1038428,1,9,3.0,1.0,6.0,97.0,3.359935,37.0,LOS ANGELES,30.0
266,1038428,4,8,3.0,1.0,6.0,97.0,2.9842,37.0,LOS ANGELES,30.0
267,1038428,4,9,3.0,1.0,6.0,97.0,3.359935,37.0,LOS ANGELES,30.0
1259,1061035,1,2,1.0,0.0,5.0,1.0,6.571637,37.0,LOS ANGELES,30.0


In [75]:
# ON YOUR OWN:
# Create two DataFrames, one for Berkeley, and one for Oakland.
# Use the variable names berkeley and oakland.

berkeley = places_subset[places_subset.home_city == 'BERKELEY']
oakland = places_subset[places_subset.home_city == 'OAKLAND']

In [76]:
print(len(berkeley), len(oakland))

(1376, 2397)


In [82]:
oakland.describe()

Unnamed: 0,sampno,perno,plano,tottr,hhmem,mode,vehno,trip_distance_miles,home_county_id,mpo_sample
count,2397.0,2397.0,2397.0,2397.0,2397.0,2397.0,2397.0,2397.0,2397.0,2397.0
mean,3023083.0,1.896537,4.761786,1.838965,0.605757,5.322069,13.736754,7.234705,1.0,22.0
std,2112287.0,1.140091,2.769451,1.036045,0.871655,0.559246,32.15999,18.264537,0.0,0.0
min,1045607.0,1.0,2.0,1.0,0.0,5.0,1.0,0.0,1.0,22.0
25%,1478684.0,1.0,3.0,1.0,0.0,5.0,1.0,1.284838,1.0,22.0
50%,1976318.0,2.0,4.0,2.0,0.0,5.0,1.0,3.240128,1.0,22.0
75%,3007038.0,2.0,6.0,2.0,1.0,6.0,2.0,7.389573,1.0,22.0
max,7210379.0,8.0,18.0,8.0,4.0,10.0,97.0,356.622087,1.0,22.0


In [83]:
oakland.trip_distance_miles.mean()

7.234705092198568

In [84]:
berkeley.trip_distance_miles.mean()

6.807609148982564

In [78]:
# Import the stats subpackage explicitly: a bare "import scipy" is not guaranteed
# to make scipy.stats available. After this line, scipy.stats.ttest_ind works.
import scipy.stats

In [81]:
scipy.stats.ttest_ind(a=oakland.trip_distance_miles, b=berkeley.trip_distance_miles)

Ttest_indResult(statistic=0.62227516674533823, pvalue=0.53379856993906127)

In [94]:
# We can write a more automated way to test between multiple cities

city1 = 'SACRAMENTO'
city2 = 'LOS ANGELES'

# one DataFrame of trips per home city
city1_df = places_subset[places_subset.home_city == city1]
city2_df = places_subset[places_subset.home_city == city2]

# just the trip distance column from each
dist1 = city1_df.trip_distance_miles
dist2 = city2_df.trip_distance_miles

# independent two-sample t-test; the result holds (t-statistic, p-value)
ttest_results = scipy.stats.ttest_ind(dist1, dist2)

# print is called as a function so this cell runs on Python 3 as well as Python 2
print('Mean for ' + city1 + ': ' + str(dist1.mean()))
print('Mean for ' + city2 + ': ' + str(dist2.mean()))

print('t-statistic: ' + str(ttest_results[0]))
print('p-value: ' + str(ttest_results[1]))

Mean for SACRAMENTO: 9.96343377423
Mean for LOS ANGELES: 8.03185438436
t-statistic: 4.43608637271
p-value: 9.23392335208e-06


In [98]:
# Or more simply, in a function:

def city_ttest(city1, city2, df):
    """Compare the mean trip distance of two home cities with a t-test.

    Selects the rows of ``df`` whose ``home_city`` equals ``city1`` and the
    rows whose ``home_city`` equals ``city2``, runs ``scipy.stats.ttest_ind``
    on their ``trip_distance_miles`` values, and prints both means, the
    t-statistic, and the p-value.

    Parameters
    ----------
    city1, city2 : str
        City names to look up in ``df.home_city`` (e.g. 'SACRAMENTO').
    df : pandas.DataFrame
        Table with ``home_city`` and ``trip_distance_miles`` columns.

    Returns
    -------
    None
        The results are printed rather than returned.
    """
    # pull out the trip distances for each city
    dist1 = df[df.home_city == city1].trip_distance_miles
    dist2 = df[df.home_city == city2].trip_distance_miles

    # the result unpacks into the t-statistic and the p-value
    t_statistic, p_value = scipy.stats.ttest_ind(dist1, dist2)

    # print is called as a function so this runs on Python 3 as well as Python 2
    print('Mean for ' + city1 + ': ' + str(dist1.mean()))
    print('Mean for ' + city2 + ': ' + str(dist2.mean()))
    print('t-statistic: ' + str(t_statistic))
    print('p-value: ' + str(p_value))

In [99]:
city_ttest('SACRAMENTO', 'DAVIS', places_subset)

Mean for SACRAMENTO: 9.96343377423
Mean for DAVIS: 10.3866648246
t-statistic: -0.457688918361
p-value: 0.647202124896


In [110]:
city_ttest('SAN FRANCISCO', 'LOS ANGELES', places_subset)

Mean for SAN FRANCISCO: 6.69139063225
Mean for LOS ANGELES: 8.03185438436
t-statistic: -4.00272203605
p-value: 6.29222110822e-05


In [109]:
city_ttest('BERKELEY', 'WALNUT CREEK', places_subset)

Mean for BERKELEY: 6.80760914898
Mean for WALNUT CREEK: 7.32191689069
t-statistic: -0.653490225456
p-value: 0.513499323521


# Python "Control Flow" or "Programming Flow" tools
## i.e. "if" statements, "for" loops, and functions

Now we're going to learn some simple tools to help you give Python some more complicated directions about how to run different commands.

### Loops

This is how to tell Python to run something over and over. There are different types of loops but we'll focus on "for" loops. A simple structure you can use is:

1. For every element...
2. In a certain "iterable" (e.g. list)...
3. Do a certain thing
4. Repeat until you've gone through every item in the iterable

In [None]:
# Example: we're going to run through this list below.

bart = ["Berkeley", "Ashby", "MacArthur", "19th Street", "12th Street", "West Oakland"]
# side note: spaces after commas and single vs double quotes don't matter
# i.e. ['Berkeley', 'Ashby'] == ["Berkeley","Ashby"]

In [None]:
# Here's the simple structure above in python syntax

# Steps 1 and 2
# syntax is:  for "x" in "y"
# x is an arbitrary name that will refer to the current iteration
# y is the list you're iterating on 
# statements that are within the loop must be indented! Indentation is key to Python syntax

for station in bart:
    
    # Step 3: what you do for each item in the list
    # Here we will just print
    print(station)
    
    # Step 4: this structure will automatically repeat! 
    
print('How many times will this print?')

Let's break this down. What happened here is:

First iteration: 
* The variable "station" is set to the first item in the list "bart", which happens to be a string, "Berkeley"
* Printing "station" therefore gives you an output of "Berkeley"

Second iteration:
* The variable "station" now equals the second item in the list, the string "Ashby", so printing "station" now outputs "Ashby"

And this continues until you hit West Oakland, at which point the loop sees no more items and ends.

The last line, which is not indented, is not part of the for loop, so it gets printed only once.

In [None]:
# the variable "station" above is arbitrary; you can change it to whatever you want as long as it is consistent within the loop
# you'll often see people use "i" as convention

for i in bart:
    print(i)

In [None]:
# You can set more complicated commands as well.

for i in bart:
    print("Now I am at", i, "Station")

In [None]:
# You can also set multiple variables to change within loops

for i in bart:
    before = "Now I am at "
    after = " Station"
    print(before + i + after)

In [None]:
# Notice the difference between using , and + in the two cells above
# , puts spaces between each piece (or "argument"), and can display many different data types side by side
# + *concatenates* strings, as we've seen before. This means that (1) there will be no extra space and
# (2) all the components of the concatenation must be strings.
# It is possible to convert one data type to another using str(), float(), int()

print('I am at station #', 2, 'now')
print() # blank line
print('I am at station #' + str(2) + 'now')
print('I am at station #', 2, ' now', sep = '') # we can define sep (separator) as an empty string to remove the spaces
print('I am at station #' + 2 + 'now') # throws an error

### if statements

You can also have Python run commands based on conditions: do x if y. 

In [None]:
# Simple structure:

warriors_wins = 70
bulls_record = 72

if warriors_wins < bulls_record:
    print("Warriors did not break the record :(")

In [None]:
# What happens if the condition is not met here?

warriors_wins = 73

if warriors_wins < bulls_record:
    print("Warriors did not break the record :(")

In [None]:
# you can add an "else" statement for when the condition is not met

if warriors_wins < bulls_record:
    print("Warriors did not break the record :(")
else:
    print("Warriors tied or broke the record!")

In [None]:
# you can set more than 2 conditions using "elif" which is short for "else if"

warriors_wins = 73

if warriors_wins < bulls_record:
    print("Warriors did not break the record :(")
elif warriors_wins == bulls_record:
    print("Warriors tied the record.")
else:
    print("Warriors broke the record!")

### Functions

So you've written a useful piece of code, and you're using it multiple times but finding it tedious to keep copying and pasting it. This is where **functions** come in. A function is basically a chunk of code that is saved under a name, and takes one or more inputs that are used in the chunk of code. Here's an example.

In [None]:
# Create a function to save the warriors code above

# "def" creates a function
# next is the function name: I call mine "record_check"
# in parentheses are the "arguments" - what will the function take in and use?
# I have one argument, which will be a number of wins.

def record_check(number_wins):
    """Return a message comparing number_wins with the Bulls' 72-win record."""

    bulls_record = 72

    # Same three-way comparison as the if/elif/else cell earlier, with
    # "warriors_wins" renamed to "number_wins".
    # Instead of print()ing, each branch picks a message and the function
    # *returns* it, so the caller can use the string however they want.
    if number_wins < bulls_record:
        message = "Warriors did not break the record :("
    elif number_wins == bulls_record:
        message = "Warriors tied the record."
    else:
        message = "Warriors broke the record!"

    return message
# when I run this cell, the code above will be saved or "defined" as the function "record_check"

In [None]:
# now we can run the function with various numbers 

print(record_check(70))
print(record_check(72))

# We don't use the print() command here so you can see the difference between print()ing a string and 
# displaying it as output
record_check(74)

In [None]:
# run the function in a loop

win_list = [i for i in range(69, 76)]
# This is called a *list comprehension*. Super powerful. We don't have time to get into it,
# but definitely look up Python comprehensions when you get a chance.

print(win_list)
# Note that range(69, 76) includes 69 but not 76!
# Think: "start at 69 and end right before 76"

for i in win_list: # can also say simply "for i in range(69, 76):"
    print('With', i, 'wins, the', record_check(i))
    
# Consider how making our function return, rather than print, its strings enables us to
# combine those strings in a novel fashion within the for loop.

## Micro-Project

This should take five minutes or so:

* Create a function
  * It should check whether the number it takes as an argument equals 30
  * If it does not equal 30, it should print "This is not Steph Curry's number"
  * If it does equal 30, it should print "This is Steph Curry's number"
* Create a list of the numbers 23, 11, 30, 12, and 40
* Create a for loop with this list. Each iteration should run your new function using a number in this new list.


In [None]:
# Write your function here (change names as desired!):

def function_name(argument):
    # content of function here
    # A function body needs at least one statement (a comment alone is a syntax error),
    # so "pass" is a do-nothing placeholder. Delete it once you add your own code.
    pass
    
    
# Create your list here:


# Write your loop here:


# Let's build a geocoder!

Now that we've covered the basics of programming in Python, we're going to jump into a more complex, real-world example. Step by step, we will assemble a program that will:
1. read in a list of addresses from a CSV file,
2. clean up the data formatting,
3. geocode the addresses, and
4. export the geocoded data to a new CSV.

A script like this, which improves on ArcGIS's temperamental geocoding operations, is exactly the kind of tool that a planner might create as part of a typical workflow. In other words, we've reached the realm of the actually useful.

In [None]:
# We import the packages we need for this task. If you followed the instructions 
# on installing Python and making sure everything is set up correctly, these commands
# should look familiar. If not, the syntax is simple:

import pandas as pd
from geopy.geocoders import GoogleV3

In [None]:
# For our geocoding demo, we'll use a table of Oakland libraries I found
# through Code for Oakland: http://codeforoakland.org/data-sets/

# Reading a CSV into a pandas DataFrame is easy:
libraries = pd.read_csv('LibraryBranches.csv')

# Let's look at the table
libraries

In [None]:
# When you have a large DataFrame, it can be more convenient to see only the first few rows:
libraries.head() # defaults to 5 rows, but you can specify the number of rows

In [None]:
# Unlike Excel, you can't directly look at and modify cells in a table.
# Instead, you access data by specifying its *location.* There are two basic forms
# of location syntax, .loc[] and .iloc[] - note the square brackets, not parentheses.

# .loc[x, y]: look up the data at the location with row index named x and column named y
print(libraries.loc[0,'PHONE'])

# .iloc[m, n]: look up the data in the mth row and nth column (first column == 0)
print(libraries.iloc[2, 2])

## Important concept: _objects_

It is common in Python and other programming languages to store data in flexible relational structures known as **objects**. Objects have **attributes**: for example, a `Person` object might have `height` and `age` numeric attributes, plus a `name` string attribute.

If an object has a function as an attribute, that function is called a **method** of the object. So our `Person` object can have a `get_older()` method that increases the Person's `age` attribute. (Note that like other, stand-alone functions, this method includes a set of parentheses.) In a cell above, `head()` is a method of the `libraries` object.

An object's methods and other attributes are accessed via **dot notation**: if `drew` is a `Person` object, `drew.height` will return `drew`'s `height` attribute. `drew.get_older()` will increase `drew`'s `age`.

We're not going to look at how to define our own objects (known also as *classes*) today, but it's a good skill to pick up, and tutorials are out there. https://jeffknupp.com/blog/2014/06/18/improve-your-python-python-classes-and-object-oriented-programming/

Right now, we're about to interact with three useful objects that are already defined in the modules we've loaded: `DataFrame`, from `pandas`, and `Geocoder` and `Location` from `geopy.geocoders`. Let's see how we can combine some `DataFrame` attributes and methods to quickly modify our `libraries` object.

In [None]:
# These ALL-CAPS column names are irritating. Let's change them to lowercase:

libraries.columns = libraries.columns.str.lower()
libraries.head()

# What happened here?
# libraries.columns is an attribute of libraries (a pandas DataFrame object).
# The columns object is itself an Index object, containing the names of each column.
# libraries.columns.str allows us to access and modify each string in libraries.columns -
# in this case, we use the lower() method to convert each string to lowercase.
# So we are telling Python to overwrite libraries.columns (the set of column names)
# with a version of libraries.columns converted to lowercase.

In [None]:
# Let's look at just one column

libraries['name']

# libraries.name (dot notation) is equivalent to libraries['name'] (bracket notation).
# However, when you want to create a new column, you must use bracket notation.

In [None]:
# We need to clean up our data in preparation for geocoding.
# There are some extra dashes in the address field, and the city and state aren't specified.
# Pandas provides some powerful methods for quickly modifying many pieces of data:

libraries['full_address'] = libraries.address.str.replace('- ', '') + ', Oakland, CA'
libraries.head()

# What happened here?
# We looked at libraries.address (one column), used str to access that column's
# contents as strings, then used the replace() string method to remove dashes.
# Then we used the + operator to concatenate city and state to each address.
# Finally, we used bracket notation to create a new full_address column to store
# the results of this operation.

In [None]:
# Now that we've properly formatted our addresses, it's time to set up the Geocoder.

g = GoogleV3()

# g is a GoogleV3 Geocoder object. This object has multiple methods, or built-in functions. 
# We'll be using the geocode() method, which takes a street address and returns a lat/long pair
# stored as a Location object. (Objects are just organized groups of data.)
# There is also a reverse() method that takes a lat/long pair and returns a street address.
# More info: https://geopy.readthedocs.org/en/1.11.0/#geopy.geocoders.GoogleV3
# and https://developers.google.com/maps/documentation/geocoding/intro
# The Google Maps Geocoding API has a limit of 2500 addresses per 24-hour period.

In [None]:
# Let's see the Geocoder in action. 
home = g.geocode('1001 Chanin Brkley')
home

# Notice that the Google Maps Geocoding API is quite flexible - it was able
# to identify Channing Way from "Chanin" and Berkeley, CA from "Brkley"!

In [None]:
# So, when we pass an address string to the Geocoder's geocode() method, the Geocoder
# looks up that address via the Google Maps Geocoding API and returns the result as a Location object.
# Now we work our way through the various components of this Location object.
print(home[0]) # matched address - a string
print(home[1]) # lat/long pair - a tuple
print(home[1][0]) # latitude - a float
print(home[1][1]) # longitude - a float

In [None]:
# OK! So we know how to get what we need out of the Location object that the Geocoder returns.
# We're now going to use a powerful feature of pandas - "apply," which applies a function to each
# item in a Series. 

# First, we need to define a function that takes an address string as a parameter,
# passes it to our Geocoder object, and returns the lat/long pair.

def getLatLong(address):
    """Geocode an address string with the Geocoder g and return its lat/long pair.

    Element 1 of the Location that g.geocode() returns is the (latitude,
    longitude) tuple. If the Geocoder finds no match and returns None,
    indexing it raises a TypeError.
    """
    return g.geocode(address)[1]

In [None]:
# Now we can *apply* the function we defined to each address in the full_address column.
# As before, we use bracket notation to store the results of the apply operation into
# a new latlong column.

libraries['latlong'] = libraries.full_address.apply(getLatLong)
libraries.head()

In [None]:
# It is a good idea to keep the lat/long pair itself. This is a common format
# that is used in web mapping (e.g. Leaflet). However, for some applications
# (e.g. ArcGIS, CartoDB), we will need separate latitude and longitude fields.
# Let's create those now, using a for loop.

for i in libraries.index:
    libraries.loc[i, 'latitude'] = libraries.loc[i, 'latlong'][0]
    libraries.loc[i, 'longitude'] = libraries.loc[i, 'latlong'][1]

libraries.head()

# What happened here?
# libraries.index, an attribute of libraries, stores the set of index values associated with
# each row in the libraries DataFrame. 'for i in libraries.index' tells the variable i to
# iterate over each value in libraries.index, allowing us to access and modify each row in libraries.
# So, for each row in libraries, this code assigns the 0th element of the tuple in latlong
# to a new latitude column, and the 1st element of the latlong tuple to a new longitude column.
# Note, as before, how we use indentation to indicate which statements are part of the for loop.

In [None]:
# Almost there now. Before we export our data to a CSV file, let's remove
# some columns we don't need.

libraries.drop(['objectid', 'address'], inplace = True, axis = 1)
libraries.head()

# What happened here?
# The drop() method removes data from a DataFrame. Its first argument is a list
# of entries to remove. The inplace keyword specifies that the drop operation
# should modify libraries directly. (By default, inplace = False - you can always
# use libraries_new = libraries.drop(xyz, inplace = False) to save the modified
# DataFrame to libraries_new.) Axis is either 0 for rows or 1 for columns.
# In this case, we set it = 1 to drop columns, not rows.

In [None]:
# Time to save our data to disk. pandas DataFrames have a very convenient
# to_csv() method. The index = False argument tells pandas not to write the index
# as its own column, again eliminating unnecessary data.

libraries.to_csv('LibraryBranchesGeocoded.csv', index = False)

In [None]:
# Finally, let's collect all the pieces of the puzzle into a single cell.
# I have added some try/except syntax to handle situations in which an address string
# is totally invalid. Google will try hard to match each address, but if it can't, 
# the Geocoder will return None instead of a Location object.
# I also added three print() statements to inform the user about what the program
# has done. See if you can figure out how each statement uses attributes and methods!

# import the needed packages
import pandas as pd
from geopy.geocoders import GoogleV3

# read the CSV into a pandas DataFrame
libraries = pd.read_csv('LibraryBranches.csv')
print('Read', len(libraries), 'records')

# clean up the column names and address field
libraries.columns = libraries.columns.str.lower()
libraries['full_address'] = libraries.address.str.replace('- ', '') + ', Oakland, CA'

# in case you want to see how the program handles junk data, uncomment the next line
# libraries.loc[0, 'full_address'] = 'dkjfbga;knga;sd'

# create a geopy Geocoder
g = GoogleV3()

# define helper function
def getLatLong(address):
    """Geocode an address string; return its lat/long pair, or None if there is no match."""
    result = g.geocode(address)
    try:
        # element 1 of the returned Location is the (latitude, longitude) tuple
        return result[1]
    except TypeError:
        # the Geocoder returned None (no match), and None cannot be indexed;
        # catching only TypeError lets any other, unexpected error surface
        return None

# geocode the addresses
libraries['latlong'] = libraries.full_address.apply(getLatLong)
print('Geocoded', sum(libraries.latlong.notnull()), 'addresses')

# extract latitude and longitude
for i in libraries.index:
    try:
        libraries.loc[i, 'latitude'] = libraries.loc[i, 'latlong'][0]
        libraries.loc[i, 'longitude'] = libraries.loc[i, 'latlong'][1]
    except TypeError:
        # latlong is None for an address that could not be geocoded, and None
        # cannot be indexed: leave this row's latitude/longitude empty and move on.
        # Only TypeError is caught, so any other problem is not silently hidden.
        continue

# drop unnecessary columns
libraries.drop(['objectid', 'address'], inplace = True, axis = 1)

# export the DataFrame to a new CSV file
libraries.to_csv('LibraryBranchesGeocoded.csv', index = False)
print('Exported', len(libraries), 'records')

# Not bad for 25 lines of code.