In [1]:
import pandas as pd

In [3]:
# import the demographics data
demographics = pd.read_csv('demographics.csv')

In [4]:
# extract the columns to be reused in zipcodes.csv 
zipcodes = demographics[['ID', 'COUNTY', 'STNAME', 'NAME']]
zipcodes.head()

Unnamed: 0,ID,COUNTY,STNAME,NAME
0,0,NZ-NTL,Northland,North Cape
1,1,NZ-NTL,Northland,Rangaunu Harbour
2,2,NZ-NTL,Northland,Karikari Peninsula
3,3,NZ-NTL,Northland,Tangonge
4,4,NZ-NTL,Northland,Ahipara


In [5]:
# install GeoPy
!pip install geopy 
!pip install Nominatim

Collecting Nominatim
  Downloading nominatim-0.1.tar.gz (1.7 kB)
Building wheels for collected packages: Nominatim
  Building wheel for Nominatim (setup.py) ... [?25l[?25hdone
  Created wheel for Nominatim: filename=nominatim-0.1-py3-none-any.whl size=2363 sha256=e5a92e3d8382848214d742d2c44ec2f295534f04960bae849e703705972ffcbc
  Stored in directory: /root/.cache/pip/wheels/37/00/9e/d904c390bfb174830ad3dcfd62af5544cee7d88bed4f8acedd
Successfully built Nominatim
Installing collected packages: Nominatim
Successfully installed Nominatim-0.1


In [6]:
# I can use geopy to try and extract the postcodes and coordinates
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="Your_Name")
city ="Tangonge"
country ="New Zealand"
loc = geolocator.geocode(city+','+ country)
print(loc.address)

Tangonge Drain, Kaitaia, Te Hiku Community, Far North District, Northland, 0410, New Zealand / Aotearoa


In [7]:
# I can use regex to extract the postcode
import re
teststring = 'Tangonge Drain, Kaitaia, Te Hiku Community, Far North District, Northland, 0410, New Zealand / Aotearoa'
print(re.findall('[0-9]+', teststring)[0])

0410


In [8]:
# add empty rows for the data I am looking for
zipcodes['ZCTA5'] = ''
zipcodes['LAT'] = ''
zipcodes['LON'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [9]:
# this can take a bit of time (around 19 minutes)
# it may also timeout, in which case it can be run again
# it will only search for addresses that have not been found yet

# iterate over all areas
# get the address and if one is found, postcode and coordinates
for index, row in zipcodes.iterrows():
  # if no postcode has been found yet
  if row['ZCTA5'] == '':
  #if pd.isna(row['ZCTA5']):
    # get the location
    location = geolocator.geocode(row['NAME']+','+ "New Zealand", timeout=10)
    postcode = ''
    lat = ''
    lon = ''
    if location != None:
      postcode = re.findall('[0-9]+', location.address)
      if len(postcode) != 0:
        postcode = postcode[0]
      lat = location.latitude
      lon = location.longitude
    # if location cannot be found, try some other possibilities     
    if location == None:
      try:
        area = row['NAME'].split('-')[0]
        location = geolocator.geocode(area+','+ "New Zealand", timeout=10)
        if location != None:
          postcode = re.findall('[0-9]+', location.address)
          if len(postcode) != 0:
            postcode = postcode[0]
          lat = location.latitude
          lon = location.longitude
      except Exception:
        pass

    # add the found data to the relevant columns
    zipcodes.at[index,'ZCTA5'] = postcode
    zipcodes.at[index,'LAT'] = lat
    zipcodes.at[index,'LON'] = lon

As not every address has a postcode, I am expecting more latitude and longitude fields to be filled than postcode.

In [10]:
# check how many area were not located:
print(f"Overall there are {len(zipcodes)} areas")
print(f"Areas with no postode: {len(zipcodes[zipcodes['ZCTA5'] == ''])}")
print(f"Areas with no latitude: {len(zipcodes[zipcodes['LAT'] == ''])}")
print(f"Areas with no longitude: {len(zipcodes[zipcodes['LON'] == ''])}")

Overall there are 2135 areas
Areas with no postode: 73
Areas with no latitude: 73
Areas with no longitude: 73


After this, I ended up with 73 areas with no postcode and 74 areas with no coordinates.

In [13]:
# Let's see them
zipcodes[zipcodes['ZCTA5'] == '']

Unnamed: 0,ID,COUNTY,STNAME,NAME,ZCTA5,LAT,LON
105,105,NZ-OUT,Area Outside,Inlets Far North District,,,
108,108,NZ-OUT,Area Outside,Kaipara Coastal,,,
109,109,NZ-OUT,Area Outside,Maungaru,,,
110,110,NZ-OUT,Area Outside,Okahukura Peninsula,,,
111,111,NZ-OUT,Area Outside,Tawharanui Peninsula,,,
...,...,...,...,...,...,...,...
181,181,NZ-OUT,Area Outside,Oamaru North Milner Park,,,
182,182,NZ-OUT,Area Outside,Oamaru North Orana Park,,,
183,183,NZ-OUT,Area Outside,Wakatipu Basin,,,
185,185,NZ-OUT,Area Outside,Prestonville-Grasmere,,,


Given the small number, I can amend them manually.

In [14]:
for index, row in zipcodes.iterrows():
  # if no postcode has been found yet
  if row['ZCTA5'] == '':
    print(f"The name of the place is: {row['NAME']}")
    postcode = input('Postcode:')
    lat = input('Latitude:')
    lon = input('Longitude:')
    # add the found data to the relevant columns
    zipcodes.at[index,'ZCTA5'] = postcode
    zipcodes.at[index,'LAT'] = lat
    zipcodes.at[index,'LON'] = lon

The name of the place is: Inlets Far North District


KeyboardInterrupt: ignored

In [15]:
zipcodes

Unnamed: 0,ID,COUNTY,STNAME,NAME,ZCTA5,LAT,LON
0,0,NZ-NTL,Northland,North Cape,[],-34.416,173.052
1,1,NZ-NTL,Northland,Rangaunu Harbour,0486,-34.9542,173.284
2,2,NZ-NTL,Northland,Karikari Peninsula,[],-34.8939,173.335
3,3,NZ-NTL,Northland,Tangonge,0410,-35.12,173.234
4,4,NZ-NTL,Northland,Ahipara,0481,-35.1755,173.147
...,...,...,...,...,...,...,...
2130,2130,NZ-NSN,Nelson,Grampians,7071,-41.298,173.28
2131,2131,NZ-NSN,Nelson,Enner Glynn,7011,-41.3064,173.257
2132,2132,NZ-NSN,Nelson,The Brook,7071,-41.3002,173.295
2133,2133,NZ-NSN,Nelson,Nelson Creek,[],-42.4064,171.518


In [16]:
# the areas with no postcode identified have an empty list instead
# I want to replace it with an empty string
zipcodes['ZCTA5'] = zipcodes['ZCTA5'].apply(lambda y: '' if len(y)==0 else y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
# Let's see it if worked
zipcodes[zipcodes["ZCTA5"].str.len() < 3]

Unnamed: 0,ID,COUNTY,STNAME,NAME,ZCTA5,LAT,LON
0,0,NZ-NTL,Northland,North Cape,,-34.416,173.052
2,2,NZ-NTL,Northland,Karikari Peninsula,,-34.8939,173.335
7,7,NZ-NTL,Northland,Oruru-Parapara,,-35.0532,173.512
8,8,NZ-NTL,Northland,Taumarumaru,,-34.99,173.502
9,9,NZ-NTL,Northland,Herekino-Takahue,,-35.2654,173.2
...,...,...,...,...,...,...,...
2115,2115,NZ-NSN,Nelson,Nelson Park,,-39.4984,176.91
2118,2118,NZ-NSN,Nelson,Tahunanui,,-41.2822,173.234
2121,2121,NZ-NSN,Nelson,Broadgreen-Monaco,,-41.3089,173.229
2129,2129,NZ-NSN,Nelson,Maitai,,-41.3014,173.4


In [18]:
# finally, rename the columns to match the format
zipcodes = zipcodes.rename(columns={'ID':'', 'COUNTY':'ST','STNAME':'USPS'})
# final look at the data
zipcodes

Unnamed: 0,Unnamed: 1,ST,USPS,NAME,ZCTA5,LAT,LON
0,0,NZ-NTL,Northland,North Cape,,-34.416,173.052
1,1,NZ-NTL,Northland,Rangaunu Harbour,0486,-34.9542,173.284
2,2,NZ-NTL,Northland,Karikari Peninsula,,-34.8939,173.335
3,3,NZ-NTL,Northland,Tangonge,0410,-35.12,173.234
4,4,NZ-NTL,Northland,Ahipara,0481,-35.1755,173.147
...,...,...,...,...,...,...,...
2130,2130,NZ-NSN,Nelson,Grampians,7071,-41.298,173.28
2131,2131,NZ-NSN,Nelson,Enner Glynn,7011,-41.3064,173.257
2132,2132,NZ-NSN,Nelson,The Brook,7071,-41.3002,173.295
2133,2133,NZ-NSN,Nelson,Nelson Creek,,-42.4064,171.518


## Save as CSV

In [19]:
zipcodes.to_csv('zipcodes.csv', index=False)