# Data Wrangling for Santa Clara Master Gardener Club by Michelle Petersen

In [None]:
# Imports
%matplotlib inline
import pandas as pd
import numpy as np
import json
import geojson
import shapefile
import requests
from bs4 import BeautifulSoup
import difflib
#import geopandas as gpd
import pprint as pp
import six.moves.urllib as urllib
import matplotlib.pyplot as plt
import re
from timeit import default_timer as timer

## Gather

In [None]:
df = gpd.read_file('ZillowNeighborhoods-CA/ZillowNeighborhoods-CA.shp')
print(df)

In [None]:
df.to_csv('California neighborhoods.csv')

In [None]:
sc_df = df[df['County'] == 'Santa Clara']
print(sc_df)

In [None]:
sc_df.to_csv('Santa Clara neighborhoods.csv')

In [None]:
sc_df.plot()

## Assess

In [None]:
import gspread
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession

# The scope to read the spreadsheet
_GS_SCOPE = ['https://www.googleapis.com/auth/spreadsheets.readonly',
            'https://www.googleapis.com/auth/drive']

# The ID and range of the MasterGardener spreadsheet.
_GS_ID = '1pPOuRZ9YsKwXGKgxzSm9tXCR_dgkCY4WNh3rg_cFX08'

service_account_info = json.load(open('Master Gardener-7e906e507d9d.json'))
credentials = service_account.Credentials.from_service_account_info(
    service_account_info,
    scopes=_GS_SCOPE)

if credentials:
    gc = gspread.Client(auth=credentials)
    gc.session = AuthorizedSession(credentials)
    book = gc.open_by_key(_GS_ID)
    print(book.title)
    worksheet_list = book.worksheets()
    for worksheet in worksheet_list:
        print(worksheet)

In [None]:
df_contacts = pd.DataFrame(book.get_worksheet(0).get_all_values())
df_contact_neighborhood = pd.DataFrame(book.get_worksheet(1).get_all_values())
df_zillow_neighborhoods = pd.DataFrame(book.get_worksheet(2).get_all_values())
df_nextdoor_neighborhoods = pd.DataFrame(book.get_worksheet(4).get_all_values())

In [None]:
df_contacts.head()

In [None]:
df_contacts.reset_index()
df_contacts.columns = ['Name', 'email', 'Residence - Nextdoor Neighborhood', 'Unknown Neighborhood', 'Notes']

In [None]:
df_contacts = df_contacts[df_contacts['Name'] != 'Name']

In [None]:
df_contacts['Residence - Nextdoor Neighborhood'] = df_contacts['Residence - Nextdoor Neighborhood'].str.title()

In [None]:
df_contacts.head()

In [None]:
df_contact_neighborhood.head()

In [None]:
df_contact_neighborhood.reset_index()
df_contact_neighborhood.columns = ['Name', 'Nextdoor Neighborhood', 'Unknown Neighborhood']

In [None]:
df_contact_neighborhood = df_contact_neighborhood[df_contact_neighborhood['Name'] != 'Name']

In [None]:
df_contact_neighborhood.head()

In [None]:
df_contact_neighborhood['Nextdoor Neighborhood'] = df_contact_neighborhood['Nextdoor Neighborhood'].str.title()

In [None]:
df_contact_neighborhood.head()

In [None]:
df_nextdoor_neighborhoods.head()

In [None]:
df_nextdoor_neighborhoods.reset_index()
df_nextdoor_neighborhoods.columns = ['State', 'County', 'City', 'Neighborhood', 'Link']

In [None]:
df_nextdoor_neighborhoods = df_nextdoor_neighborhoods[df_nextdoor_neighborhoods['Neighborhood'] != 'Neighborhood']

In [None]:
df_nextdoor_neighborhoods.head()

In [None]:
df_contact_neighborhood.head()

In [None]:
df_zillow_neighborhoods.head()

In [None]:
df_zillow_neighborhoods.reset_index()
df_zillow_neighborhoods.columns = ['State', 'County', 'City', 'Neighborhood', 'RegionID', 'Geometry']

In [None]:
df_zillow_neighborhoods = df_zillow_neighborhoods[df_zillow_neighborhoods['Neighborhood'] != 'Neighborhood']

In [None]:
df_zillow_neighborhoods.head()

In [None]:
df_contacts.to_csv('mg_contacts_master.csv', index=False)
df_contact_neighborhood.to_csv('mg_contact_neighborhood_master.csv', index=False)

## Scraping Nextdoor Neighborhoods

state = 'CA'
state_url = 'https://nextdoor.com/find-neighborhood/' + state + '/'

from requests import get
response = requests.get(state_url)
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)

hood_city_group = html_soup.find_all('div', class_ = 'hood_group')
print(type(hood_city_group))
print(len(hood_city_group))

df_cities = pd.DataFrame(index=range(5000), columns = ['State', 'City', 'Link'])
df_loc = 0
df_cities.head(1)

for div in hood_city_group:
    city_links = div.findAll('a')
    for a in city_links:
        df_cities.iloc[df_loc, df_cities.columns.get_loc("State")] = 'CA'
        df_cities.iloc[df_loc, df_cities.columns.get_loc("City")] = a.string
        df_cities.iloc[df_loc, df_cities.columns.get_loc("Link")] = a['href'].strip()
        df_loc += 1

df_cities.dropna(axis = 0, inplace=True)
print(df_loc)

df_cities.reindex(columns=['State','County','City', 'Link'])
df_cities.head(1)

df_cities.to_csv('mg_nextdoor_california_cities.csv', index=False)

In [None]:
cities = ['San Jose','Santa Clara','Sunnyvale', 'Palo Alto','Mountain View',\
          'Cupertino','Milpitas','Los Gatos', 'Gilroy', 'Morgan Hill', 'Campbell',\
          'Los Altos','Saratoga', 'Stanford', 'Los Altos Hills',\
          'San Martin']

df_nextdoor_neighborhoods = pd.DataFrame(index=range(7000), columns = ['State', 'County', 'City', 'Neighborhood', 'Link'])
df_loc = 0
df_nextdoor_neighborhoods.head(1)

for city in cities:
    print(city)
    city_lookup = df_cities[df_cities['City'] == city]["Link"].values[0]
    print(city_lookup)
    response = requests.get(city_lookup)
    responseTxt = response.text.encode('UTF-8')
    html_soup = BeautifulSoup(responseTxt, 'html.parser')
    type(html_soup) 
    hood_neighborhood_group = html_soup.find_all('div', class_ = 'hood_group')
    print(type(hood_neighborhood_group))
    print(len(hood_neighborhood_group))
   
    for div in hood_neighborhood_group:
        neighborhood_links = div.findAll('a')
        for a in neighborhood_links:
            print(city + " " + a.string + " " +  a['href'])
            df_nextdoor_neighborhoods.iloc[df_loc, df_nextdoor_neighborhoods.columns.get_loc("State")] = 'CA'
            df_nextdoor_neighborhoods.iloc[df_loc, df_nextdoor_neighborhoods.columns.get_loc("County")] = 'Santa Clara'
            df_nextdoor_neighborhoods.iloc[df_loc, df_nextdoor_neighborhoods.columns.get_loc("City")] = city
            df_nextdoor_neighborhoods.iloc[df_loc, df_nextdoor_neighborhoods.columns.get_loc("Neighborhood")] = str(a.string.encode('UTF-8'))
            df_nextdoor_neighborhoods.iloc[df_loc, df_nextdoor_neighborhoods.columns.get_loc("Link")] = str(a['href'].strip())
            df_loc += 1

df_nextdoor_neighborhoods.dropna(axis = 0, inplace=True)
print(df_loc)

df_nextdoor_neighborhoods.head(3)

df_nextdoor_neighborhoods.to_csv('mg_nextdoor_neighborhoods_master.csv', index=False)

In [None]:
df_contact_neighborhood['Similar Nextdoor Neighborhood'] = 'None'
df_contacts['Similar Nextdoor Neighborhood'] = 'None'

In [None]:
neighborhoods = list(df_nextdoor_neighborhoods['Neighborhood'])
for index, row in df_contact_neighborhood.iterrows():
    row['Similar Nextdoor Neighborhood'] = difflib.get_close_matches(row['Nextdoor Neighborhood'].lower(), neighborhoods)

In [None]:
for index, row in df_contacts.iterrows():
    row['Similar Nextdoor Neighborhood'] = difflib.get_close_matches(row['Residence - Nextdoor Neighborhood'].lower(), neighborhoods)

In [None]:
df_contacts.head(1)

In [None]:
df_contact_neighborhood.head(1)

In [None]:
df_contact_neighborhood['Nextdoor Neighborhood'].nunique()

df_contact_neighborhood['Nextdoor Neighborhood'].value_counts()

df_contacts.to_csv('mg_contacts_master.csv', index=False)
df_contact_neighborhood.to_csv('mg_contact_neighborhood_master.csv', index=False)

_GM_API_KEY = '#####'

df_nextdoor_neighborhoods['Similar Google Neighborhood'] = 'None'
df_nextdoor_neighborhoods['Google Place ID'] = 'None'
#df_nextdoor_neighborhoods[Google Location = 'None'

import six.moves.urllib as urllib

for index, row in df_nextdoor_neighborhoods.iterrows():
    neighborhoodquery = urllib.parse.quote(row["Neighborhood"] + ', ' +  row["City"] + ', ' +  row["State"], safe='')
    print(neighborhoodquery)
    response = requests.get('https://maps.googleapis.com/maps/api/place/findplacefromtext/json?input=' + neighborhoodquery + '&inputtype=textquery&fields=name,id,place_id,plus_code,types,geometry&key=' + _GM_API_KEY)
    resp_json_payload = response.json()
    print(resp_json_payload)                                           
    for k, v in resp_json_payload.items():
        print(k)
        if ((k == 'candidates') & (len(v) > 0)):
            for pair in v:
                if (('neighborhood' == pair['types'][0]) or ('locality' in pair['types'][0])):
                    if ('political' == pair['types'][1]):
                        print(pair['name'])
                        print(pair['place_id'])
                        row["Similar Google Neighborhood"] = pair['name']
                        row["Google Place ID"] = pair['place_id']

df_nextdoor_neighborhoods.to_csv('mg_nextdoor_with_google_master.csv', index=False)

## Parsing geoJSON

In [None]:
# Load the twitter archive data that was downloaded manually from Udacity
df_nextdoor_neighborhoods = pd.read_csv("mg_nextdoor_neighborhood_fulllist.csv")
df_nextdoor_neighborhoods.head(1)

In [None]:
df_nextdoor_neighborhoods["Nextdoor ID"] = 0
df_nextdoor_neighborhoods["Geometry"] = ""
df_nextdoor_neighborhoods["Interests"] = ""
df_nextdoor_neighborhoods["Gardening Interest"] = 0
df_nextdoor_neighborhoods["Percentage of Homeowners"] = 0
df_nextdoor_neighborhoods["Number of Residents"] = 0
df_nextdoor_neighborhoods["Average Age"] = 0
df_nextdoor_neighborhoods['Attributes'] = ""

In [None]:
df_nextdoor_neighborhoods["Link"].fillna("", inplace=True)
df_nextdoor_neighborhoods["Geometry"].fillna("", inplace=True)
df_nextdoor_neighborhoods["Interests"].fillna("", inplace=True)
df_nextdoor_neighborhoods["Attributes"].fillna("", inplace=True)

In [None]:
df_nextdoor_neighborhoods.info()

In [None]:
df_nextdoor_neighborhoods.describe()

In [None]:
def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1: return
        yield start
        start += len(sub) # use start += 1 to find overlapping matches

In [None]:
cities = ['San Jose','Santa Clara','Sunnyvale', 'Palo Alto','Mountain View',\
          'Cupertino','Milpitas','Los Gatos', 'Gilroy', 'Morgan Hill', 'Campbell',\
          'Los Altos','Saratoga', 'Stanford', 'Los Altos Hills',\
          'San Martin']

In [None]:
def findAllTheNearbyNeighborhoods(neighborhood_info):
    print("findAllTheNearbyNeighborhoods")
    global df_nextdoor_neighborhoods
    nearbyNeighborhoods = neighborhood_info[neighborhood_info.index('nearbyNeighborhoods: '): neighborhood_info.index('neighborhoodGeometriesJSON:')]
    nearbyNeighborhoods = nearbyNeighborhoods.lstrip()
    nearbyNeighborhoods = nearbyNeighborhoods.rstrip()
    begIndex = len('nearbyNeighborhoods: ')
    nearbyNeighborhoods = nearbyNeighborhoods[begIndex: -1]
    nearby_neighborhood_json = json.loads(nearbyNeighborhoods)
    for nearby_neighborhood in nearby_neighborhood_json:
        if (nearby_neighborhood['city'] in cities):
            nearby_neighborhood['shortName'] = re.sub('\s+', ' ', nearby_neighborhood['shortName']).strip()
            no_entry = df_nextdoor_neighborhoods.loc[df_nextdoor_neighborhoods['Neighborhood'] == nearby_neighborhood['shortName']]
            print(nearby_neighborhood)
            if (len(no_entry) == 0):
                print("Appending: ", nearby_neighborhood)
                df_nextdoor_neighborhoods = df_nextdoor_neighborhoods.append(
                    {"State": nearby_neighborhood['state'],
                     "County": "Santa Clara",
                     "City": nearby_neighborhood['city'],
                     "Neighborhood": nearby_neighborhood['shortName'],
                     "Link": "",
                     "Nextdoor ID": 0,
                     "Geometry": "",
                     "Interests": "",
                     "Gardening Interest": 0,
                     "Percentage of Homeowners": 0,
                     "Number of Residents": 0,
                     "Average Age": 0,
                     "Attributes": ""
                    }, ignore_index=True)
    print("findAllTheNearbyNeighborhoods")

In [None]:
# Find the id of the neighborhood the page is for
def findTheNeighborhoodID(currentRowIndex, neighborhood_info):
    print("findTheNeighborhoodID")
    global df_nextdoor_neighborhoods
    neighborhoodMap = neighborhood_info[neighborhood_info.index('neighborhoodMapOptions:{'): -1]
    neighborhoodMap = neighborhoodMap.lstrip()
    neighborhoodMap = neighborhoodMap.rstrip()
    begIDIndex = list(find_all(neighborhoodMap, "hoodId: "))   
    assert(len(begIDIndex) == 1)
    endIDIndex = begIDIndex[0] + len ("hoodId: ") + 10
    hoodId = neighborhoodMap[begIDIndex[0] + len ("hoodId: "): endIDIndex]
    hoodId = hoodId.lstrip()
    hoodId = hoodId.rstrip()
    print(hoodId)
    hoodId = re.findall(r'^([\s\d]+)$', hoodId)
    print(hoodId)
    assert(len(hoodId) == 1)
    df_nextdoor_neighborhoods.iat[currentRowIndex, df_nextdoor_neighborhoods.columns.get_loc("Nextdoor ID")] = int(hoodId[0])
    print("findTheNeighborhoodID")

In [None]:
# Iterate over each neighborhood and update the URL and ID
def findAllTheNeighborhoodIDs(neighborhood_info):
    print("findAllTheNeighborhoodIDs")
    global df_nextdoor_neighborhoods
    neighborhoodMap = neighborhood_info[neighborhood_info.index('neighborhoodMapOptions:{'): -1]
    neighborhoodMap = neighborhoodMap.lstrip()
    neighborhoodMap = neighborhoodMap.rstrip()
    #print(neighborhoodMap)
    begURLIndexes = list(find_all(neighborhoodMap, "\"page_url\": \""))  
    endURLIndexes = list(find_all(neighborhoodMap, "\", \"short_name\": ")) 
    print(begURLIndexes)
    print(endURLIndexes)
    begShortNameIndexes = list(find_all(neighborhoodMap, "\"short_name\": \"")) 
    endShortNameIndexes = list(find_all(neighborhoodMap, "\", \"stroke_color\": ")) 
    print(begShortNameIndexes)
    print(endShortNameIndexes)
    begIDIndexes = list(find_all(neighborhoodMap, "\"id\": ")) 
    endIDIndexes = list(find_all(neighborhoodMap, ", \"geometry\": \"{")) 
    print(begIDIndexes)
    print(endIDIndexes)
    if ((len(begURLIndexes) == len(endURLIndexes)) and 
        (len(begShortNameIndexes) == len(endShortNameIndexes)) and 
        (len(begIDIndexes) == len(endIDIndexes))):
        for begShort, endShort, begURL, endURL, begID, endID in zip(begShortNameIndexes, endShortNameIndexes, 
                                                               begURLIndexes, endURLIndexes, 
                                                               begIDIndexes, endIDIndexes):
            print(begShort, endShort)
            print(begURL, endURL)
            print(begID, endID)
            shortname = neighborhoodMap[begShort + len ("\"short_name\": \""): endShort]
            shortname = re.sub('\s+', ' ', shortname).strip()
            print(shortname)
            entry = df_nextdoor_neighborhoods.loc[df_nextdoor_neighborhoods['Neighborhood'] == shortname]
            if (len(entry) == 1):
                pp.pprint(entry)
                index = int(df_nextdoor_neighborhoods.index[df_nextdoor_neighborhoods['Neighborhood'] == shortname][0])
                print(df_nextdoor_neighborhoods.iloc[index])
                if (len(df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("Link")]) == 0):
                    page_url = neighborhoodMap[begURL + len ("\"page_url\": \""): endURL]
                    print("Updating Page URL: " + page_url)
                    if (len(page_url) > 0):
                        df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("Link")] = page_url
                if (df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("Nextdoor ID")] == 0): 
                    neighborhood_id = neighborhoodMap[begID + len ("\"id\": "): endID]
                    print("ID: " + neighborhood_id)
                    df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("Nextdoor ID")] = int(neighborhood_id)
                print(df_nextdoor_neighborhoods.iloc[index])
                print("findAllTheNeighborhoodIDs loop")                                
    print("findAllTheNeighborhoodIDs")

In [385]:
# Iterate over each feature and update the row for that neighborhood name
def findAllTheNeighborhoodFeatures(neighborhood_info):
    print("findAllTheNeighborhoodFeatures")
    global df_nextdoor_neighborhoods
    neighborhoodGeometries = neighborhood_info[neighborhood_info.index('neighborhoodGeometriesJSON: '): neighborhood_info.index('neighborhoodMapOptions:{')]
    neighborhoodGeometries = neighborhoodGeometries.lstrip()
    neighborhoodGeometries = neighborhoodGeometries.rstrip()
    begIndex = len('neighborhoodGeometriesJSON: ')
    neighborhoodGeometries = neighborhoodGeometries[begIndex: -1]
    neighborhood_json = json.loads(neighborhoodGeometries)
    for feature in neighborhood_json["features"]:
        props = feature["properties"]
        props["hood_name"] = re.sub('\s+', ' ', props['hood_name']).strip()
        print("Name: %s, ID: %s \n" % (props["hood_name"], props["hood_id"]))
        entry = df_nextdoor_neighborhoods.loc[(df_nextdoor_neighborhoods['Neighborhood'] == props["hood_name"]) & 
                                              (df_nextdoor_neighborhoods['Nextdoor ID'] == props["hood_id"])]
        print(entry)
        if (len(entry) == 1):
            index = int(df_nextdoor_neighborhoods.index[df_nextdoor_neighborhoods['Neighborhood'] == props["hood_name"]][0])
            print(df_nextdoor_neighborhoods.iloc[index])
            #Save the feature for that neighborhood without the featurecollection 
            print("Geometry: " + str(df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("Geometry")]))
            if (len(df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("Geometry")]) == 0):
                feature['properties']['State'] = df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("State")]
                feature['properties']['City'] = df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("City")]
                feature['properties']['County'] = "Santa Clara"
                df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("Geometry")] = feature
            if (df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("Nextdoor ID")] == 0): 
                df_nextdoor_neighborhoods.iat[index, df_nextdoor_neighborhoods.columns.get_loc("Nextdoor ID")] = props["hood_id"]
            print(df_nextdoor_neighborhoods.iloc[index])
        print("findAllTheNeighborhoodFeatures loop")
    print("findAllTheNeighborhoodFeatures")

In [None]:
def iterateOneNeighborhood(currentRowIndex):
    neighborhood_info = ""
    if len(df_nextdoor_neighborhoods.iloc[currentRowIndex]["Link"]) > 0:
        neighborhood_lookup = df_nextdoor_neighborhoods.iloc[currentRowIndex]["Link"]
        response = requests.get(neighborhood_lookup)
        responseTxt = response.text.encode('UTF-8')
        html_soup = BeautifulSoup(responseTxt, 'html.parser')
     
        try:
            # find the variable with all the neighborhood info
            scripts = html_soup.findAll('script')
            neighborhood_info = scripts[5].string.strip()
            neighborhood_info = neighborhood_info[neighborhood_info.index('['):-1]
            
            findTheNeighborhoodID(currentRowIndex, neighborhood_info)
            pp.pprint(df_nextdoor_neighborhoods.iloc[currentRowIndex])
        
            findAllTheNearbyNeighborhoods(neighborhood_info)
            pp.pprint(df_nextdoor_neighborhoods.iloc[currentRowIndex])

            findAllTheNeighborhoodIDs(neighborhood_info)
            pp.pprint(df_nextdoor_neighborhoods.iloc[currentRowIndex])

            findAllTheNeighborhoodFeatures(neighborhood_info)
            pp.pprint(df_nextdoor_neighborhoods.iloc[currentRowIndex])

            # Update with the interest info
            try:
                begInterestIndex = neighborhood_info.index('interests: [')
                endInterestIndex = neighborhood_info.index('iosUrl: "https://')
                if ((begInterestIndex > 0) and (endInterestIndex > 0)):
                    interests = neighborhood_info[begInterestIndex: endInterestIndex]
                    interests = interests[interests.index('[')+1: interests.index(']')]
                    interests = interests.replace('"', "")
                    interest_list = interests.split(", ")
                    print(interest_list)
                    df_nextdoor_neighborhoods.iat[currentRowIndex, df_nextdoor_neighborhoods.columns.get_loc("Interests")] = interest_list
                    if ('Gardening & Landscape' in interest_list):
                        df_nextdoor_neighborhoods.iat[currentRowIndex, df_nextdoor_neighborhoods.columns.get_loc("Gardening Interest")] = interest_list.index('Gardening & Landscape')
            except:
                print("No interests found for: ", df_nextdoor_neighborhoods.iloc[currentRowIndex]["Neighborhood"])
        
            # Update with the attribute info
            try:
                begAttrIndex = neighborhood_info.index('attributes: [')
                endAttrIndex = neighborhood_info.index('census:')
                if ((begAttrIndex > 0) and (endAttrIndex > 0)):
                    attributes = neighborhood_info[begAttrIndex: endAttrIndex]
                    attributes = attributes[attributes.index('[')+1: attributes.index(']')]
                    attributes = attributes.replace('"', "")
                    attributes_list = attributes.split(", ")
                    print(attributes_list)
                    df_nextdoor_neighborhoods.iat[currentRowIndex, df_nextdoor_neighborhoods.columns.get_loc("Attributes")] = attributes_list
            except:
                print("No attributes found for: ", df_nextdoor_neighborhoods.iloc[currentRowIndex]["Neighborhood"])
           
            # Update with the census info
            try:
                begCensusIndex = neighborhood_info.index('census: {')
                endCensusIndex = neighborhood_info.index('city: ')
                if ((begCensusIndex > 0) and (endCensusIndex > 0)):
                    census = neighborhood_info[begCensusIndex: endCensusIndex]
                    census = census[census.index('{'): census.index('}')+1]
                    census = json.loads(census)
                    print("Census: %s\n" % str(census))
                    df_nextdoor_neighborhoods.iat[currentRowIndex, df_nextdoor_neighborhoods.columns.get_loc("Percentage of Homeowners")] = int(census['homeowners'])
                    df_nextdoor_neighborhoods.iat[currentRowIndex, df_nextdoor_neighborhoods.columns.get_loc("Number of Residents")] = int(census['population'])
                    df_nextdoor_neighborhoods.iat[currentRowIndex, df_nextdoor_neighborhoods.columns.get_loc("Average Age")] = int(census['age'])
            except:
                print("No census found for: ", df_nextdoor_neighborhoods.iloc[currentRowIndex]["Neighborhood"])
        
        except:
            print("Error parsing")
    return (neighborhood_info)

In [386]:
neighborhood_info = iterateOneNeighborhood(464)

findTheNeighborhoodID
185699
['185699']
findTheNeighborhoodID
State                                                                      CA
County                                                            Santa Clara
City                                                                Sunnyvale
Neighborhood                                                       Las Palmas
Link                        https://nextdoor.com/neighborhood/laspalmasca-...
Nextdoor ID                                                            185699
Geometry                                                                     
Interests                   [Hiking & Trails, Gardening & Landscape, Walki...
Gardening Interest                                                          1
Percentage of Homeowners                                                   42
Number of Residents                                                      3477
Average Age                                                                48
At

445  [Family Friendly, Peaceful, Safe, Neighbors, Q...  
State                                                                      CA
County                                                            Santa Clara
City                                                                Sunnyvale
Neighborhood                                                     Cherry Chase
Link                        https://nextdoor.com/neighborhood/cherrychase-...
Nextdoor ID                                                            193410
Geometry                    {'type': 'Feature', 'geometry': {'type': 'Mult...
Interests                   [Hiking & Trails, Home Improvement & DIY, Gard...
Gardening Interest                                                          2
Percentage of Homeowners                                                   73
Number of Residents                                                      4067
Average Age                                                                52
Attribu

In [None]:
findAllTheNeighborhoodIDs(neighborhood_info)

In [387]:
findAllTheNeighborhoodFeatures(neighborhood_info)

findAllTheNeighborhoodFeatures
Name: Las Palmas, ID: 185699 

    State       County       City Neighborhood  \
464    CA  Santa Clara  Sunnyvale   Las Palmas   

                                                  Link  Nextdoor ID Geometry  \
464  https://nextdoor.com/neighborhood/laspalmasca-...       185699            

                                             Interests  Gardening Interest  \
464  [Hiking & Trails, Gardening & Landscape, Walki...                   1   

     Percentage of Homeowners  Number of Residents  Average Age  \
464                        42                 3477           48   

                                            Attributes  
464  [Family Friendly, Walkability, Friendly, Parks...  
State                                                                      CA
County                                                            Santa Clara
City                                                                 San Jose
Neighborhood                        

In [382]:
print(neighborhood_info)

[{"agencyName": "Department of Public Safety", "hostname": "laspalmasca", "subject": "Sunnyvale Police Activities League\u2019s Annual Dodge Ackerman Memorial Bouts", "postId": "102549591", "urlSlug": "department-of-public-safety-1", "avatarPhoto": "https://d3926qxcw0e1bh.cloudfront.net/pages_avatar_photos/6a/e0/6ae0e732d244e951e886ea245f4494c7.png", "groupId": "656706"}, {"agencyName": "Santa Clara Valley Water District", "hostname": "laspalmasca", "subject": "Santa Clara Valley Water District is now Valley Water", "postId": "102546798", "urlSlug": "santa-clara-valley-water-district-1", "avatarPhoto": "https://d3926qxcw0e1bh.cloudfront.net/pages_avatar_photos/9e/06/9e06ad113a526db9e023ef8b59bea698.jpg", "groupId": "1458946"}, {"agencyName": "Santa Clara Valley Water District", "hostname": "laspalmasca", "subject": "Santa Clara Valley Water District is now Valley Water", "postId": "102510087", "urlSlug": "santa-clara-valley-water-district-1", "avatarPhoto": "https://d3926qxcw0e1bh.clou

In [None]:
df_nextdoor_neighborhoods["Neighborhood"].nunique()

In [380]:
pp.pprint(df_nextdoor_neighborhoods.iloc[464])

State                                                                      CA
County                                                            Santa Clara
City                                                                Sunnyvale
Neighborhood                                                       Las Palmas
Link                        https://nextdoor.com/neighborhood/laspalmasca-...
Nextdoor ID                                                            185699
Geometry                                                                     
Interests                   [Hiking & Trails, Gardening & Landscape, Walki...
Gardening Interest                                                          1
Percentage of Homeowners                                                   42
Number of Residents                                                      3477
Average Age                                                                48
Attributes                  [Family Friendly, Walkability, Frien

In [None]:
df_nextdoor_neighborhoods.head(10)

In [None]:
df_nextdoor_neighborhoods.tail(10)

In [388]:
df_nextdoor_neighborhoods.to_csv('mg_nextdoor_withprops_master.csv', index=False)

In [None]:
currentIndex = 612
#endIndex = (len(df_nextdoor_neighborhoods)/2) + currentIndex)
endIndex = len(df_nextdoor_neighborhoods)
print("Number of Neighborhoods: " + str(len(df_nextdoor_neighborhoods)))
print("End Index: " , str(endIndex))
while True:
    print("Current Index: " + str(currentIndex))
    iterateOneNeighborhood(currentIndex) 
    currentIndex += 1
    if currentIndex > endIndex:
        break

### Quality

##### Data Quality Dimensions
Completeness: Do we have all of the records that we should? Do we have missing records or not? Are there specific rows, columns, or cells missing? 

Validity: We have the records, but they're not valid, i.e., they don't conform to a defined schema. A schema is a defined set of rules for data. These rules can be real-world constraints (e.g. negative height is impossible) and table-specific constraints (e.g. unique key constraints in tables). 

Accuracy: Inaccurate data is wrong data that is valid. It adheres to the defined schema, but it is still incorrect. Example: a patient's weight that is 5 lbs too heavy because the scale was faulty. 

Consistency: Inconsistent data is both valid and accurate, but there are multiple correct ways of referring to the same thing. Consistency, i.e., a standard format, in columns that represent the same data across tables and/or within tables is desired. 

##### `df_tweets` Table Actions 
- Blah Blah
##### `df_tweet_additional_info` Table Actions
- Remove retweets.
- 

### Tidiness
##### Tidiness Criteria:
- Each variable you measure should be in one column.
- Each different observation of that variable should be in a different row.
- There should be one table for each “kind” of variable.
- Related tables should each include a column that allows them to be linked.
##### Tidiness Actions
- 

## Clean

##### Define
Remove tweets from the df_tweets and df_tweet_additional_info tables that do not have images in the df_image_predict table.

##### Code

##### Test

## Storing

Store the clean DataFrame in a CSV file named SantaClaraMasterGardener_master.csv.

## Analysis and Visualization

### Viz 1: