# Clean Drawings

This code fixes an issue where drawings labelled Williamsbridge largely (25 - 17) represent the area of Williamsburg, presumably due to user error. It also recodes the 'Other' category: firstly, by reassigning 'Other' drawings whose free-text name (the 'otherNeighborhood' field) corresponds to an existing named neighborhood; and secondly, by creating new neighborhoods where appropriate (5 or more drawings exist). 'Other' entries that are unnamed, are comments (e.g. 'My House'), contain inappropriate values, fall outside the geographical context of New York City (e.g. 'Hempstead'), or seem plausible but are too few in total number (e.g. RAMBO) are removed.

In [1]:
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import shapely

%matplotlib inline

In [280]:
# DNAInfo "Draw Your Neighborhood" drawings, stored as GeoJSON.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
ny_data = r'C:\Users\djl543\OneDrive\Draw-Your-Neighborhood-master\NYC_prep_wgs84.geojson'
#ny_data = r'C:\Users\Dan\OneDrive\Draw-Your-Neighborhood-master\NYC_raw_wgs84.geojson'

# Coordinate reference systems used in this cell:
#   EPSG:4326  - WGS84, assigned to the input as read.
#   EPSG:32118 - New York Long Island, NAD83-based projection in metres.
wgs84_crs = {'init': 'epsg:4326'}
ny_long_island_crs = {'init':'epsg:32118'}

# Read the drawings and declare their CRS as WGS84 (this only sets the
# attribute; it does not reproject).
ny_nhoods = gpd.read_file(ny_data)
ny_nhoods.crs = wgs84_crs

# Transform to the projected coordinate system.
ny_nhoods = ny_nhoods.to_crs(ny_long_island_crs)

# Preview the first rows (displayed as the cell output).
ny_nhoods.head()

Unnamed: 0,geometry,neighborhoodLive,otherNeighborhood,shapeID,yearsLived
0,"POLYGON ((311834.6421283125 77460.83003325062,...",Allerton,,259,2
1,"POLYGON ((311305.0904452928 78317.74772849762,...",Allerton,,298,0
2,"POLYGON ((311189.369271409 78288.7412321083, 3...",Allerton,,5964,2
3,"POLYGON ((311199.2696228614 77331.72545917239,...",Allerton,,59957,15
4,"POLYGON ((311137.8304619553 78879.66082990178,...",Allerton,,61660,1


In [281]:
# Create the working label column 'nhood', initialised with the
# 'neighborhoodLive' values. Later cells write corrected labels to 'nhood',
# leaving 'neighborhoodLive' itself untouched.
ny_nhoods['nhood'] =  ny_nhoods['neighborhoodLive']

In [282]:
# Relabel the Williamsbridge drawings that actually depict Williamsburg.
# A drawing is relabelled when it is tagged Williamsbridge and the y value of
# its centroid is below 70000 (in the frame's projected coordinates).
tagged_williamsbridge = ny_nhoods['neighborhoodLive'] == "Williamsbridge"
centroid_y = ny_nhoods['geometry'].centroid.bounds['maxy']
ny_nhoods.loc[tagged_williamsbridge & (centroid_y < 70000), 'nhood'] = "Williamsburg"

In [283]:
# Working subset: every drawing whose 'neighborhoodLive' value is "Other".
# Rows are removed from this subset as they are dealt with in later cells.
chose_other = ny_nhoods['neighborhoodLive'] == "Other"
Other = ny_nhoods[chose_other]

In [284]:
# Manual lookup mapping free-text 'otherNeighborhood' entries (misspellings,
# case variants, alternative names) onto pre-existing neighborhood names.
# Keys are compared to the raw text exactly, so trailing spaces inside KEYS
# are deliberate; VALUES are the clean labels and must not carry stray spaces.
recode = {
    'Alphabet City': 'Alphabet City',
    'Alphabet city': 'Alphabet City',
    'alphabet city': 'Alphabet City',
    'Bath Beach': 'Bath Beach',
    'Bath beach ': 'Bath Beach',
    'Beechhurst': 'Beechhurst',
    'Belle Harbor ': 'Belle Harbor',
    'Bensonhurst': 'Bensonhurst',
    'Boerum Hill': 'Boerum Hill',
    'Central Harlem': 'Central Harlem',
    'Clinton': 'Hell\'s Kitchen / Clinton',
    'Concourse': 'Concourse',
    'Crown Heights': 'Crown Heights',
    'Flushing': 'Flushing',
    'Forte george': 'Fort George',
    'Gerritsen beach': 'Gerritsen Beach',
    'Gowanus': 'Gowanus',
    'Greenwood Heights': 'Greenwood Heights',
    'Hudson Heights': 'Hudson Heights',
    'Hells Kitchen': 'Hell\'s Kitchen / Clinton',
    'Inwood': 'Inwood',
    'Inwood Manhattan': 'Inwood',
    'Kensington': 'Kensington',
    'Kew Gardens': 'Kew Gardens',
    'Kips Bay': 'Kips Bay',
    'Mahattam Valley': 'Manhattan Valley',
    'Marble Hill': 'Marble Hill',
    'Marine Park': 'Marine Park',
    'Marine park ': 'Marine Park',
    'Midland beach': 'Midland Beach',
    'Midtown east': 'Midtown East',
    'Midwood ': 'Midwood',
    'NoMad': 'NoMad',
    'Peter Cooper Village': 'Stuyvesant Town',
    'Peter Cooper Village Stuyvesant Town': 'Stuyvesant Town',
    'ProspectLefferts Garden': 'Prospect-Lefferts Gardens',
    'SoundView': 'Soundview',
    'South ozone park': 'South Ozone Park',
    # Fixed: the target used to be 'South Slope ' (trailing space), which
    # produced a label distinct from 'South Slope'.
    'South slope': 'South Slope',
    'St George': 'St. George',
    'st George ': 'St. George',
    'Stuyvesant Town ': 'Stuyvesant Town',
    'They call it Flatiron but its not': 'Flatiron',
    'Thuglaston': 'Douglaston',
    'Windsor terrace': 'Windsor Terrace',
    'bedstuy': 'Bedford-Stuyvesant',
    'carnegie hill': 'Carnegie Hill',
    'central harlem': 'Central Harlem',
    'little italy': 'Little Italy',
    'nomad': 'NoMad',
    'riverdale': 'Riverdale',
    'upper west side': 'Upper West Side',
    'woodlawn': 'Woodlawn',
    'Seaport': 'South Street Seaport',
    'University heights': 'University Heights',
    'Saint Albans': 'St. Albans',
}
recode_keys = list(recode)

# Report how many of the remaining 'Other' drawings are recoded. Summing the
# boolean mask counts rows directly; the print form below gives the same
# output on Python 2 and Python 3.
other_recoded = Other['otherNeighborhood'].isin(recode_keys)
print("Recoded drawings: %d" % other_recoded.sum())

# Recode the 'nhood' field in the main dataset in a single pass: every row
# whose 'otherNeighborhood' text is a lookup key receives the mapped name.
# NOTE(review): as before, this matches on 'otherNeighborhood' for all rows,
# not only rows whose 'neighborhoodLive' is 'Other' - confirm that is intended.
main_recoded = ny_nhoods['otherNeighborhood'].isin(recode_keys)
ny_nhoods.loc[main_recoded, 'nhood'] = ny_nhoods.loc[main_recoded, 'otherNeighborhood'].map(recode)

# Remove the recoded drawings from further consideration in the Other dataset.
Other = Other[~other_recoded]

Recoded drawings: 62


In [285]:
# Remove comments and empty, invalid, or inappropriate values.
# Includes neighborhoods in Nassau or Westchester Counties, and New Jersey State.
# Entries are matched against the raw 'otherNeighborhood' text exactly, so
# case variants and trailing spaces are listed separately.
exclude = ['','a','Durham NC','Stockholm','My neighborhood','Five Towns','Poop','Fleetwood',
           'manhasset','garden city park','Kearny','New Amsterdam','No Mans Land','Yonkers',
           'Elmont','Valley Stream','Malverne','Journal Square','Great neck','Edgewater',
           'Rebeca','Hoboken ','Im just curios','hi','Great Neck','Highland Park Minnesota',
           'Paulus Hook','Wanker','Where I want to live','Floral park','My House','Hoboken',
           'uaBBar','Hempstead','just want to see map','Rockville Centre','No name',
           'Downtown Jersey City','dick butt','Dickbutt','Harsimus Cove','Crestwood','Los Angeles',
           'Floral Park','Carteret','Cedarhurst','Great Neck ','Five towns','Remsen Village',
           'Floral park ','Oceanside','Bergen Square','West New York']

# Rows of the Other dataset whose free text is on the exclusion list.
# Summing the boolean mask counts rows directly; the print form below gives
# the same output on Python 2 and Python 3.
other_excluded = Other['otherNeighborhood'].isin(exclude)
print("Deleted Drawings: %d" % other_excluded.sum())

# Remove from Other Dataset
Other = Other[~other_excluded]

# Remove from main dataset. Only rows whose 'neighborhoodLive' is 'Other' are
# dropped: the list contains the empty string, so without this restriction any
# row with an empty 'otherNeighborhood' would be deleted as well.
main_excluded = (ny_nhoods['neighborhoodLive'] == 'Other') & ny_nhoods['otherNeighborhood'].isin(exclude)
ny_nhoods = ny_nhoods[~main_excluded]

Deleted Drawings: 110


In [286]:
# Define Neighborhoods where sufficient drawings exist (5+)
# 1. Bloomingdale (8 drawings; Manhattan)
# 2. Glen Oaks (7 drawings; Queens)
# 3. Harlem (13 drawings; Manhattan)
# 4. Hollis Hills (14 drawings; Queens)
# 5. Madison (5 drawings; Manhattan)
# 6. Manhattanville (7 drawings; Manhattan)
# 7. Mapleton (6 drawings; Brooklyn)
# 8. South Harlem (10 drawings; Manhattan)
# 9. South Williamsburg (10 drawings; Brooklyn)
#10. South Village (5 drawings; Manhattan)
#11. Two Bridges (6 drawings; Manhattan)

# Lookup from raw 'otherNeighborhood' text (including misspellings, case
# variants and trailing spaces) to the name of the NEW neighborhood it joins.
new_nhood =  {'Bloomingdale':'Bloomingdale','Bloomingdale ':'Bloomingdale','Bloomingdale District':'Bloomingdale',
              'Glen Oaks':'Glen Oaks','Glen oaks':'Glen Oaks','glen oaks':'Glen Oaks',
              'Harlem':'Harlem','Harlem ':'Harlem','harlem':'Harlem',
              'Hollis Hiils':'Hollis Hills','Hollis Hills':'Hollis Hills','Hollis hills':'Hollis Hills','Madison':'Madison',
              'Manhattanville':'Manhattanville','manhattanville':'Manhattanville', 'Mapleton':'Mapleton',
              'South Harlem':'South Harlem','South Harlem ':'South Harlem','Sons south harlem':'South Harlem',
              'South harlem':'South Harlem','SoHa':'South Harlem','south harlem  Central Park north':'South Harlem',
              'South Williamsburg':'South Williamsburg','South Williamsburg ':'South Williamsburg',
              'south williamsburg':'South Williamsburg','Southside  Williamsburg':'South Williamsburg',
              'Southside ':'South Williamsburg','Los Sures':'South Williamsburg', 'South Village':'South Village',
              'The South Village':'South Village','Two Bridges':'Two Bridges','Two Bridges ':'Two Bridges'
             }
new_nhood_keys = list(new_nhood)

# Report how many of the remaining 'Other' drawings join a new neighborhood.
# Summing the boolean mask counts rows directly; the print form below gives
# the same output on Python 2 and Python 3.
other_assigned = Other['otherNeighborhood'].isin(new_nhood_keys)
print("Drawings assigned to new neighborhoods: %d" % other_assigned.sum())

# Set the 'nhood' field in the main dataset to the new neighborhood name, in a
# single pass over every row whose 'otherNeighborhood' text is a lookup key.
main_assigned = ny_nhoods['otherNeighborhood'].isin(new_nhood_keys)
ny_nhoods.loc[main_assigned, 'nhood'] = ny_nhoods.loc[main_assigned, 'otherNeighborhood'].map(new_nhood)

# Remove the assigned drawings from further consideration in the Other dataset.
Other = Other[~other_assigned]


Drawings assigned to new neighborhoods: 90


In [287]:
# Drop every drawing still labelled 'Other' in 'nhood', i.e. those that were
# neither recoded to an existing neighborhood nor assigned to a new one.
ny_nhoods = ny_nhoods[ny_nhoods['nhood']!='Other']

# len() gives the number of rows regardless of missing values (count()[0]
# only counted non-null cells of the first column). The print form below
# gives the same output on Python 2 and Python 3.
print("Total rows: %d" % len(ny_nhoods))

Total rows: 40813


In [288]:
# Remove pre-existing neighborhoods with fewer than 5 drawings.
# The list is maintained by hand; to re-derive it, inspect the per-label counts:
#ny_nhoods.groupby('nhood').size().sort_values(ascending=False)
remove = ['Charleston','Unionport','Chelsea (Staten Island)','Edenwald','Greenridge','Ocean Breeze',
          'Rochdale Village','Utopia','Bloomfield','Pomonok','Seaside']

ny_nhoods = ny_nhoods[~ny_nhoods['nhood'].isin(remove)]

# len() gives the number of rows regardless of missing values (count()[0]
# only counted non-null cells of the first column). The print form below
# gives the same output on Python 2 and Python 3.
print("Total rows: %d" % len(ny_nhoods))

Total rows: 40784


In [290]:
# Export the analytical dataset: reproject to WGS84 (EPSG:4326) and write it
# as GeoJSON into the current working directory.
output_path = 'NYC_Analysis_wgs84.geojson'
ny_nhoods = ny_nhoods.to_crs({'init': 'epsg:4326'})
ny_nhoods.to_file(output_path, driver='GeoJSON')