In [1]:
#imports
from geopy.geocoders import Nominatim
import pandas as pd
import numpy as np
import re

In [2]:
# Quick check of the geocoder on a single place name.
# `geolocator` is reused further down for the manual coordinate look-ups.
geolocator = Nominatim(user_agent="get_location")

place_name = "Morongo Valley"
location = geolocator.geocode(place_name)
latitude, longitude = location.latitude, location.longitude

for label, value in (("Latitude:", latitude), ("Longitude:", longitude)):
    print(label, value)

Latitude: 34.0537585
Longitude: -116.59672363338927


In [3]:
def get_coordinate(query: str):
    """
    Geocode a free-text query with Nominatim.

    Returns a ``(latitude, longitude)`` tuple taken from the geocoder's
    result, or ``None`` when the geocoder returns no result for ``query``.
    """
    match = Nominatim(user_agent="get_location").geocode(query)
    if match is None:
        return None
    return match.latitude, match.longitude

def partition_by_caps(input):
    """
    Split a string into chunks that each start at an uppercase ASCII letter.

    Every chunk is one letter A-Z followed by all characters up to (not
    including) the next A-Z letter. Text before the first uppercase letter
    is discarded. Returns a list of chunks (empty if there is no A-Z letter).
    """
    return re.findall('[A-Z][^A-Z]*', input)

In [4]:
# Extract the variable (column) names from the dataset description file.
# The `with` block closes the file handle as soon as the lines are read,
# even if reading raises.
with open("communities.names") as file:
    content = file.readlines()

# Only lines 76-203 (0-based slice 75:203) are used; each is expected to look
# like "@attribute <name> numeric" or "@attribute <name> string".
# NOTE(review): the slice bounds are tied to this exact file layout -- confirm
# if the description file ever changes.
variables_initial = content[75:203]

# Strip the leading "@attribute " keyword and the trailing type keyword so
# that only the bare variable name remains.
pat1 = r'\snumeric$'
pat2 = r'^@attribute\s'
pat3 = r'\sstring$'
combined_pat = r'|'.join((pat1, pat2, pat3))
variables = [re.sub(combined_pat, '', name.strip()) for name in variables_initial]

In [5]:
# Load the header-less data file, label its columns with the extracted
# variable names, and save a copy that includes the header row.
raw_frame = pd.read_csv('communities_no_title.csv', header=None, index_col=None)
df = raw_frame.set_axis(variables, axis=1)
df.to_csv('communities_with_title.csv', index=False)

In [6]:
# Show the community names whose last four characters are 'alle'
# (presumably a cut-off "...Valley" -- see the output below).
ends_with_alle = df['communityname'].str[-4:] == 'alle'
df.loc[ends_with_alle, 'communityname']

1655    TwentyninePalms-MorongoValle
Name: communityname, dtype: object

In [7]:
# Separator regex: a comma with optional whitespace on either side.
# Written as a raw string so that `\s` reaches the regex engine unchanged
# instead of being parsed as an (invalid) string escape sequence.
sep = r'\s*,\s*'

# Read CSV file using custom separator (a regex separator needs the python engine)
state_code_df = pd.read_csv('us-state-code.csv', sep=sep, engine='python')
def get_state_name(code:int):
    """
    Look up the state name for a state code.

    Selects the rows of ``state_code_df`` whose ``st`` column equals ``code``
    and returns their ``stname`` value(s) rendered as a string without the
    index. No special handling is done when no row matches.
    """
    return state_code_df.query('st=='+str(code))['stname'].to_string(index=False)

get_state_name(42)

'Pennsylvania'

In [8]:
# Translate every row's state code into the corresponding state name.
df['state_name'] = [get_state_name(code) for code in df['state']]

In [9]:
# Community-type suffixes that may be glued to the end of a community name.
# 'Valle' is included for the cut-off name "TwentyninePalms-MorongoValle".
types = ['city', 'township', 'town', 'borough', 'village', 'division', 'district', 'Valle']

# Split every community name into its capitalised parts, e.g.
# "TwentyninePalms-MorongoValle" -> ['Twentynine', 'Palms-', 'Morongo', 'Valle'].
city_lst = df['communityname'].apply(partition_by_caps).to_list()

type_list = []
for city in city_lst:
    matched_type = None
    # A name without any uppercase letter yields an empty list; leave it untyped.
    if city:
        for t in types:
            # Test for a real suffix (not just a substring) so that the check
            # agrees with removesuffix(): "Eatontownborough" must be typed as
            # 'borough', not as 'town' with its name left untouched.
            if city[-1].endswith(t):
                city[-1] = city[-1].removesuffix(t)
                matched_type = t
                break
    # Always append exactly one entry per row (None when no suffix matched)
    # so that type_list stays aligned with the DataFrame.
    type_list.append(matched_type)

df['type'] = type_list
df['city'] = city_lst

In [10]:
# Build one geocoding query per row: the name parts joined by single spaces,
# followed by ", " and the state name.
query_strs = [
    ' '.join(name_parts) + ', ' + state_name
    for name_parts, state_name in zip(df['city'], df['state_name'])
]

In [11]:
# # Geocoding every row takes ~20 min (this cell is commented out; coordinates.csv is read back in a later cell)
# latitudes = []
# longitudes = []
# for i in range(len(df['type'])):
#     try:
#         coor = get_coordinate(query_strs[i])
#         if coor is not None:
#             latitudes.append(coor[0])
#             longitudes.append(coor[1])
#         else:
#             latitudes.append(None)
#             longitudes.append(None)
#     except:
#         latitudes.append(None)
#         longitudes.append(None)

In [12]:
# pd.DataFrame({'latitude':latitudes,'longitude':longitudes}).to_csv('coordinates.csv')

In [13]:
# pd.read_csv('coordinates.csv')[['latitude', 'longitude']].to_csv('coordinates.csv')

In [14]:
# Load the coordinates table from coordinates.csv (the commented-out cells
# above show how that file was produced).
coor = pd.read_csv('coordinates.csv')

In [15]:
# Count the missing values in each column of the coordinates table.
coor.isna().sum()

Unnamed: 0     0
latitude      12
longitude     12
dtype: int64

In [16]:
# Copy both coordinate columns from the coordinates table into the main frame.
for coord_col in ('latitude', 'longitude'):
    df[coord_col] = coor[coord_col]

In [17]:
# Join each row's name parts with single spaces to get one place-name string.
cities = [' '.join(name_parts) for name_parts in df['city']]
df['city_name'] = cities

In [18]:
# Drop the columns that are no longer needed and switch to the final names.
unused_cols = ['state', 'county', 'community', 'city', 'communityname']
renamed_cols = {
    'ViolentCrimesPerPop': 'violent_crime_rate',
    'state_name': 'state',
    'city_name': 'area',
}
df = df.drop(columns=unused_cols).rename(columns=renamed_cols)

# Put the identifying columns first; every other column keeps its relative order.
first_cols = ['area','type','state', 'latitude', 'longitude']
last_cols = [column for column in df.columns if column not in first_cols]
df = df[first_cols+last_cols]

In [19]:
# Hand-entered area names, keyed by row label.
# Row 1384 is geocoded as "La Palma, California" in the coordinate-fix cell,
# so its area is "La Palma" (not a second "Eatontown", which is row 1130).
area_fixes = {
    224: "Fond Du Lac",
    707: "McAlester",
    791: "LaGrange",
    1130: "Eatontown",
    1194: "Middletown",
    1208: "City of Orange",
    1384: "La Palma",
    1485: "Germantown",
    1655: "Twentynine Palms",
    1831: "DeLand",
}
for row_label, area_name in area_fixes.items():
    df.loc[row_label, 'area'] = area_name

In [20]:
# Geocoding query for each row whose coordinates are filled in by hand,
# keyed by row label.
manual_geocode_queries = {
    224: "Fond Du Lac, Wisconsin",
    707: "McAlester, Oklahoma",
    791: "LaGrange, Georgia",
    1060: "San Jose, CA",
    1130: "Eatontown, New Jersey",
    1194: "Middletown, New Jersey",
    1208: "City of Orange, New Jersey",
    1384: "La Palma, California",
    1485: "Germantown, Wisconsin",
    1655: "Twentynine Palms, California",
    1831: "DeLand, Florida",
    1962: "Santa Maria, CA",
}

for row_label, place_query in manual_geocode_queries.items():
    # Geocode each place once and reuse the result for both coordinates,
    # halving the number of requests sent to the geocoding service.
    place = geolocator.geocode(place_query)
    if place is None:
        # geocode() returns None when nothing is found; fail with a message
        # that names the query instead of an AttributeError on None.
        raise ValueError(f"Geocoding returned no result for {place_query!r} (row {row_label})")
    df.loc[row_label, 'latitude'] = place.latitude
    df.loc[row_label, 'longitude'] = place.longitude

In [21]:
# Order the rows by state name.
df = df.sort_values('state')
# Turn the '?' placeholder into a real missing value. Use the lowercase
# np.nan: the np.NaN alias was removed in NumPy 2.0.
df = df.replace('?', np.nan)
df

Unnamed: 0,area,type,state,latitude,longitude,fold,population,householdsize,racepctblack,racePctWhite,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,violent_crime_rate
225,Sheffield,city,Alabama,34.765089,-87.698641,2,0.00,0.31,0.46,0.63,...,0.02,0.13,0.01,,,,,0.0,,0.10
32,Auburn,city,Alabama,32.609857,-85.480783,1,0.04,0.37,0.32,0.70,...,0.09,0.09,0.01,,,,,0.0,,0.15
1280,Daphne,city,Alabama,30.603525,-87.903605,7,0.00,0.39,0.31,0.75,...,0.03,0.08,0.01,,,,,0.0,,0.05
1827,Hoover,city,Alabama,33.350377,-86.834403,10,0.05,0.35,0.06,0.93,...,0.07,0.14,0.00,,,,,0.0,,0.09
961,Tuscaloosa,city,Alabama,33.209561,-87.567526,5,0.11,0.43,0.69,0.43,...,0.14,0.14,0.03,0.06,0.02,0.74,1,1.0,0.12,0.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1568,Sheridan,city,Wyoming,44.797194,-106.956179,8,0.01,0.29,0.00,0.97,...,0.02,0.15,0.00,,,,,0.0,,0.06
1024,Evanston,city,Wyoming,41.268246,-110.963758,6,0.00,0.65,0.00,0.96,...,0.03,0.09,0.11,,,,,0.0,,0.11
1730,Gillette,city,Wyoming,44.290635,-105.501876,9,0.01,0.53,0.00,0.96,...,0.04,0.11,0.11,,,,,0.0,,0.15
589,Laramie,city,Wyoming,41.311367,-105.591101,3,0.03,0.40,0.02,0.90,...,0.03,0.20,0.02,,,,,0.0,,0.12


In [22]:
# Write the processed table to disk without the index column.
df.to_csv('processed_communities.csv', index=False)