# Introduction/Business Problem
 

Stakeholder: A firm looking for new business oppurtunities anywhere in the US. 

Idea: The stakeholders believe businesses that are popular in NYC eventually become popular in other US cities.
The stakeholders asked us to identify popular businesses in NYC that have not yet caught on in other cities, so they can invest those businesses. 

Approach: 

1) Identify cities most similar to NYC in terms of businesses already open. The idea is that cities most similar to NYC will be more likely to embrance new businesses that are currently popular in NYC. 
          
2) From the cities most similar to NYC, identify businesses NYC businesses that are and are not in those cities. 
        -from these lists a city and business will be chosen based on the preferences of the stakeholders.
        
3) Identify a neighborhood in the chosen city that is most similar to a NYC neighborhood with high numbers of the chosen business

# Data


In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [3]:
from bs4 import BeautifulSoup
import requests

# get US cities and their latitudes and longitudes, from wiki

In [4]:
#List of cities from wiki
html_doc= "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"
html= requests.get(html_doc).text
soup= BeautifulSoup(html,'lxml')
table = soup.find_all('table')[4]
data_list = pd.read_html(str(table))
df=pd.DataFrame(data_list[0])
df.columns = df.iloc[0]
df.drop([0], inplace = True)
#keep city name and coordinates
df=df.iloc[:,[1,10]]
df.columns = df.columns.fillna('Coordinates') 
df


Unnamed: 0,City,Coordinates
1,New York[d],40°39′49″N 73°56′19″W﻿ / ﻿40.6635°N 73.9387°W
2,Los Angeles,34°01′10″N 118°24′39″W﻿ / ﻿34.0194°N 118.4108°W
3,Chicago,41°50′15″N 87°40′54″W﻿ / ﻿41.8376°N 87.6818°W
4,Houston[3],29°47′12″N 95°23′27″W﻿ / ﻿29.7866°N 95.3909°W
5,Phoenix,33°34′20″N 112°05′24″W﻿ / ﻿33.5722°N 112.0901°W
6,Philadelphia[e],40°00′34″N 75°08′00″W﻿ / ﻿40.0094°N 75.1333°W
7,San Antonio,29°28′21″N 98°31′30″W﻿ / ﻿29.4724°N 98.5251°W
8,San Diego,32°48′55″N 117°08′06″W﻿ / ﻿32.8153°N 117.1350°W
9,Dallas,32°47′36″N 96°45′59″W﻿ / ﻿32.7933°N 96.7665°W
10,San Jose,37°17′48″N 121°49′08″W﻿ / ﻿37.2967°N 121.8189°W


In [5]:
#Clean coordinates and create lat and long columns
df['Coords'] = df['Coordinates'].str.split('/ ').str[1]
df['Latitude']=df['Coords'].str.split('°N ').str[0]
df['Longitude']=df['Coords'].str.split('°N ').str[1]
df['Longitude']=df['Longitude'].str.split('°W').str[0]
df['Longitude'] = df['Longitude'].astype(float)
#There was some weird character(s) in front of latitude that prevented it from being converted into string
#So line of code below eliminates it
df['Latitude']=df['Latitude'].str[1:]
df['Latitude'] = df['Latitude'].astype(float)
#Longitude from wiki was positive but denoted as west. So converted to proper negative values
df['Longitude'] = df['Longitude']*-1
cities=df[['City', 'Latitude', 'Longitude']]
cities=cities.head(100)
cities

Unnamed: 0,City,Latitude,Longitude
1,New York[d],40.6635,-73.9387
2,Los Angeles,34.0194,-118.4108
3,Chicago,41.8376,-87.6818
4,Houston[3],29.7866,-95.3909
5,Phoenix,33.5722,-112.0901
6,Philadelphia[e],40.0094,-75.1333
7,San Antonio,29.4724,-98.5251
8,San Diego,32.8153,-117.135
9,Dallas,32.7933,-96.7665
10,San Jose,37.2967,-121.8189


In [6]:
# create map of US using latitude and longitude values
latitude=40.6635
longitude=-115.9387
map_US = folium.Map(location=[latitude, longitude], zoom_start=3.4)

# add markers to map
for lat, lng, city in zip(cities['Latitude'], cities['Longitude'], cities['City']):
    label = '{}'.format(city)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_US)  
    
map_US

In [7]:
#name variables to use in 4square url
CLIENT_ID = 'O2WP1ZDLIE2MELGB1CCKTMKCQCXTPYB2LUOZKN3EFEV1MOW5' # your Foursquare ID
CLIENT_SECRET = 'RAL1PSZQSDGDJ5S43KIUXZ0YDHXJP05XK0GMXLOUQJ0RTNC0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: O2WP1ZDLIE2MELGB1CCKTMKCQCXTPYB2LUOZKN3EFEV1MOW5
CLIENT_SECRET:RAL1PSZQSDGDJ5S43KIUXZ0YDHXJP05XK0GMXLOUQJ0RTNC0


# get top 10 most common venues for each city

In [8]:
#creat a function to explore venues in a neighborhood and return a dataframe with venue and category
radius = 50000 
LIMIT=100
def getNearbyVenues(names, latitudes, longitudes, radius=100000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [9]:
#call the function on our neighborhoods using their lats and longs
US_venues = getNearbyVenues(names=cities['City'],
                                   latitudes=cities['Latitude'],
                                   longitudes=cities['Longitude'])

New York[d]
Los Angeles
Chicago
Houston[3]
Phoenix
Philadelphia[e]
San Antonio
San Diego
Dallas
San Jose
Austin
Jacksonville[f]
Fort Worth
Columbus
San Francisco[g]
Charlotte
Indianapolis[h]
Seattle
Denver[i]
Washington[j]
Boston
El Paso
Detroit
Nashville[k]
Portland
Memphis
Oklahoma City
Las Vegas
Louisville[l]
Baltimore[m]
Milwaukee
Albuquerque
Tucson
Fresno
Mesa
Sacramento
Atlanta
Kansas City
Colorado Springs
Miami
Raleigh
Omaha
Long Beach
Virginia Beach[m]
Oakland
Minneapolis
Tulsa
Arlington
Tampa
New Orleans[n]
Wichita
Cleveland
Bakersfield
Aurora
Anaheim
Honolulu[b]
Santa Ana
Riverside
Corpus Christi
Lexington[o]
Stockton
Henderson
Saint Paul
St. Louis[m]
Cincinnati
Pittsburgh
Greensboro
Anchorage[p]
Plano
Lincoln
Orlando
Irvine
Newark
Toledo
Durham
Chula Vista
Fort Wayne
Jersey City
St. Petersburg
Laredo
Madison
Chandler
Buffalo
Lubbock
Scottsdale
Reno
Glendale
Gilbert[q]
Winston–Salem
North Las Vegas
Norfolk[m]
Chesapeake[m]
Garland
Irving
Hialeah
Fremont
Boise[r]
Richmond[m]
B

In [10]:
#call shape to tell us how many venues there are
print(US_venues.shape)
#determine how many unique venue categories there are
print('There are {} uniques categories.'.format(len(US_venues['Venue Category'].unique())))
US_venues.head()

(10000, 7)
There are 353 uniques categories.


Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,New York[d],40.6635,-73.9387,Covenhoven,40.675143,-73.960203,Beer Bar
1,New York[d],40.6635,-73.9387,Kings Theatre,40.64611,-73.957175,Theater
2,New York[d],40.6635,-73.9387,Brooklyn Botanic Garden,40.667622,-73.963191,Botanical Garden
3,New York[d],40.6635,-73.9387,Prospect Park,40.661971,-73.971226,Park
4,New York[d],40.6635,-73.9387,Long Meadow,40.668758,-73.970304,Field


In [11]:
# one hot encoding
US_onehot = pd.get_dummies(US_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
US_onehot['City'] = US_venues['City'] 

# move neighborhood column to the first column
fixed_columns = [US_onehot.columns[-1]] + list(US_onehot.columns[:-1])
US_onehot = US_onehot[fixed_columns]

In [12]:
US_onehot.head()
#US_onehot.shape

Unnamed: 0,City,Accessories Store,Adult Boutique,Afghan Restaurant,Airport Service,American Restaurant,Amphitheater,Antique Shop,Aquarium,Arcade,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,BBQ Joint,Bagel Shop,Bakery,Bar,Baseball Field,Baseball Stadium,Basketball Court,Basketball Stadium,Bavarian Restaurant,Beach,Bed & Breakfast,Beer Bar,Beer Garden,Beer Store,Belgian Restaurant,Big Box Store,Bike Shop,Bistro,Board Shop,Boat or Ferry,Bookstore,Botanical Garden,Boutique,Bowling Alley,Brazilian Restaurant,Breakfast Spot,Brewery,Bridal Shop,Bridge,Buddhist Temple,Buffet,Building,Burger Joint,Burrito Place,Business Service,Butcher,Café,Cajun / Creole Restaurant,Canal,Candy Store,Capitol Building,Caribbean Restaurant,Casino,Cemetery,Cheese Shop,Chinese Restaurant,Chocolate Shop,Church,Churrascaria,Climbing Gym,Clothing Store,Club House,Cocktail Bar,Coffee Shop,College Academic Building,College Arts Building,College Auditorium,College Baseball Diamond,College Basketball Court,College Bookstore,College Football Field,College Gym,College Quad,College Rec Center,College Residence Hall,College Theater,Comedy Club,Comfort Food Restaurant,Comic Shop,Community Center,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cuban Restaurant,Cupcake Shop,Cycle Studio,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Distillery,Dive Bar,Dive Spot,Dog Run,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,English Restaurant,Ethiopian Restaurant,Event Space,Fabric Shop,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field,Film Studio,Fish Market,Fishing Spot,Flea Market,Flower Shop,Fondue Restaurant,Food,Food & Drink Shop,Food Court,Food Truck,Football Stadium,Fountain,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Garden,Garden Center,Gas Station,Gastropub,Gay Bar,General Entertainment,German Restaurant,Gift Shop,Golf Course,Golf Driving Range,Gourmet Shop,Greek Restaurant,Grocery Store,Gun Range,Gun Shop,Gym,Gym / Fitness Center,Gymnastics Gym,Harbor / Marina,Hardware Store,Hawaiian Restaurant,Health & Beauty Service,Herbs & Spices Store,Hill,Historic Site,History Museum,Hobby Shop,Hockey Arena,Hostel,Hot Dog Joint,Hotel,Hotel Bar,Hotel Pool,Hunan Restaurant,Hunting Supply,IT Services,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Indie Theater,Irish Pub,Island,Israeli Restaurant,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Kids Store,Kitchen Supply Store,Korean Restaurant,Lake,Latin American Restaurant,Laundromat,Leather Goods Store,Lebanese Restaurant,Library,Lingerie Store,Liquor Store,Lounge,Malay Restaurant,Marijuana Dispensary,Market,Martial Arts Dojo,Massage Studio,Mediterranean Restaurant,Memorial Site,Men's Store,Mexican Restaurant,Meze Restaurant,Middle Eastern Restaurant,Mini Golf,Miscellaneous Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Moroccan Restaurant,Motorcycle Shop,Mountain,Movie Theater,Multiplex,Museum,Music Store,Music Venue,Nail Salon,National Park,Nature Preserve,Neighborhood,New American Restaurant,Nightclub,Noodle House,Observatory,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Other Nightlife,Outdoor Sculpture,Outdoor Supply Store,Park,Pawn Shop,Pedestrian Plaza,Pelmeni House,Performing Arts Venue,Peruvian Restaurant,Pet Store,Pharmacy,Photography Studio,Pie Shop,Pier,Piercing Parlor,Pilates Studio,Pizza Place,Planetarium,Playground,Plaza,Poke Place,Pool,Portuguese Restaurant,Pub,Public Art,Racecourse,Racetrack,Radio Station,Ramen Restaurant,Record Shop,Recreation Center,Reservoir,Residential Building (Apartment / Condo),Resort,Restaurant,River,Road,Rock Club,Romanian Restaurant,Roof Deck,Russian Restaurant,Salad Place,Salon / Barbershop,Sandwich Place,Scandinavian Restaurant,Scenic Lookout,School,Science Museum,Sculpture Garden,Seafood Restaurant,Shoe Store,Shopping Mall,Shopping Plaza,Skate Park,Ski Area,Ski Chairlift,Smoke Shop,Snack Place,Soccer Field,Soccer Stadium,Social Club,Soup Place,South American Restaurant,Southern / Soul Food Restaurant,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Sports Club,Stadium,State / Provincial Park,Steakhouse,Street Food Gathering,Summer Camp,Supermarket,Surf Spot,Sushi Restaurant,Szechuan Restaurant,Taco Place,Tanning Salon,Tapas Restaurant,Tattoo Parlor,Tea Room,Temple,Tex-Mex Restaurant,Thai Restaurant,Theater,Theme Park,Theme Park Ride / Attraction,Thrift / Vintage Store,Tiki Bar,Tour Provider,Tourist Information Center,Toy / Game Store,Track,Trail,Train Station,Truck Stop,Udon Restaurant,University,Used Bookstore,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Vineyard,Volcano,Warehouse Store,Water Park,Waterfall,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,New York[d],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,New York[d],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,New York[d],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,New York[d],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,New York[d],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
US_grouped = US_onehot.groupby('City').mean().reset_index()

US_grouped.shape

(100, 354)

In [14]:
#creat a function to return the top venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [15]:
#call function to find top 10 venues
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
City_venues_sorted = pd.DataFrame(columns=columns)
City_venues_sorted['City'] = US_grouped['City']

for ind in np.arange(US_grouped.shape[0]):
    City_venues_sorted.iloc[ind, 1:] = return_most_common_venues(US_grouped.iloc[ind, :], num_top_venues)

City_venues_sorted.head()

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Albuquerque,Pizza Place,Brewery,American Restaurant,Mexican Restaurant,Grocery Store,Coffee Shop,Café,BBQ Joint,Bookstore,Steakhouse
1,Anaheim,Beach,Theme Park,Theme Park Ride / Attraction,Grocery Store,Park,Garden,Brewery,Italian Restaurant,Art Gallery,Ice Cream Shop
2,Anchorage[p],Brewery,Mexican Restaurant,Coffee Shop,Movie Theater,Restaurant,Park,Seafood Restaurant,American Restaurant,Bakery,Pizza Place
3,Arlington,Grocery Store,Brewery,Art Museum,Hotel,Coffee Shop,Seafood Restaurant,Music Venue,Fast Food Restaurant,Gourmet Shop,Burger Joint
4,Atlanta,Trail,Park,Brewery,Grocery Store,Mediterranean Restaurant,Ice Cream Shop,American Restaurant,Pizza Place,Beer Store,Market


# for the final project I will use k means clustering to find the cities most similar to NYC. Once a business and city are chosen I will use k means clustering again to find a suitable neighborhood in the new city to start the new business