# Capstone Project - Preliminary Research

## First insert your Foursquare API credentials, please:

In [None]:
# Foursquare API credentials.
# Each value is read from an environment variable when one is set, so real
# secrets never have to be saved inside the notebook. The 'XXXXX' placeholders
# remain as fallbacks and must be replaced by hand otherwise.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'XXXXX')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'XXXXX')
# API version string, sent to Foursquare with every request.
VERSION = os.environ.get('FOURSQUARE_API_VERSION', 'XXXXX')

## Installing any missing libraries, if needed:

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes

In [None]:
!conda install -c conda-forge geopy --yes

## Importing all the libraries at once:

In [None]:
import folium

from geopy.geocoders import Nominatim

import json

import matplotlib.cm as cm
import matplotlib.colors as colors

import numpy as np

import pandas as pd
from pandas.io.json import json_normalize

import requests

from sklearn.cluster import KMeans

print('Libraries imported!')

## Researching the stations data:

In [None]:
# Stations to analyse. The names are later embedded in a geocoding query.
dictionaryStations = {'Station': [
    'Ana Rosa',
    'Brigadeiro',
    'Chacara Klabin',
    'Clinicas',
    'Ipiranga',
    'Jabaquara',
    'Japao Liberdade',
    'Juventus Mooca',
    'Marechal Deodoro',
    'Paraiso',
    'Republica',
    'Santos Imigrantes',
    'Trianon MASP',
    'Tucuruvi',
    'Vila Madalena'
]}

dfStations = pd.DataFrame(dictionaryStations)

# Coordinate columns start as float NaN and are filled in by the geocoding step.
# An explicit float placeholder is used because `pd.Series()` without a dtype is
# deprecated and yields object-dtype columns on recent pandas versions.
dfStations['Latitude'] = np.nan
dfStations['Longitude'] = np.nan

In [None]:
# RateLimiter ships with geopy, which this notebook already depends on.
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent = "myExplorer")

# The public Nominatim service asks clients to stay at or below one request
# per second, so every station lookup goes through a rate-limited wrapper.
geocodeStation = RateLimiter(geolocator.geocode, min_delay_seconds = 1)

# Names of the stations the geocoder could not resolve.
stationsNotFound = []

for rowLabel, stationName in dfStations['Station'].items():

    address = 'Estacao ' + stationName + ' Sao Paulo Brasil'
    location = geocodeStation(address)

    if location is not None:

        # Write by column name rather than by positional column index.
        dfStations.at[rowLabel, 'Latitude'] = location.latitude
        dfStations.at[rowLabel, 'Longitude'] = location.longitude

    else:

        stationsNotFound.append(stationName)

# Make failed lookups visible: those rows keep missing coordinates.
if stationsNotFound:
    print('Stations that could not be geocoded: ' + ', '.join(stationsNotFound))

dfStations

## Locating the CENTER of Sao Paulo:

In [None]:
# Geocode the city itself and keep its coordinates for later cells.
locationCenter = geolocator.geocode('Sao Paulo')
latitudeCenter, longitudeCenter = locationCenter.latitude, locationCenter.longitude

# Report the centre rounded to two decimal places.
print("Latitude: {:.2f}°.".format(latitudeCenter))
print("Longitude: {:.2f}°.".format(longitudeCenter))

## Creating the stations map:

In [None]:
# Base map centred on the geocoded city centre, with a restricted zoom range.
mapSaoPaulo = folium.Map(location = [latitudeCenter, longitudeCenter], zoom_start = 12, min_zoom = 10, max_zoom = 14)

# Plotting the stations:
# Rows without coordinates (stations the geocoder could not resolve) are
# skipped, because a marker cannot be placed at a missing location.
stationsLocated = dfStations.dropna(subset = ['Latitude', 'Longitude'])

for station, latitude, longitude in zip(
    stationsLocated['Station'],
    stationsLocated['Latitude'],
    stationsLocated['Longitude']):

    stationLabel = station
    stationPopup = folium.Popup(stationLabel, parse_html = True)

    # `parse_html` belongs to the Popup above; it is not passed to the marker.
    marker = folium.CircleMarker(
        [latitude, longitude],
        radius = 5,
        popup = stationPopup,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7)

    marker.add_to(mapSaoPaulo)

# Showing the map:

mapSaoPaulo

### Recycled function to get the venues by location using the Foursquare API:

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius = 500, limit = 100):
    """Collect the venues returned by the Foursquare `venues/explore` endpoint
    around each given location.

    The credentials and API version are read from the module-level names
    CLIENT_ID, CLIENT_SECRET and VERSION.

    Parameters
    ----------
    names : iterable
        Label of each location; copied into the 'Location' column.
    latitudes, longitudes : iterable
        Coordinates aligned with `names`. Locations whose latitude or
        longitude is missing (NaN/None) are skipped without a request.
    radius : int, optional
        Value sent as the `radius` query parameter (default 500).
    limit : int, optional
        Value sent as the `limit` query parameter (default 100).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Location', 'Latitude',
        'Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame is empty, but keeps these columns,
        when no venue was collected.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """

    columns = ['Location',
               'Latitude',
               'Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):

        # Nothing sensible can be queried without both coordinates.
        if pd.isna(lat) or pd.isna(lng):
            continue

        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params = {
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit},
            timeout = 30)

        # Fail loudly on an error status instead of on a missing JSON key later.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        for item in items:

            venue = item['venue']
            # A venue may come without any category; keep it with a missing value.
            categories = venue.get('categories') or []

            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor keeps the schema even with zero rows.
    return pd.DataFrame(rows, columns = columns)

## Using the declared function getNearbyVenues to search for nearby venues:

In [None]:
# Fetch the venues around every station, using a search radius of 500.
dfVenues = getNearbyVenues(
    dfStations['Station'],
    dfStations['Latitude'],
    dfStations['Longitude'],
    radius = 500)

In [None]:
# Number of venues collected for each station.
dfVenues['Venue'].groupby(dfVenues['Location']).count()

In [None]:
# One indicator column per venue category, averaged per station: each value is
# the share of that station's venues that belong to the category.
categoryIndicators = pd.get_dummies(dfVenues[['Venue Category']], prefix = "", prefix_sep = "")

# Grouping by the station names and resetting the index leaves 'Location' as
# the first column, followed by the category columns.
dfVenuesOnehot = (
    categoryIndicators
    .groupby(dfVenues['Location'])
    .mean()
    .reset_index()
)

In [None]:
numTopVenues = 5

print('============================================================\n')

for stationName in dfVenuesOnehot['Location']:

    print("Station: " + stationName + '\n')

    # Turn the station's single row into a two-column table: category, frequency.
    stationRow = dfVenuesOnehot[dfVenuesOnehot['Location'] == stationName]
    frequencies = stationRow.T.reset_index()
    frequencies.columns = ['Category', 'Frequency']

    # The first entry is the 'Location' value itself, not a category.
    frequencies = frequencies.iloc[1:].copy()
    frequencies['Frequency'] = frequencies['Frequency'].astype(float).round(2)

    ranking = frequencies.sort_values('Frequency', ascending = False).reset_index(drop = True)
    print(ranking.head(numTopVenues))
    print('\n============================================================\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is ignored (callers pass rows whose first
    entry is the location name); the rest is sorted in descending order and
    the index labels of the leading entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending = False)
    return ranked.index.values[:num_top_venues]

In [None]:
numTopVenues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: 'Location' followed by '1st Most Common Venue', '2nd ...', etc.
columns = ['Location']

for n in range(1, numTopVenues + 1):

    # 11, 12 and 13 take 'th'; otherwise a last digit of 1, 2 or 3 selects
    # 'st', 'nd' or 'rd'. An explicit test replaces the former bare `except:`.
    if n % 100 in (11, 12, 13) or n % 10 not in (1, 2, 3):
        suffix = 'th'
    else:
        suffix = indicators[n % 10 - 1]

    columns.append('{}{} Most Common Venue'.format(n, suffix))

# Build all rows first and create the frame once.
rankedRows = []

for ind in range(dfVenuesOnehot.shape[0]):

    stationRow = dfVenuesOnehot.iloc[ind, :]
    topVenues = list(return_most_common_venues(stationRow, numTopVenues))

    # Pad with NaN when fewer categories exist than columns were requested.
    topVenues += [np.nan] * (numTopVenues - len(topVenues))

    rankedRows.append([stationRow['Location']] + topVenues)

dfVenuesOnehotSorted = pd.DataFrame(rankedRows, columns = columns)

## Using the K-means algorithm:

In [None]:
# Number of clusters to form.
k = 5

# Only the category frequencies are clustered; the station name is dropped.
# `columns=` is used because the positional axis argument of `drop` is not
# accepted by recent pandas versions.
dfVenuesClustering = dfVenuesOnehot.drop(columns = 'Location')

# `random_state` fixes the initialisation. `n_init` is pinned to 10 so the
# result does not depend on a scikit-learn version's default.
kmeans = KMeans(n_clusters = k, n_init = 10, random_state = 0).fit(dfVenuesClustering)

print('Generated KMeans labels:\n')
print(kmeans.labels_)

In [None]:
# kmeans.labels_ follows the row order of dfVenuesOnehot (one row per station
# that has venues), not the row order of dfStations. The labels are therefore
# matched to the stations by name rather than by position.
clusterByStation = pd.Series(
    kmeans.labels_,
    index = dfVenuesOnehot['Location'].values,
    name = 'Cluster Label')

dfStationsMerged = dfStations.copy()

dfStationsMerged.insert(3, 'Cluster Label', dfStationsMerged['Station'].map(clusterByStation))

# Stations that took no part in the clustering have no label; drop them so the
# label column can stay integer-typed.
dfStationsMerged = dfStationsMerged.dropna(subset = ['Cluster Label'])
dfStationsMerged['Cluster Label'] = dfStationsMerged['Cluster Label'].astype(int)

# Attach the ranked venue categories of each station.
dfStationsMerged = dfStationsMerged.join(dfVenuesOnehotSorted.set_index('Location'), on = 'Station')

dfStationsMerged

In [None]:
# Creating the map:

mapClusters = folium.Map(location = [latitudeCenter, longitudeCenter], zoom_start = 11, min_zoom = 10, max_zoom = 14)

# Defining the clusters colors: one evenly spaced rainbow colour per cluster.

colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Plotting the clusters:
# Rows lacking coordinates or a cluster label cannot be drawn and are skipped.

stationsToPlot = dfStationsMerged.dropna(subset = ['Cluster Label', 'Latitude', 'Longitude'])

for location, cluster, latitude, longitude in zip(
    stationsToPlot['Station'],
    stationsToPlot['Cluster Label'],
    stationsToPlot['Latitude'],
    stationsToPlot['Longitude']):

    # Labels run from 0 to k - 1 and index the palette directly.
    cluster = int(cluster)

    stationLabel = '{} - Cluster {}'.format(location, cluster)
    stationPopup = folium.Popup(stationLabel, parse_html = True)

    marker = folium.CircleMarker(
        [latitude, longitude],
        radius = 5,
        popup = stationPopup,
        color = rainbow[cluster],
        fill = True,
        fill_color = rainbow[cluster],
        fill_opacity = 0.7)

    marker.add_to(mapClusters)

# Showing the map:

mapClusters

In [None]:
# Stations assigned to cluster 0.
dfStationsMerged.loc[dfStationsMerged['Cluster Label'] == 0]

In [None]:
# Stations assigned to cluster 1.
dfStationsMerged.loc[dfStationsMerged['Cluster Label'] == 1]

In [None]:
# Stations assigned to cluster 2.
dfStationsMerged.loc[dfStationsMerged['Cluster Label'] == 2]

In [None]:
# Stations assigned to cluster 3.
dfStationsMerged.loc[dfStationsMerged['Cluster Label'] == 3]

In [None]:
# Stations assigned to cluster 4.
dfStationsMerged.loc[dfStationsMerged['Cluster Label'] == 4]