In [1]:
import pandas as pd
import re
from dotenv import load_dotenv
import requests
import os
from src.geopoint_mongo_functions import transformToGeoPoint
from src.geopoint_mongo_functions import geocode
from src.web_scraping_functions import getPage
import numpy as np
from pymongo import MongoClient

In [None]:
%%time
# Load the raw companies dataset; the CSV is decoded as Latin-1.
df = pd.read_csv(
    'input/companies_sorted.csv',
    encoding='latin-1',
)

In [None]:
# Preview the first five rows (the dataset holds more than 7M companies).
df.head()

In [None]:
# Remove rows where all values are None - not needed here
#df = df.dropna(how='all')

In [None]:
# Keep only the rows whose country is 'italy'.
italy_companies = df.loc[df['country'] == 'italy']

In [None]:
# Count the missing values in each column and show the five columns with the most.
(
    italy_companies
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# Preview the first five Italian companies.
italy_companies.head()

In [None]:
# Inspect the most frequent localities; Milan is the city kept for the analysis.
locality_column = 'locality'
italy_companies[locality_column].value_counts().head()

In [None]:
# Replace missing localities with a placeholder so the string match below never
# sees NaN, then keep the rows whose locality contains 'milan'.
# italy_companies is a slice of `df`; .assign() builds a new frame instead of
# writing a column into that slice, which avoids pandas' SettingWithCopyWarning.
italy_companies = italy_companies.assign(
    locality=italy_companies['locality'].fillna('No Values')
)
milan_companies = italy_companies[italy_companies['locality'].str.contains(pat='milan')]

In [None]:
# Preview the first five companies located in Milan.
milan_companies.head()

In [None]:
# Show the five most common industries among the Milan companies.
(
    milan_companies['industry']
    .value_counts()
    .head()
)

In [None]:
# https://api.foursquare.com/v2/venues/search?ll=40.744010080453,-73.985651532083&categoryId=4d4b7105d754a06374d81259&query=starbucks&client_id=your_client_id&client_secret=your_client_secret
# https://api.foursquare.com/v2/venues/search?ll=40.7,-74&client_id=CLIENT_ID&client_secret=CLIENT_SECRET&v=YYYYMMDD


In [None]:
# Load environment variables from a .env file (python-dotenv); the API
# credentials are read from the environment later with os.getenv.
load_dotenv()

# 1 - Design Companies in Milan. API foursquare

In [None]:
# Keep only the Milan companies whose industry is 'design'.
design_companies = milan_companies.loc[milan_companies['industry'] == 'design']

In [None]:
# Preview the first five design companies.
design_companies.head()

In [None]:
# Read the Foursquare API credentials from the environment (None when unset).
client_id = os.environ.get("CLIENT_ID")
client_secret = os.environ.get("CLIENT_SECRET")

In [None]:
%%time
# The design companies lack an address and coordinates, so every company name
# is looked up with the Foursquare venue-search API, restricted to Milan.
# Only the first venue returned for a name is kept; names that yield no venues
# are skipped.
FOURSQUARE_SEARCH_URL = 'https://api.foursquare.com/v2/venues/search'

design_companies_location = []
for design_c in design_companies['name']:
    # Passing the query through `params` lets requests URL-encode the company
    # name; interpolating it into the URL broke on names containing spaces,
    # '&', '#', etc. The timeout keeps one stalled request from hanging the cell.
    res = requests.get(
        FOURSQUARE_SEARCH_URL,
        params={
            'near': 'milan',
            'query': design_c,
            'client_id': client_id,
            'client_secret': client_secret,
            'v': '20181101',
        },
        timeout=10,
    )
    # .get() at both levels avoids a KeyError: 'response' and 'venues' are not
    # always present in the returned dictionary.
    c_json = res.json().get('response', {}).get('venues')
    if c_json:
        design_companies_location.append({'name': c_json[0]['name'], 'location': c_json[0]['location']})

In [None]:
# Build a table with one row per company found by the venue search.
# (A bare `design_companies_location` expression used to sit above this line;
# only the last expression of a cell is displayed, so it had no effect.)
design_company_table = pd.DataFrame(design_companies_location)

In [None]:
# Preview the first five companies returned by the venue search.
design_company_table.head()

In [None]:
# Expand the `location` column into one column per key so that latitude and
# longitude can be picked out.
geo_design_company = pd.DataFrame(
    design_company_table['location'].apply(pd.Series)
)

In [None]:
# Clean design-company table (name, address, lat, lng), ready for the
# GeoPoint transformation.
clean_design_company = pd.concat(
    [
        design_company_table["name"],
        geo_design_company[['address', 'lat', 'lng']],
    ],
    axis="columns",
)
clean_design_company

In [None]:
# MongoDB geospatial queries need each location stored in this format:
# <field>: { type: <GeoJSON type> , coordinates: <coordinates> }
# so transformToGeoPoint is applied row by row to create a new `geopoint` column.
# The function is imported under its full name (the import cell defines no `tg`
# alias), so it must be referenced as transformToGeoPoint to avoid a NameError.
clean_design_company["geopoint"] = clean_design_company.apply(transformToGeoPoint, axis=1)
clean_design_company

In [None]:
# Write the table as a JSON array of records so it can be loaded with mongoimport, e.g.
# mongoimport -d datamad0620 -c offices --jsonArray offices.json
clean_design_company.to_json(
    path_or_buf="output/design_company.json",
    orient="records",
)

# Starbucks from Selenium

In [2]:
# Source page: https://www.starbucks.it/store-locator/search/location/milano
# The page is scraped with Selenium because BeautifulSoup alone does not load
# the whole page. getPage sends the request to the official Starbucks Italy site.
soup = getPage(
    "https://www.starbucks.it/store-locator/search/location/milano"
)

[WDM] - Current google-chrome version is 81.0.4044
[WDM] - Get LATEST driver version for 81.0.4044
[WDM] - Driver [/home/must4in3/.wdm/drivers/chromedriver/linux64/81.0.4044.138/chromedriver] found in cache


 


In [3]:
# CSS selectors pull out each store's name and address; zip pairs them up
# position by position.
name_starbucks = [store_tag.text for store_tag in soup.select(".store-name")]
address_starbucks = [
    ' '.join(address_tag.text.split())  # collapse runs of whitespace into single spaces
    for address_tag in soup.select(".address")
]
merge_starbucks = [*zip(name_starbucks, address_starbucks)]

In [4]:
# Turn every (name, address) pair into a record and build the table of
# Milan Starbucks stores from those records.
dict_starbucks = [
    {'name': store_name, 'location': store_address}
    for store_name, store_address in merge_starbucks
]
starbacks_table = pd.DataFrame(dict_starbucks)

In [5]:
# Display the table of scraped Starbucks stores (columns: name, location).
starbacks_table

Unnamed: 0,name,location
0,Milan Roastery,Piazza Cordusio 3 Milan 20123
1,Starbucks Porta Romana,"Via Lentasio, 1 angolo Corso di Porta Romana M..."
2,Starbucks Durini,"Via Durini, 28 Milan 20121"
3,Starbucks Turati,"Via Filippo Turati, 25 Milano 20121"
4,Starbucks Garibaldi,"Corso Garibaldi, 118 Milano 20121"
5,Starbucks Vercelli,"Piazzale Francesco Baracca, 10 angolo Corso Ve..."
6,Starbucks Centrale,"Piazza Luigi di Savoia, 1 Milano 20124"
7,Starbucks Assago Milanofiori,Viale Milanofiori Centro Commerciale Milanofio...
8,Starbucks Malpensa,Aeroporto Milano Malpensa T1 - Partenze Ferno ...


In [6]:
%%time
# Call geocode because the table still needs latitude/longitude information.
# GeoJSON documentation: https://geojson.io/#map=16/40.3959/-3.7039 ; https://geojson.org/
# NOTE(review): geocode presumably appends one GeoJSON point per address to the
# list passed as second argument - confirm in src.geopoint_mongo_functions.
geo_starbucks = list()
geocode(starbacks_table.loc[:, 'location'], geo_starbucks)

{'standard': {'stnumber': '3', 'addresst': 'Piazza Cordusio', 'postal': {}, 'region': 'IT', 'prov': 'IT', 'city': 'Milano', 'countryname': 'Italy', 'confidence': '0.90'}, 'longt': '9.18620', 'alt': {}, 'elevation': {}, 'latt': '45.46496'}
{'standard': {'addresst': 'Via Lentasio', 'city': 'Milano', 'prov': 'IT', 'countryname': 'Italy', 'postal': {}, 'confidence': '0.90'}, 'longt': '9.19167', 'alt': {}, 'elevation': {}, 'latt': '45.45844'}
{'standard': {'stnumber': '28', 'addresst': 'Via Durini', 'postal': '20122', 'region': 'IT', 'prov': 'IT', 'city': 'Milano', 'countryname': 'Italy', 'confidence': '0.9'}, 'longt': '9.19836', 'alt': {}, 'elevation': {}, 'latt': '45.46563'}
{'standard': {'stnumber': '18', 'addresst': 'Via Filippo Turati', 'postal': '20121', 'region': 'IT', 'prov': 'IT', 'city': 'Milano', 'countryname': 'Italy', 'confidence': '0.90'}, 'longt': '9.19490', 'alt': {}, 'elevation': {}, 'latt': '45.47478'}
{'standard': {'stnumber': '51', 'addresst': 'Corso Garibaldi', 'postal'

In [7]:
# Attach the geocoded points as a new `geopoint` column, one list item per row.
# DataFrame.assign returns a new frame and leaves the original untouched, so
# the result is bound back to starbacks_table; otherwise the column would only
# exist in this cell's displayed output.
starbacks_table = starbacks_table.assign(geopoint=geo_starbucks)
starbacks_table

Unnamed: 0,name,location,geopoint
0,Milan Roastery,Piazza Cordusio 3 Milan 20123,"{'type': 'Point', 'coordinates': [9.1862, 45.4..."
1,Starbucks Porta Romana,"Via Lentasio, 1 angolo Corso di Porta Romana M...","{'type': 'Point', 'coordinates': [9.19167, 45...."
2,Starbucks Durini,"Via Durini, 28 Milan 20121","{'type': 'Point', 'coordinates': [9.19836, 45...."
3,Starbucks Turati,"Via Filippo Turati, 25 Milano 20121","{'type': 'Point', 'coordinates': [9.1949, 45.4..."
4,Starbucks Garibaldi,"Corso Garibaldi, 118 Milano 20121","{'type': 'Point', 'coordinates': [9.18336, 45...."
5,Starbucks Vercelli,"Piazzale Francesco Baracca, 10 angolo Corso Ve...","{'type': 'Point', 'coordinates': [9.16487, 45...."
6,Starbucks Centrale,"Piazza Luigi di Savoia, 1 Milano 20124","{'type': 'Point', 'coordinates': [9.20784, 45...."
7,Starbucks Assago Milanofiori,Viale Milanofiori Centro Commerciale Milanofio...,"{'type': 'Point', 'coordinates': [9.15274, 45...."
8,Starbucks Malpensa,Aeroporto Milano Malpensa T1 - Partenze Ferno ...,"{'type': 'Point', 'coordinates': [9.11615, 45...."


In [None]:
# METODOLOGIA ALTERNATIVA

In [None]:
#client_id = os.getenv("CLIENT_ID")
#client_secret = os.getenv("CLIENT_SECRET")

In [None]:
## enviar una request. Si sale un numero que empieza con 2 todo bien, 
## o si no ver los errores de los gatos a ver que dicen!
#url = f'https://api.foursquare.com/v2/venues/search?near=milan&query=Starbucks&client_id={client_id}&client_secret={client_secret}&v=20181101'
#res = requests.get(url)
#starbucks= res.json()
## del requests seleciono solo la key venues dentro response
#starbucks['response']['venues']
## leer infos read_json leer documentacíon

In [None]:
## dataframe de los starbucks in Milan
#table_starbucks = pd.DataFrame((starbucks['response']['venues'])) #read json
#table_starbucks.head(3)

In [None]:
## elimino la unica row que no tenga Starbucks
#table_starbucks = table_starbucks[table_starbucks['name'].str.contains(pat = 'Starbucks')]
## separo la columna location en subcolumnas para recuperar latitude y longitude
#geo_starbucks = pd.DataFrame(table_starbucks.location.apply(pd.Series))

In [None]:
## starbucks table limpia lista para la transformacíon en GeoPoint
#clean_starbucks = pd.concat([table_starbucks["name"], geo_starbucks[['address','lat','lng']]], axis=1)
#clean_starbucks

In [None]:
## para poder hacer query en MongoDB se necesita este formato 
## <field>: { type: <GeoJSON type> , coordinates: <coordinates> }
## por eso llamo la funcion transformToGeoPoint
#clean_starbucks["geopoint"] = clean_starbucks.apply(transformToGeoPoint, axis=1)
#clean_starbucks

In [None]:
## Convert objectIds to string to be able to export as JSON 
#clean_starbucks["geopoint"] = clean_starbucks["geopoint"].apply(lambda e: str(e))
## Export as json records to import with mongoimport 
#clean_starbucks.to_json("output/starbucks.json",orient="records")
## mongoimport -d datamad0620 -c offices --jsonArray offices.json

# Sitios para diversión

In [None]:
#https://dati.comune.milano.it/dataset/ds252-economia-locali-pubblico-spettacolo

In [None]:
# Connect to the local MongoDB server; both timeouts are 2000 ms.
dbName = "Italian_Companies"
mongodbURL = f"mongodb://localhost/{dbName}"
client = MongoClient(
    mongodbURL,
    connectTimeoutMS=2000,
    serverSelectionTimeoutMS=2000,
)
# No name is passed, so the database named in the connection URL is used.
db = client.get_database()

In [None]:
# Fetch the documents whose `insegna` field is not null, projecting only the
# name (`insegna`) and the two WGS84 coordinate fields.
cur = db["Italian_Companies"].find(
    {"insegna": {'$ne': None}},
    {"insegna": 1, "_id": 0, "LAT_WGS84": 1, "LONG_WGS84": 1},
)

In [None]:
# Exhaust the cursor into an in-memory list of documents.
data = [document for document in cur]

In [None]:
# Build a table from the fetched documents and display it.
sitios_para_diversion = pd.DataFrame(data=data)
sitios_para_diversion

In [None]:
## MongoDB geospatial queries need each location stored in this format:
## <field>: { type: <GeoJSON type> , coordinates: <coordinates> }
## so transformToGeoPoint is applied to every row to create the `geopoint` column.
## NOTE(review): these rows carry LAT_WGS84 / LONG_WGS84 (from the Mongo projection),
## while the design-company table passed to the same function had lat / lng -
## confirm transformToGeoPoint handles both column layouts.
sitios_para_diversion["geopoint"] = sitios_para_diversion.apply(
    transformToGeoPoint,
    axis="columns",
)

In [None]:
# Display the table again, now including the `geopoint` column added in the previous cell.
sitios_para_diversion