# Exploration Géolocalisation

Il s'agit pendant cette exploration de trouver une distance qui permet de remonter des offres diversifiées à un utilisateur de l'application pass Culture.
Les critères de diversification sont :
- Ne pas assigner à résidence (une distance minimale)
- Pas d'offres lointaines où l'utilisateur ne peut se rendre
- Un nombre de catégories diversifiées

Pour cela, nous allons faire un ensemble de tests sur un jeu de données bien précis afin d'affiner les critères de distance et de nombre d'offres. Nous avons créé une table qui s'appelle 'iris_france' qui contient toutes les données IRIS des points de la France (métropole + Guyane pour l'instant)

## Connexion à la base de données créée

In [None]:
import pandas as pd
from sqlalchemy import create_engine
import geopandas as gpd
from shapely.geometry import Point

In [None]:
%matplotlib inline

In [None]:
# `os` was never imported in the notebook's import cell, so this cell raised
# NameError; import it here so the cell runs on its own.
import os

# Database URL read from the POSTGRES_URL environment variable
# (None when the variable is not set).
db_url = os.environ.get('POSTGRES_URL')

In [None]:
# Create the SQLAlchemy engine from db_url; it is the `engine` argument passed
# to the query helpers in the cells below.
engine = create_engine(db_url)

## Table des IRIS

In [None]:
def get_num_offers(engine, lon, lat, dist):
    """Count the discovery_view rows attached to venues near a point.

    The reference location is the centroid of the ``iris_france`` shape that
    contains the point (lon, lat), not the point itself. Venues whose
    position is strictly closer than ``dist`` to that centroid are selected,
    and the ``discovery_view`` rows whose "venueId" is one of them are counted.

    The venue lookup and the count run as a single parameterized statement,
    instead of fetching the venue ids and formatting them back into a second
    query as a literal IN list.

    Args:
        engine: SQLAlchemy engine (or connection) handed to pandas.
        lon: Longitude of the point (SRID 4326 in the query).
        lat: Latitude of the point (SRID 4326 in the query).
        dist: Upper bound (exclusive) compared with ST_DISTANCE; presumably
            metres because the venue point is cast to GEOGRAPHY - confirm.

    Returns:
        int: The count; 0 when no IRIS shape contains the point or when no
        venue is within range.
    """
    # Local import: sqlalchemy is already a dependency of this notebook
    # (create_engine), only the `text` name is new.
    from sqlalchemy import text

    count_query = text('''
        WITH reference_point AS (
            SELECT ST_CENTROID(shape) AS centroid
            FROM iris_france
            WHERE ST_CONTAINS(shape, ST_SetSRID(ST_MakePoint(:lon, :lat), 4326))
        )
        SELECT COUNT(*)
        FROM discovery_view
        WHERE "venueId" IN (
            SELECT venue.id
            FROM venue, reference_point
            WHERE ST_DISTANCE(
                centroid,
                CAST(ST_SetSRID(ST_MakePoint(venue.longitude, venue.latitude), 4326) AS GEOGRAPHY)
            ) < :dist
        )''')

    # Cast to plain floats: values coming out of DataFrame.apply may be numpy
    # scalars, which some database drivers cannot adapt as bound parameters.
    bound_values = {'lon': float(lon), 'lat': float(lat), 'dist': float(dist)}

    result = pd.read_sql_query(count_query, engine, params=bound_values)

    # Read the single cell explicitly; int() on a one-element Series is deprecated.
    return int(result.iloc[0, 0])

In [None]:
def get_num_categories(engine, lon, lat, dist):
    """Count the distinct offer types available from venues near a point.

    The reference location is the centroid of the ``iris_france`` shape that
    contains the point (lon, lat), not the point itself. Venues whose
    position is strictly closer than ``dist`` to that centroid are selected,
    and the distinct ``type`` values of the ``discovery_view`` rows whose
    "venueId" is one of them are counted.

    The venue lookup and the count run as a single parameterized statement,
    instead of fetching the venue ids and formatting them back into a second
    query as a literal IN list.

    Args:
        engine: SQLAlchemy engine (or connection) handed to pandas.
        lon: Longitude of the point (SRID 4326 in the query).
        lat: Latitude of the point (SRID 4326 in the query).
        dist: Upper bound (exclusive) compared with ST_DISTANCE; presumably
            metres because the venue point is cast to GEOGRAPHY - confirm.

    Returns:
        int: The number of distinct types; 0 when no IRIS shape contains the
        point or when no venue is within range.
    """
    # Local import: sqlalchemy is already a dependency of this notebook
    # (create_engine), only the `text` name is new.
    from sqlalchemy import text

    count_query = text('''
        WITH reference_point AS (
            SELECT ST_CENTROID(shape) AS centroid
            FROM iris_france
            WHERE ST_CONTAINS(shape, ST_SetSRID(ST_MakePoint(:lon, :lat), 4326))
        )
        SELECT COUNT(DISTINCT(type))
        FROM discovery_view
        WHERE "venueId" IN (
            SELECT venue.id
            FROM venue, reference_point
            WHERE ST_DISTANCE(
                centroid,
                CAST(ST_SetSRID(ST_MakePoint(venue.longitude, venue.latitude), 4326) AS GEOGRAPHY)
            ) < :dist
        )''')

    # Cast to plain floats: values coming out of DataFrame.apply may be numpy
    # scalars, which some database drivers cannot adapt as bound parameters.
    bound_values = {'lon': float(lon), 'lat': float(lat), 'dist': float(dist)}

    result = pd.read_sql_query(count_query, engine, params=bound_values)

    # Read the single cell explicitly; int() on a one-element Series is deprecated.
    return int(result.iloc[0, 0])

In [None]:
import numpy as np
import matplotlib.pyplot as plt
def plot_df(df, x, y1, y2):
    """Plot two columns of ``df`` against a shared x column on twin y-axes.

    Args:
        df: Table indexed by column name (``df[x]``, ``df[y1]``, ``df[y2]``).
        x: Name of the column used for the x-axis (also the x-axis label).
        y1: Column drawn in red against the left y-axis.
        y2: Column drawn in blue against the right y-axis.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    def draw_series(axis, column, colour):
        # Label, curve and tick labels all share the series colour so each
        # y-axis can be matched to its curve.
        axis.set_ylabel(column, color=colour)
        axis.plot(df[x], df[column], color=colour)
        axis.tick_params(axis='y', labelcolor=colour)

    figure, left_axis = plt.subplots()
    left_axis.set_xlabel(x)
    draw_series(left_axis, y1, 'tab:red')

    # Second axes sharing the same x-axis; its x-label is already set above.
    right_axis = left_axis.twinx()
    draw_series(right_axis, y2, 'tab:blue')

    # Without this the right-hand y-label is slightly clipped.
    figure.tight_layout()
    plt.show()

## Paris (longitude=2.351837, latitude=48.863615)

In [None]:
# Reference point for the Paris section (values from the section heading).
longitude, latitude = 2.351837, 48.863615

In [None]:
# One row per candidate search distance: 1000 to 19000 in steps of 1000
# (presumably metres, as compared with ST_DISTANCE in the queries - confirm).
paris_df = pd.DataFrame(list(range(1000, 20000, 1000)), columns=['distance_ref'])

In [None]:
# Broadcast the reference coordinates and a place label to every row, so each
# row carries everything the query helpers need.
paris_df['longitude'] = longitude
paris_df['latitude'] = latitude
paris_df['lieu'] = 'paris'

In [None]:
# Display the (rows, columns) shape as a quick sanity check.
paris_df.shape

In [None]:
# GeoDataFrame built from paris_df, with one point geometry per row derived
# from the longitude/latitude columns.
# The CRS is given as an authority string at construction time: the former
# {'init': 'epsg:4326'} dict assigned afterwards is a deprecated syntax.
paris_gdf = gpd.GeoDataFrame(
    paris_df,
    geometry=gpd.points_from_xy(paris_df.longitude, paris_df.latitude),
    crs='EPSG:4326',
)

### Nombre d'offres

In [None]:
# Call get_num_offers once per row (axis=1), i.e. once per candidate distance,
# and store the resulting count.
paris_df['nombre_offres'] = paris_df.apply(lambda row : get_num_offers(engine, row['longitude'], row['latitude'], row['distance_ref']), axis=1)

### Nombre de catégories

In [None]:
# Call get_num_categories once per row (axis=1), i.e. once per candidate
# distance, and store the resulting count.
paris_df['nombre_categories'] = paris_df.apply(lambda row : get_num_categories(engine, row['longitude'], row['latitude'], row['distance_ref']), axis=1)

In [None]:
# Offers (left axis) and categories (right axis) as a function of the distance.
plot_df(paris_df, 'distance_ref', 'nombre_offres', 'nombre_categories')

## Sarcelles (longitude=2.384995, latitude=48.990277)

In [None]:
# Reference point for the Sarcelles section (values from the section heading).
longitude, latitude = 2.384995, 48.990277

In [None]:
# One row per candidate search distance: 5000 to 39000 in steps of 1000
# (presumably metres, as compared with ST_DISTANCE in the queries - confirm).
sarcelles_df = pd.DataFrame(list(range(5000, 40000, 1000)), columns=['distance_ref'])

In [None]:
# Broadcast the reference coordinates and a place label to every row, so each
# row carries everything the query helpers need.
sarcelles_df['longitude'] = longitude
sarcelles_df['latitude'] = latitude
sarcelles_df['lieu'] = 'sarcelles'

In [None]:
# GeoDataFrame built from sarcelles_df, with one point geometry per row
# derived from the longitude/latitude columns.
# The CRS is given as an authority string at construction time: the former
# {'init': 'epsg:4326'} dict assigned afterwards is a deprecated syntax.
sarcelles_gdf = gpd.GeoDataFrame(
    sarcelles_df,
    geometry=gpd.points_from_xy(sarcelles_df.longitude, sarcelles_df.latitude),
    crs='EPSG:4326',
)

In [None]:
# Preview the first rows of the grid.
sarcelles_df.head()

### Nombre d'offres

In [None]:
# Call get_num_offers once per row (axis=1), i.e. once per candidate distance,
# and store the resulting count.
sarcelles_df['nombre_offres'] = sarcelles_df.apply(lambda row : get_num_offers(engine, row['longitude'], row['latitude'], row['distance_ref']), axis=1)

### Nombre de catégories

In [None]:
# Call get_num_categories once per row (axis=1), i.e. once per candidate
# distance, and store the resulting count.
sarcelles_df['nombre_categories'] = sarcelles_df.apply(lambda row : get_num_categories(engine, row['longitude'], row['latitude'], row['distance_ref']), axis=1)

In [None]:
# Offers (left axis) and categories (right axis) as a function of the distance.
plot_df(sarcelles_df, 'distance_ref', 'nombre_offres', 'nombre_categories')

## Aulnay (longitude=2.5167, latitude=48.95)

In [None]:
# Reference point for the Aulnay section (values from the section heading).
longitude, latitude = 2.5167, 48.95

In [None]:
# One row per candidate search distance: 1000 to 29000 in steps of 1000
# (presumably metres, as compared with ST_DISTANCE in the queries - confirm).
aulnay_df = pd.DataFrame(list(range(1000, 30000, 1000)), columns=['distance_ref'])

In [None]:
# Broadcast the reference coordinates and a place label to every row.
aulnay_df['longitude'] = longitude
aulnay_df['latitude'] = latitude
# Label corrected: it was 'sarcelles', copy-pasted from the Sarcelles section.
aulnay_df['lieu'] = 'aulnay'
# GeoDataFrame with one point geometry per row; the CRS is given as an
# authority string because the {'init': 'epsg:4326'} dict form is deprecated.
aulnay_gdf = gpd.GeoDataFrame(
    aulnay_df,
    geometry=gpd.points_from_xy(aulnay_df.longitude, aulnay_df.latitude),
    crs='EPSG:4326',
)

### Nombre d'offres

In [None]:
# Call get_num_offers once per row (axis=1), i.e. once per candidate distance,
# and store the resulting count.
aulnay_df['nombre_offres'] = aulnay_df.apply(lambda row : get_num_offers(engine, row['longitude'], row['latitude'], row['distance_ref']), axis=1)

### Nombre de catégories

In [None]:
# Call get_num_categories once per row (axis=1), i.e. once per candidate
# distance, and store the resulting count.
aulnay_df['nombre_categories'] = aulnay_df.apply(lambda row : get_num_categories(engine, row['longitude'], row['latitude'], row['distance_ref']), axis=1)

In [None]:
# Offers (left axis) and categories (right axis) as a function of the distance.
plot_df(aulnay_df, 'distance_ref', 'nombre_offres', 'nombre_categories')

## Bretagne centre

In [None]:
# Reference point for the "Bretagne centre" section.
longitude, latitude = -2.614613, 48.162784

In [None]:
# One row per candidate search distance: 5000 to 79000 in steps of 1000.
bretagne_df = pd.DataFrame({'distance_ref': list(range(5000, 80000, 1000))})
# Broadcast the reference coordinates and a place label to every row.
bretagne_df['longitude'] = longitude
bretagne_df['latitude'] = latitude
# Label corrected: it was 'sarcelles', copy-pasted from the Sarcelles section.
bretagne_df['lieu'] = 'bretagne'
# GeoDataFrame with one point geometry per row; the CRS is given as an
# authority string because the {'init': 'epsg:4326'} dict form is deprecated.
bretagne_gdf = gpd.GeoDataFrame(
    bretagne_df,
    geometry=gpd.points_from_xy(bretagne_df.longitude, bretagne_df.latitude),
    crs='EPSG:4326',
)
# Each helper is called once per row (axis=1), i.e. once per candidate distance.
bretagne_df['nombre_offres'] = bretagne_df.apply(
    lambda row: get_num_offers(engine, row['longitude'], row['latitude'], row['distance_ref']),
    axis=1,
)
bretagne_df['nombre_categories'] = bretagne_df.apply(
    lambda row: get_num_categories(engine, row['longitude'], row['latitude'], row['distance_ref']),
    axis=1,
)

In [None]:
# Offers (left axis) and categories (right axis) as a function of the distance.
plot_df(bretagne_df, 'distance_ref', 'nombre_offres', 'nombre_categories')

In [None]:
# Inspect the first 10 rows, i.e. the smallest candidate distances.
bretagne_df.head(10)

## Quimper

In [None]:
# Reference point for the Quimper section.
longitude, latitude = -4.107512, 47.977196

In [None]:
# One row per candidate search distance: 5000 to 99000 in steps of 1000.
quimper_df = pd.DataFrame({'distance_ref': list(range(5000, 100000, 1000))})
# Broadcast the reference coordinates and a place label to every row.
quimper_df['longitude'] = longitude
quimper_df['latitude'] = latitude
# Label corrected: it was 'sarcelles', copy-pasted from the Sarcelles section.
quimper_df['lieu'] = 'quimper'
# Unlike the earlier sections, quimper_df itself is rebound to the
# GeoDataFrame (no separate *_gdf variable); kept as in the original.
# The CRS is given as an authority string because the {'init': 'epsg:4326'}
# dict form is deprecated.
quimper_df = gpd.GeoDataFrame(
    quimper_df,
    geometry=gpd.points_from_xy(quimper_df.longitude, quimper_df.latitude),
    crs='EPSG:4326',
)
# Each helper is called once per row (axis=1), i.e. once per candidate distance.
quimper_df['nombre_offres'] = quimper_df.apply(
    lambda row: get_num_offers(engine, row['longitude'], row['latitude'], row['distance_ref']),
    axis=1,
)
quimper_df['nombre_categories'] = quimper_df.apply(
    lambda row: get_num_categories(engine, row['longitude'], row['latitude'], row['distance_ref']),
    axis=1,
)

In [None]:
# Offers (left axis) and categories (right axis) as a function of the distance.
plot_df(quimper_df, 'distance_ref', 'nombre_offres', 'nombre_categories')

## Près de Charleville-Mézières

In [None]:
# Reference point for the "Près de Charleville-Mézières" section.
longitude, latitude = 4.830971, 49.881626

In [None]:
# NOTE(review): the variable name spells "charlesville" (the town is
# Charleville-Mézières); it is kept because the plotting cell below uses it.
# One row per candidate search distance: 5000 to 99000 in steps of 1000.
pres_de_charlesville_mezieres = pd.DataFrame({'distance_ref': list(range(5000, 100000, 1000))})
# Broadcast the reference coordinates and a place label to every row.
pres_de_charlesville_mezieres['longitude'] = longitude
pres_de_charlesville_mezieres['latitude'] = latitude
# Label corrected: it was 'sarcelles', copy-pasted from the Sarcelles section.
pres_de_charlesville_mezieres['lieu'] = 'charleville-mezieres'
# The same variable is rebound to the GeoDataFrame, as in the original.
# The CRS is given as an authority string because the {'init': 'epsg:4326'}
# dict form is deprecated.
pres_de_charlesville_mezieres = gpd.GeoDataFrame(
    pres_de_charlesville_mezieres,
    geometry=gpd.points_from_xy(
        pres_de_charlesville_mezieres.longitude,
        pres_de_charlesville_mezieres.latitude,
    ),
    crs='EPSG:4326',
)
# Each helper is called once per row (axis=1), i.e. once per candidate distance.
pres_de_charlesville_mezieres['nombre_offres'] = pres_de_charlesville_mezieres.apply(
    lambda row: get_num_offers(engine, row['longitude'], row['latitude'], row['distance_ref']),
    axis=1,
)
pres_de_charlesville_mezieres['nombre_categories'] = pres_de_charlesville_mezieres.apply(
    lambda row: get_num_categories(engine, row['longitude'], row['latitude'], row['distance_ref']),
    axis=1,
)

In [None]:
# Offers (left axis) and categories (right axis) as a function of the distance.
plot_df(pres_de_charlesville_mezieres, 'distance_ref', 'nombre_offres', 'nombre_categories')