In [22]:
# import of libraries 
from sqlalchemy import create_engine, text
import pandas as pd
import plotly.express as px
import psycopg2
import os
from datetime import datetime
import json

In [23]:
# Call of our date in S3 
weather = pd.read_csv('s3://booking-scapping/city_weather_by_day.csv')
hotel = pd.read_csv('s3://booking-scapping/hotels_info.csv')

In [24]:
# Keep usefull data before sending it to our DB : take the weather by day of the top 5 city 

top_5_city_name = pd.read_csv('data/top_5_city_name.csv')

city_weather_top_5 = weather[weather['city'].isin(top_5_city_name['city'])]
city_weather_top_5.head()

Unnamed: 0.1,Unnamed: 0,city,dt_object,main_weather,prepcipitation,temperature,latitude,longitude
12,12,Amiens,2023-02-20,Clouds,0.0,8.785,49.894171,2.295695
13,13,Amiens,2023-02-21,Clouds,0.0,8.70125,49.894171,2.295695
14,14,Amiens,2023-02-22,['Clouds' 'Rain'],1.83,8.62125,49.894171,2.295695
15,15,Amiens,2023-02-23,Clouds,0.23,5.94375,49.894171,2.295695
16,16,Amiens,2023-02-24,Clouds,0.0,5.885,49.894171,2.295695


In [25]:
# Clean the scrapping data before sending it to our DB

hotel[['lat', 'lon']] = hotel['latlng'].str.split(',', 1, expand=True)
hotel = hotel.drop(['depth','download_timeout','download_slot','latlng','download_latency'], axis=1)

hotel['Score'] = hotel['Score'].replace({ ',' : '.'}, regex=True)

hotel['Score'] = hotel['Score'].astype(float)
hotel['lat'] = hotel['lat'].astype(float)
hotel['lon'] = hotel['lon'].astype(float)


In a future version of pandas all arguments of StringMethods.split except for the argument 'pat' will be keyword-only.



In [26]:
# gets our secrets for connection at RDS

import json
with open("./secrets.json") as f:
    secrets = json.load(f)


In [27]:
# send to our DB

#engine = create_engine("sqlite:///:memory:", echo=True)
engine = create_engine(f'postgresql+psycopg2://{secrets["DBUSER"]}:{secrets["DBPASS"]}@{secrets["DBHOST"]}', echo=True)

city_weather_top_5.to_sql(
    f"city_weather_top_5",
    engine,
    if_exists='replace'
)

hotel.to_sql(
    "hotel",
    engine,
    if_exists='replace'
)

2023-02-20 09:17:15,323 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-02-20 09:17:15,324 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-02-20 09:17:15,498 INFO sqlalchemy.engine.Engine select current_schema()
2023-02-20 09:17:15,499 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-02-20 09:17:15,669 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-02-20 09:17:15,670 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-02-20 09:17:15,850 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2023-02-20 09:17:15,851 INFO sqlalchemy.engine.Engine [generated in 0.00070s] {'name': 'city_weather_top_5'}
2023-02-20 09:17:16,093 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2023-02-20 09:17:16,094 INFO sqlalchemy.engine.Engine [

100

## Call tables from our BD and do visualization 

In [28]:
# Weather table 

stmt = text("SELECT * FROM city_weather_top_5 "
            )

weather = pd.read_sql(
        stmt,
        engine
    )

weather.head()

2023-02-20 09:17:21,284 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2023-02-20 09:17:21,286 INFO sqlalchemy.engine.Engine [cached since 5.436s ago] {'name': 'SELECT * FROM city_weather_top_5 '}
2023-02-20 09:17:21,552 INFO sqlalchemy.engine.Engine SELECT * FROM city_weather_top_5 
2023-02-20 09:17:21,553 INFO sqlalchemy.engine.Engine [generated in 0.00087s] {}


Unnamed: 0.1,index,Unnamed: 0,city,dt_object,main_weather,prepcipitation,temperature,latitude,longitude
0,12,12,Amiens,2023-02-20,Clouds,0.0,8.785,49.894171,2.295695
1,13,13,Amiens,2023-02-21,Clouds,0.0,8.70125,49.894171,2.295695
2,14,14,Amiens,2023-02-22,['Clouds' 'Rain'],1.83,8.62125,49.894171,2.295695
3,15,15,Amiens,2023-02-23,Clouds,0.23,5.94375,49.894171,2.295695
4,16,16,Amiens,2023-02-24,Clouds,0.0,5.885,49.894171,2.295695


In [29]:
# Add a proportinal colum on temperature for better see the variation 
city_weather_top_5['temp_cube'] = city_weather_top_5['temperature']**3
city_weather_top_5.head()

city_weather_top_5['dt_object'] = city_weather_top_5['dt_object'].astype(str)

In [30]:
fig = px.scatter_mapbox(city_weather_top_5, lat="latitude", lon="longitude",
                        color="temperature", size ='temp_cube', zoom=7, mapbox_style="carto-positron",
                        animation_frame = "dt_object", animation_group = 'temperature',
                        title="Top 5 best weather cities")
fig.show()

In [31]:
# Hotel info
stmt = text("SELECT * FROM hotel "
            )

df = pd.read_sql(
        stmt,
        engine
    )

df.head()

2023-02-20 09:17:21,926 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2023-02-20 09:17:21,926 INFO sqlalchemy.engine.Engine [cached since 6.076s ago] {'name': 'SELECT * FROM hotel '}
2023-02-20 09:17:22,172 INFO sqlalchemy.engine.Engine SELECT * FROM hotel 
2023-02-20 09:17:22,174 INFO sqlalchemy.engine.Engine [generated in 0.00175s] {}


Unnamed: 0.1,index,Unnamed: 0,hotel name,Url to its booking.com page,Score,Text description,lat,lon
0,0,0,James Vignoble Hôtel - Eguisheim,https://www.booking.com/hotel/fr/saint-hubert-...,8.6,James Vignoble Hôtel - Eguisheim is located am...,48.044963,7.301578
1,1,1,WHITE HOUSE DHAVERNAS - PROCHE CENTRE - PARKIN...,https://www.booking.com/hotel/fr/white-house-d...,8.7,"Situé à Amiens, en Picardie, à proximité de la...",49.88147,2.299871
2,2,2,L'AMIE'NOIS - 6 COUCHAGES - JARDIN - WiFi,https://www.booking.com/hotel/fr/appt-en-amien...,8.0,"Situé à 1,9 km du Zénith d'Amiens et à 3,9 km ...",49.888011,2.264935
3,3,3,Au Coeur d'Amiens,https://www.booking.com/hotel/fr/maison-au-coe...,9.2,"Situé à Amiens, à seulement 1 km de la gare, l...",49.896155,2.306368
4,4,4,"Charmant T2 Hyper-centre, Netflix, Gare",https://www.booking.com/hotel/fr/charmant-t2-h...,9.0,"Le Charmant T2 Hyper-centre, Netflix, Gare est...",49.890137,2.298532


In [32]:
fig = px.scatter_mapbox(hotel[hotel['Score'].notna()], lat="lat", lon="lon", 
                        zoom=7, size="Score", color="Score", mapbox_style="carto-positron", 
                        title = 'Top 20 hotels in our top 5 cities')
fig.show()