-
Notifications
You must be signed in to change notification settings - Fork 10
/
permanent_db_facilities.py
130 lines (106 loc) · 5.63 KB
/
permanent_db_facilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import pandas as pd
import numpy as np
import os
from pathlib import Path
import requests
import zipfile
def prepare_facilities(proxies={}):
"""
Downloads (if needed) the raw data from the INSEE Permanent database of facilities
(https://www.insee.fr/fr/statistiques/3568638?sommaire=3568656),
then creates one dataframe for
the malls
the shops
the schools
the administration facilities
the sport facilities
the care facilities
the show facilities
the museum
the restaurants
and computes the corresponding weight in each city for each of these facilities
Each dataframe has the following strcucture:
Index:
DEPCOM (str): geographic code of the city
Columns:
sink_volume (float): weight of the city in terms of the appropriate facilities
and writes these into parquet files.
"""
data_folder_path = Path(os.path.dirname(__file__)).parents[0] / "data/insee/facilities"
if data_folder_path.exists() is False:
os.makedirs(data_folder_path)
path = data_folder_path / "bpe20_ensemble_csv.zip"
# Download the raw survey data from the Insee "Base Permanente des équipements" if needed
if path.exists() is False:
# Download the zip file
r = requests.get(
url="https://www.insee.fr/fr/statistiques/fichier/3568629/bpe20_ensemble_csv.zip",
proxies=proxies
)
with open(path, "wb") as file:
file.write(r.content)
# Unzip the content
with zipfile.ZipFile(path, "r") as zip_ref:
zip_ref.extractall(data_folder_path)
# Informations about jobs and active population for each city
db_facilities = pd.read_csv(data_folder_path / "bpe20_ensemble.csv",
sep=';', usecols=['DEPCOM', 'TYPEQU', 'NB_EQUIP'],
dtype={'DEPCOM': str})
# Informations about the facilities (area, assumpted frequentation)
facilities_features = pd.read_excel(data_folder_path.parent / "esane/equipments_features.xlsx",
usecols=['insee_id', 'shop_area', 'frequentation', 'motive_id'],
dtype={'motive_id': str})
# Informations about the turnover of shop facilities
facilities_turnover = pd.read_excel(data_folder_path.parent / "esane/shops_turnover.xlsx",
usecols=['naf_id', 'turnover_psqm', 'turnover_per_shop'],
dtype={'naf_id': str})
# Table to convert from INSEE terminology to NAF terminology
insee_to_naf = pd.read_excel(data_folder_path.parent / "esane/equipments_insee_to_naf.xlsx",
usecols=['insee_id', 'naf_id'],
dtype=str)
# The turnover is used to determine the weights to attribute to the different shops
facilities_turnover = pd.merge(facilities_turnover, insee_to_naf, on='naf_id')
facilities_features = pd.merge(facilities_turnover, facilities_features, on='insee_id', how='right')
# If the area is given, use the turnover per square meter
mask = facilities_features['shop_area'].notna()
facilities_features.loc[mask, 'weight'] = facilities_features.loc[mask, 'shop_area']*facilities_features.loc[mask, 'turnover_psqm']
# If not, use the turnover per shop
mask = facilities_features['turnover_per_shop'].notna()
facilities_features.loc[mask, 'weight'] = facilities_features.loc[mask, 'turnover_per_shop']
# In the other cases, use the frequentation
mask = facilities_features['weight'].isna()
facilities_features.loc[mask, 'weight'] = facilities_features.loc[mask, 'frequentation']
facilities_features.rename(columns={'insee_id': 'TYPEQU'}, inplace=True)
facilities_features.drop(columns=['shop_area', 'naf_id', 'turnover_psqm', 'turnover_per_shop',
'frequentation'],
inplace=True)
# Create one dataframe for each motive
motives = facilities_features['motive_id'].unique()
motive_db = {}
for mot in motives :
# get the facilities corresponding to the motive
motive_facilities = facilities_features.loc[facilities_features['motive_id']==mot]
motive_facilities = pd.merge(db_facilities, motive_facilities, on='TYPEQU', how='inner')
motive_facilities['sink_volume'] = motive_facilities['weight']*motive_facilities['NB_EQUIP']
motive_facilities = motive_facilities.drop(columns=['weight', 'TYPEQU', 'NB_EQUIP'])
motive_facilities = motive_facilities.groupby('DEPCOM').sum()
motive_db[mot] = motive_facilities
db_shops = motive_db['2']
db_schools = motive_db['1.4']
db_admin = motive_db['4.1']
db_sport = motive_db['7.6']
db_care = motive_db['3.1']
db_show = motive_db['7.5']
db_museum = motive_db['7.4']
db_restaurant = motive_db['7.3']
# ------------------------------------------
# Write datasets to parquet files
db_shops.to_parquet(data_folder_path / "shops.parquet")
db_schools.to_parquet(data_folder_path / "schools.parquet")
db_admin.to_parquet(data_folder_path / "admin_facilities.parquet")
db_sport.to_parquet(data_folder_path / "sport_facilities.parquet")
db_care.to_parquet(data_folder_path / "care_facilities.parquet")
db_show.to_parquet(data_folder_path / "show_facilities.parquet")
db_museum.to_parquet(data_folder_path / "museum.parquet")
db_restaurant.to_parquet(data_folder_path / "restaurants.parquet")
return