In [1]:
import pandas as pd
from warnings import filterwarnings
filterwarnings(action='ignore', category=FutureWarning)

PRIX = '/kaggle/input/france-fuel-prices/prix_carburants.csv'
# Rows whose last price update falls in or before this year are discarded.
OLDEST_EXCLUDED_YEAR = 2016

df = pd.read_csv(filepath_or_buffer=PRIX, sep=';', parse_dates=['prix_maj'], index_col=['id'])
# plotly needs latitude and longitude as separate numeric columns, but 'geom' stores them
# as a single "lat, lon" string: split it once and reuse the pieces for both columns
geom_parts = df['geom'].str.split(',')
df['latitude'] = geom_parts.str[0].astype(float)
df['longitude'] = geom_parts.str[1].astype(float)
df['year'] = df['prix_maj'].dt.year
# we have one price from the past that is probably bogus; a missing year also fails this
# comparison, so rows without a parsed 'prix_maj' are dropped here as well
df = df[df['year'] > OLDEST_EXCLUDED_YEAR]
df.head()

Unnamed: 0_level_0,cp,pop,adresse,ville,horaires,geom,prix_maj,prix_id,prix_valeur,prix_nom,...,dep_name,reg_code,reg_name,com_code,com_name,services_service,horaires_automate_24_24,latitude,longitude,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12270001,12270,R,ZA LE LAC,La Fouillade,,"44.240241001506, 2.0328320468272003",2024-01-29 10:47:14+00:00,5.0,1.779,E10,...,Aveyron,76.0,Occitanie,12105,La Fouillade,"Laverie,Station de gonflage,Piste poids lourds...",Non,44.240241,2.032832,2024.0
25115001,25115,R,Z.A. les Salines,Pouilley-les-Vignes,"{""@automate-24-24"": ""1"", ""jour"": [{""@id"": ""1"",...","47.258, 5.927",2024-01-27 10:38:10+00:00,2.0,1.819,SP95,...,Doubs,27.0,Bourgogne-Franche-Comté,25467,Pouilley-les-Vignes,"Station de gonflage,Piste poids lourds,Lavage ...",Oui,47.258,5.927,2024.0
80570001,80570,R,Rue Joliot Curie,Dargnies,"{""@automate-24-24"": """", ""jour"": [{""@id"": ""1"", ...","50.04475, 1.52562",2023-12-30 06:36:24+00:00,2.0,1.799,SP95,...,Somme,32.0,Hauts-de-France,80235,Dargnies,"Boutique alimentaire,Vente de gaz domestique (...",Non,50.04475,1.52562,2023.0
83230003,83230,R,Rond-Point la Baou,BORMES-LES-MIMOSAS,"{""@automate-24-24"": """", ""jour"": [{""@id"": ""1"", ...","43.139, 6.353",2024-01-02 06:36:06+00:00,3.0,0.999,E85,...,Var,93.0,Provence-Alpes-Côte d'Azur,83070,Le Lavandou,"Toilettes publiques,Relais colis,Boutique alim...",Non,43.139,6.353,2024.0
33140001,33140,R,Route de Toulouse,VILLENAVE-D'ORNON,"{""@automate-24-24"": ""1"", ""jour"": [{""@id"": ""1"",...","44.774, -0.563",2024-02-03 08:01:13+00:00,6.0,1.859,SP98,...,Gironde,75.0,Nouvelle-Aquitaine,33550,Villenave-d'Ornon,"Station de gonflage,Piste poids lourds,Vente d...",Oui,44.774,-0.563,2024.0


In [2]:
# Print the dtype and non-null count of every column in the prepared frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33419 entries, 12270001 to 63000015
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   cp                       33419 non-null  int64              
 1   pop                      33419 non-null  object             
 2   adresse                  33419 non-null  object             
 3   ville                    33419 non-null  object             
 4   horaires                 25957 non-null  object             
 5   geom                     33419 non-null  object             
 6   prix_maj                 33419 non-null  datetime64[ns, UTC]
 7   prix_id                  33419 non-null  float64            
 8   prix_valeur              33419 non-null  float64            
 9   prix_nom                 33419 non-null  object             
 10  com_arm_code             33411 non-null  object             
 11  com_arm_name           

In [3]:
# Number of distinct values per column (missing values are not counted by default).
df.nunique()

cp                          4305
pop                            2
adresse                     9177
ville                       6916
horaires                    1807
geom                        9670
prix_maj                   14675
prix_id                        6
prix_valeur                  784
prix_nom                       6
com_arm_code                5507
com_arm_name                5458
epci_code                   1213
epci_name                   1211
dep_code                      96
dep_name                      96
reg_code                      13
reg_name                      13
com_code                    5474
com_name                    5425
services_service            5446
horaires_automate_24_24        2
latitude                    7568
longitude                   7996
year                           3
dtype: int64

In [4]:
from plotly.express import histogram
# How many price rows do we have for each fuel product?
fuel_count_fig = histogram(df, x='prix_nom')
fuel_count_fig

How's your French? Mine is terrible; `prix_nom` is most likely the product (fuel) name, and we are going to assume it is. We have a LOT of rows in our dataset, and if we repeatedly try to plot all of them at once we will get an unresponsive visualization.

Also, as we can see from this histogram, the number of prices we have per product varies by one and a half orders of magnitude.

In [5]:
from plotly.express import scatter_mapbox
# Draw one map per fuel product. Groups come out in order of first appearance and rows
# with a missing product name are left out.
for prix_nom, fuel_rows in df.groupby('prix_nom', sort=False):
    fuel_map = scatter_mapbox(
        data_frame=fuel_rows,
        lat='latitude',
        lon='longitude',
        color='prix_valeur',
        hover_name='adresse',
        hover_data=['ville'],
        title='Prices for {}'.format(prix_nom),
        mapbox_style='open-street-map',
        zoom=5,
        height=1000,
    )
    fuel_map.show()

What did we learn from this visualization? 
1. Availability varies significantly from one fuel type to another.
2. The most available is gazole, which in English is diesel.
3. All fuel types are more expensive in and around Paris.
4. Corsica is still part of France, and only some types of fuel are available there, and what is available is almost always more expensive than it is on the mainland.


In [6]:
# Distribution of prices, with one colour per fuel product.
price_hist_fig = histogram(df, x='prix_valeur', color='prix_nom')
price_hist_fig

Our price distributions are bimodal, with some fuels roughly half the price of others.

In [7]:
# Mean price per fuel product. The frame is aggregated before plotting, so each bar is
# built from exactly one row and its height is that mean either way. plotly's default
# aggregation when y is given is a sum, which would label the axis "sum of prix_valeur";
# histfunc='avg' keeps the bar heights and makes the label describe an average instead.
mean_price_by_fuel = (
    df[['prix_nom', 'prix_valeur']]
    .groupby(by='prix_nom')
    .mean()
    .reset_index()
)
histogram(data_frame=mean_price_by_fuel, x='prix_nom', y='prix_valeur', histfunc='avg')

In [8]:
from plotly.express import scatter
# Price updates recorded in 2024 over time, coloured by fuel product, with an
# ordinary-least-squares trendline requested from plotly.
prices_2024 = df[df['year'] == 2024]
scatter(prices_2024, x='prix_maj', y='prix_valeur', color='prix_nom', trendline='ols')

Are fuel prices rising in the new year? Yes, it appears they are. They certainly seem to be more volatile in the most recent weekly sample than in the other weekly samples in 2024, and the slope coefficients of all of our OLS trendlines are positive.