In [50]:
import pandas as pd
import os

In [42]:
# Values of `code_nature_culture` that load_dvf keeps; rows carrying any other
# code are filtered out.
culture_to_keep = [
    "VI", "VE", "TP", "T", "S", "PP",
    "PH", "PE", "PC", "PA", "P", "L",
]

# Columns read from each DVF file, mapped to the dtype handed to pandas.read_csv.
# Identifiers and codes are read as text, amounts/surfaces/coordinates as floats.
cols = dict(
    date_mutation=str,
    nature_mutation=str,
    valeur_fonciere=float,
    code_commune=str,
    id_parcelle=str,
    code_nature_culture=str,
    nature_culture=str,
    code_nature_culture_speciale=str,
    nature_culture_speciale=str,
    surface_terrain=float,
    longitude=float,
    latitude=float,
)

def load_dvf(year):
    """Download one year of geo-DVF data and keep the selected land-use rows.

    The gzipped CSV for ``year`` is read from files.data.gouv.fr, restricted to
    the columns listed in ``cols`` (with their dtypes), then filtered to rows
    whose ``code_nature_culture`` is in ``culture_to_keep``.

    Two columns are derived:

    * ``code_departement`` -- the leading characters of ``code_commune``: two
      in general, three when the commune code starts with "97" or "98"
      (overseas codes such as "971"), so that overseas territories are not all
      merged into a single "97" bucket.
    * ``month`` -- the first seven characters of ``date_mutation`` ("YYYY-MM").

    Parameters
    ----------
    year : int or str
        Year interpolated into the download URL.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with the two derived columns appended. The original
        row index from the CSV is preserved.
    """
    raw = pd.read_csv(
        f'https://files.data.gouv.fr/geo-dvf/latest/csv/{year}/full.csv.gz',
        compression="gzip",
        usecols=list(cols.keys()),
        dtype=cols,
    )
    # Take an explicit copy: the new columns are assigned onto the filtered
    # frame, which would otherwise be a potential view of `raw`
    # (SettingWithCopyWarning).
    kept = raw.loc[raw['code_nature_culture'].isin(culture_to_keep)].copy()

    commune = kept['code_commune']
    prefix2 = commune.str.slice(0, 2)
    # Overseas commune codes carry a three-character department code.
    is_overseas = prefix2.isin(["97", "98"])
    kept['code_departement'] = prefix2.where(~is_overseas, commune.str.slice(0, 3))
    kept["month"] = kept["date_mutation"].str.slice(0, 7)
    return kept

In [35]:
# Load the 2024 file through load_dvf and display the filtered frame.
# NOTE(review): the saved output of this cell lists a `code_dep` column, whereas
# load_dvf creates `code_departement`; the output presumably predates the current
# function -- re-run the cell to refresh it.
df = load_dvf(2024)
df

Unnamed: 0,date_mutation,nature_mutation,valeur_fonciere,code_commune,id_parcelle,code_nature_culture,nature_culture,code_nature_culture_speciale,nature_culture_speciale,surface_terrain,longitude,latitude,code_dep,month
0,2024-01-02,Vente,346.5,01076,010760000B0514,P,prés,,,99.0,5.530952,45.952439,01,2024-01
1,2024-01-03,Vente,10000.0,01103,011030000B1782,S,sols,,,115.0,6.043339,46.282256,01,2024-01
2,2024-01-08,Vente,249000.0,01203,012030000C1065,S,sols,,,497.0,4.911143,46.247235,01,2024-01
10,2024-01-09,Vente,20000.0,01185,011851860A0082,P,prés,,,2615.0,5.540564,45.892555,01,2024-01
18,2024-01-09,Vente,20000.0,01185,011851860A0307,L,landes,,,2496.0,5.542666,45.903499,01,2024-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1566392,2024-06-28,Vente,2700000.0,75113,75113000BK0002,S,sols,,,131.0,2.368075,48.832248,75,2024-06
1566393,2024-06-28,Vente,2700000.0,75113,75113000BK0002,S,sols,,,131.0,2.368075,48.832248,75,2024-06
1566394,2024-06-28,Vente,2700000.0,75113,75113000BK0002,S,sols,,,131.0,2.368075,48.832248,75,2024-06
1566395,2024-06-28,Vente,2700000.0,75113,75113000BK0002,S,sols,,,131.0,2.368075,48.832248,75,2024-06


In [7]:
# df[['nature_culture', 'nature_culture_speciale']].value_counts(dropna=False).reset_index().to_csv('natures_speciales.csv', index=False)

In [52]:
def create_stats(df):
    """Count rows per geographic code, month and land-use label.

    For every enabled scale, ``df`` is grouped by ``code_<scale>``, ``month``
    and ``nature_culture`` and the rows of each group are counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``month``, ``nature_culture`` and one ``code_<scale>``
        column per enabled scale.

    Returns
    -------
    pandas.DataFrame
        Columns ``code_geo``, ``month``, ``nature_culture``, ``nb_mutations``
        and ``echelle`` (the scale name), with a fresh 0..n-1 index.

    NOTE(review): every input row increments ``nb_mutations``, so several rows
    belonging to one sale are counted separately -- confirm this is intended.
    """
    # Scales to aggregate at; "commune" is currently not enabled.
    scales = ["departement"]

    def count_for(scale):
        # One tidy frame of counts for a single geographic scale.
        geo_key = f"code_{scale}"
        counts = (
            df.groupby([geo_key, "month", "nature_culture"])["nature_culture"]
            .count()
            .rename("nb_mutations")
            .reset_index()
        )
        counts["echelle"] = scale
        return counts.rename(columns={geo_key: "code_geo"})

    return pd.concat([count_for(scale) for scale in scales], ignore_index=True)

In [None]:
# Build stats.csv year by year so only one year of raw data is held at a time.
# Remove any previous export first, so an early failure cannot leave an old
# file that looks like a fresh result.
if os.path.isfile('stats.csv'):
    os.remove('stats.csv')
for year in range(2019, 2025):
    first_year = year == 2019
    stats = create_stats(load_dvf(year))
    stats.to_csv(
        "stats.csv",
        index=False,
        mode="w" if first_year else "a",
        # Write the column names only once: to_csv emits a header by default,
        # which in append mode would insert a header line before every year
        # and turn those lines into data rows when the file is read back.
        header=first_year,
    )
    print("Done with", year)

Done with 2019
Done with 2020
Done with 2021
Done with 2022
Done with 2023


In [None]:
# Reload the aggregated export from disk and display it. `code_geo` is read as
# str so that codes are kept as text rather than parsed as numbers.
stats = pd.read_csv('stats.csv',dtype={"code_geo": str})
stats