In [18]:
import pandas as pd

**HICP - all items - annual average indices**

---
Harmonised Indices of Consumer Prices (HICPs) are designed for international comparisons of consumer price inflation. The HICP is used, for example, by the European Central Bank to monitor inflation in the Economic and Monetary Union and to assess inflation convergence as required under Article 121 of the Treaty of Amsterdam.

Source: https://ec.europa.eu/eurostat/databrowser/view/tec00027/default/table?lang=en

In [19]:
# Load the raw Eurostat HICP export from its relative path in the data folder.
raw_csv_path = '../../data/HICP_annual_indices_for_all_goods.csv'
df = pd.read_csv(raw_csv_path)

In [20]:
# Preview the first rows to check that the CSV parsed into the expected columns.
df.head()

Unnamed: 0,DATAFLOW,LAST UPDATE,freq,unit,coicop,geo,TIME_PERIOD,OBS_VALUE,OBS_FLAG,CONF_STATUS
0,ESTAT:TEC00027(1.0),19/03/25 11:00:00,Annual,Annual average index,All-items HICP,Albania,2016,101.51,d,
1,ESTAT:TEC00027(1.0),19/03/25 11:00:00,Annual,Annual average index,All-items HICP,Albania,2017,104.76,d,
2,ESTAT:TEC00027(1.0),19/03/25 11:00:00,Annual,Annual average index,All-items HICP,Albania,2018,106.59,d,
3,ESTAT:TEC00027(1.0),19/03/25 11:00:00,Annual,Annual average index,All-items HICP,Albania,2019,108.39,d,
4,ESTAT:TEC00027(1.0),19/03/25 11:00:00,Annual,Annual average index,All-items HICP,Albania,2020,110.74,d,


In [21]:
# Summarise dtypes and non-null counts to spot columns that are mostly or entirely empty.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479 entries, 0 to 478
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATAFLOW     479 non-null    object 
 1   LAST UPDATE  479 non-null    object 
 2   freq         479 non-null    object 
 3   unit         479 non-null    object 
 4   coicop       479 non-null    object 
 5   geo          479 non-null    object 
 6   TIME_PERIOD  479 non-null    int64  
 7   OBS_VALUE    479 non-null    float64
 8   OBS_FLAG     76 non-null     object 
 9   CONF_STATUS  0 non-null      float64
dtypes: float64(2), int64(1), object(7)
memory usage: 37.6+ KB


In [22]:
# Metadata columns that are not needed downstream. Labels that are not present
# in the frame are skipped instead of raising.
columns_to_drop = [
    "OBS_FLAG",
    "DATAFLOW",
    "CONF_STATUS",
    "freq",
    "incgrp",
    "unit",
    "LAST UPDATE",
    "coicop",
]
df = df.drop(columns=columns_to_drop, errors="ignore")

In [23]:
# Drop rows that lack any of the three fields the analysis relies on.
# The columns are named explicitly because a bare dropna() would delete every
# row if an entirely empty column (such as CONF_STATUS in the raw file) were
# still present, e.g. when cells are executed out of order.
df = df.dropna(subset=["geo", "TIME_PERIOD", "OBS_VALUE"])

In [24]:
# List the distinct values held by each remaining column.
for column_name, column_values in df.items():
    print(f"Uniques values in {column_name}: {column_values.unique()}")

Uniques values in geo: ['Albania' 'Austria' 'Belgium' 'Bulgaria' 'Switzerland' 'Cyprus' 'Czechia'
 'Germany' 'Denmark' 'Euro area - 19 countries  (2015-2022)'
 'Euro area – 20 countries (from 2023)' 'Estonia' 'Greece' 'Spain'
 'European Union - 27 countries (from 2020)' 'Finland' 'France' 'Croatia'
 'Hungary' 'Ireland' 'Iceland' 'Italy' 'Lithuania' 'Luxembourg' 'Latvia'
 'Montenegro' 'North Macedonia' 'Malta' 'Netherlands' 'Norway' 'Poland'
 'Portugal' 'Romania' 'Serbia' 'Sweden' 'Slovenia' 'Slovakia' 'Türkiye'
 'United Kingdom' 'United States' 'Kosovo*']
Uniques values in TIME_PERIOD: [2016 2017 2018 2019 2020 2021 2022 2023 2024 2013 2014 2015]
Uniques values in OBS_VALUE: [101.51 104.76 106.59 108.39 110.74 113.26 120.69 127.13 130.38  97.77
  99.2  100.   100.97 103.22 105.41 106.98 108.47 111.46 121.07 130.4
 134.21  98.9   99.38 101.77 104.03 106.44 107.77 108.23 111.71 123.26
 126.07 131.52 102.72 101.08  98.68  99.85 102.48 104.99 106.27 109.3
 123.52 134.15 137.63 100.83 100.8

In [25]:
# Labels of aggregate (multi-country) regions to exclude from the "geo" column.
# Each label is listed once and every entry ends with a comma: a missing comma
# makes Python silently join two adjacent string literals into a single label
# that matches nothing.
# The labels must match the data byte-for-byte; note the double space in the
# "19 countries" label and the en dash in the "20 countries" label.
aggregate_regions = [
    'European Union - 27 countries (from 2020)',
    'Euro area (EA11-1999, EA12-2001, EA13-2007, EA15-2008, EA16-2009, EA17-2011, EA18-2014, EA19-2015, EA20-2023)',
    'Euro area - 17 countries (2011-2013)',
    'European Union (EU6-1958, EU9-1973, EU10-1981, EU12-1986, EU15-1995, EU25-2004, EU27-2007, EU28-2013, EU27-2020)',
    'European Union - 15 countries (1995-2004)',
    'European Union - 27 countries (2007-2013)',
    'European Union - 28 countries (2013-2020)',
    'Euro area - 19 countries  (2015-2022)',
    'Euro area – 20 countries (from 2023)',
    'Euro area - 18 countries (2014)',
]
df = df[~df["geo"].isin(aggregate_regions)]

# Guard against leftover metadata columns; labels that are not present in the
# frame are skipped instead of raising.
columns_to_drop = ["freq", "indic_de", "statinfo", "age", "unit", 'isced11']
df = df.drop(columns=columns_to_drop, errors="ignore")

In [26]:
# Re-list the distinct values per column to confirm the result of the filtering step.
for column_name, column_values in df.items():
    print(f"Uniques values in {column_name}: {column_values.unique()}")

Uniques values in geo: ['Albania' 'Austria' 'Belgium' 'Bulgaria' 'Switzerland' 'Cyprus' 'Czechia'
 'Germany' 'Denmark' 'Estonia' 'Greece' 'Spain' 'Finland' 'France'
 'Croatia' 'Hungary' 'Ireland' 'Iceland' 'Italy' 'Lithuania' 'Luxembourg'
 'Latvia' 'Montenegro' 'North Macedonia' 'Malta' 'Netherlands' 'Norway'
 'Poland' 'Portugal' 'Romania' 'Serbia' 'Sweden' 'Slovenia' 'Slovakia'
 'Türkiye' 'United Kingdom' 'United States' 'Kosovo*']
Uniques values in TIME_PERIOD: [2016 2017 2018 2019 2020 2021 2022 2023 2024 2013 2014 2015]
Uniques values in OBS_VALUE: [101.51 104.76 106.59 108.39 110.74 113.26 120.69 127.13 130.38  97.77
  99.2  100.   100.97 103.22 105.41 106.98 108.47 111.46 121.07 130.4
 134.21  98.9   99.38 101.77 104.03 106.44 107.77 108.23 111.71 123.26
 126.07 131.52 102.72 101.08  98.68  99.85 102.48 104.99 106.27 109.3
 123.52 134.15 137.63 100.83 100.84  99.47 100.11 101.03 101.41 100.56
 101.04 103.74 106.1  107.25 101.84 101.57  98.78  99.45 100.23 100.78
  99.67 101.92 11

In [27]:
# Replace the Eurostat column codes with readable labels.
readable_labels = {"TIME_PERIOD": "Time period", "OBS_VALUE": "HICP value"}
df = df.rename(columns=readable_labels)

In [28]:
# Write the cleaned table to disk; the positional index is left out of the CSV.
processed_csv_path = "../../processed_data/HICP_annual_indices_for_all_goods.csv"
df.to_csv(processed_csv_path, index=False)