In [43]:
import requests
import pandas as pd
import numpy as np
#import eurostat
import json
import os
import datetime as dt
from pathlib import Path
from scipy import stats


#path = Path("/Users/lorenz/Projects/GaR")

In [44]:
import os

# Report the directory the kernel is running in; the relative CSV paths used
# by the cells below are resolved against it.
working_dir = os.getcwd()
print("Working directory:", working_dir)

Working directory: /Users/lorenz/Projects/GaR/src


In [45]:
# Run the notebook from the project's src directory so that the relative CSV
# paths used below ("ip_by_sector.csv", "gas_data.csv") resolve.
# The GAR_SRC_DIR environment variable overrides the location on other
# machines; when it is unset the original hard-coded path is used.
src_dir = Path(os.environ.get("GAR_SRC_DIR", '/Users/lorenz/Projects/GaR/src'))
os.chdir(src_dir)

In [46]:
# IMPORT INDUSTRIAL PRODUCTION (IP) DATA

# Keep only the columns used downstream. The explicit .copy() makes `ip` an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
ip = pd.read_csv("ip_by_sector.csv")[['nace_r2', 'geo', 'TIME_PERIOD', 'OBS_VALUE', 'unit']].copy()

# Parse the period strings once (format "%Y-%m") and reuse the result for both
# the TIME_PERIOD column and the 'Time' column that becomes the index.
ip['TIME_PERIOD'] = pd.to_datetime(ip['TIME_PERIOD'], format="%Y-%m")
ip['Time'] = ip['TIME_PERIOD']

# Filter for dates between January 2004 and December 2023 (inclusive).
# The result is assigned back to `ip`: previously it went to a throw-away
# variable, so the sample restriction was never applied to the data that the
# following cells use.
start_date = "2004-01-01"
end_date = "2023-12-31"
ip = ip[(ip['TIME_PERIOD'] >= start_date) & (ip['TIME_PERIOD'] <= end_date)]

# Index by time without mutating in place, so re-running the cell is safe.
ip = ip.set_index('Time')

In [47]:
# Sector labels (values of the 'nace_r2' column) retained for the analysis
target_series = [
    'MIG - intermediate goods',
    'MIG - durable consumer goods',
    'MIG - non-durable consumer goods',
    'MIG - energy (except section E)',
    'MIG - capital goods',
    'Manufacture of chemicals and chemical products; basic pharmaceutical products and pharmaceutical preparations',
    'Manufacture of rubber and plastic products and other non-metallic mineral products',
    'Manufacture of wood, paper, printing and reproduction',
    'Manufacture of food products and beverages',
    'Manufacture of basic metals and fabricated metal products, except machinery and equipment',
    'Mining and quarrying; manufacturing; electricity, gas, steam and air conditioning supply'
]

# Select the rows of the chosen sectors and (re)parse TIME_PERIOD as datetime.
# Boolean selection followed by .assign yields a new frame, so `ip` itself is
# left untouched.
filtered_ip = (
    ip.loc[ip['nace_r2'].isin(target_series)]
    .assign(TIME_PERIOD=lambda frame: pd.to_datetime(frame['TIME_PERIOD'], format="%Y-%m"))
)

# Optional export of the filtered dataset (disabled)
#filtered_ip.to_csv("filtered_ip_sectors.csv", index=False)

In [48]:
# IMPORT GAS PRICES DATA

# Read the raw gas price file as-is. Column selection, date parsing and the
# monthly averaging are done in the following cell.
gas_csv = "gas_data.csv"
gasp = pd.read_csv(gas_csv)

In [49]:
# Extract and rename gas prices.
# rename() silently ignores a missing source column, so this cell can be
# re-run after `gasp` has already been reduced to ['Time', 'Prices'];
# previously a second run failed with a KeyError on the original column name.
# Building the frame in one chain also avoids assigning into a column slice
# (SettingWithCopyWarning).
gas_price_column = 'LSG Natural Gas TTF NL 1st Fut. Day - SETT. PRICE'
gasp = (
    gasp.rename(columns={gas_price_column: 'Prices'})[['Time', 'Prices']]
    .assign(Time=lambda frame: pd.to_datetime(frame['Time'], format="%m/%d/%y"))
)

# Compute monthly average prices.
# Only 'Prices' is averaged: a bare .mean() would also try to average the
# datetime 'Time' column, which depending on the pandas version is dropped or
# returned as a meaningless "mean timestamp" column that then leaks into the
# merged data.
month_gasp = gasp.groupby(gasp['Time'].dt.to_period('M'))[['Prices']].mean()
month_gasp.index = month_gasp.index.to_timestamp()  # period -> first day of the month
month_gasp = month_gasp.loc['2004-01':'2023-12']  # Limit range to match IP data

# --- Merge monthly gas prices with IP data ---
# Left join keeps every IP row; validate guards against duplicated months on
# the gas-price side, which would silently multiply IP rows.
df = filtered_ip.merge(
    month_gasp,
    left_on='TIME_PERIOD',
    right_index=True,
    how='left',
    validate='many_to_one',
)

In [39]:
# Store the low-cardinality label columns as categoricals.
df = df.astype({'geo': 'category', 'nace_r2': 'category', 'unit': 'category'})

# Subset for the EU-27 aggregate. The explicit .copy() makes df_ue an
# independent frame, so adding a column below does not raise
# SettingWithCopyWarning.
is_eu27 = df['geo'] == 'European Union - 27 countries (from 2020)'
df_ue = df.loc[is_eu27].copy()

# Look up the monthly gas price by each row's TIME_PERIOD. This replaces an
# assignment that relied on implicit alignment between df_ue's (duplicated)
# row index and month_gasp's index.
df_ue['GAS_PRICE'] = df_ue['TIME_PERIOD'].map(month_gasp['Prices'])

In [50]:
# Write the EU-27 subset to the folder read by the MATLAB code (row index omitted).
eu_output_path = "/Users/lorenz/Library/CloudStorage/OneDrive-UniversitàdegliStudidiMilano/PhD/Progetto/matlab/codice_ipgas/databysector.csv"
df_ue.to_csv(eu_output_path, index=False)

In [51]:
# Folder that receives one CSV per sector (created if it does not exist)
output_folder = "/Users/lorenz/Library/CloudStorage/OneDrive-UniversitàdegliStudidiMilano/PhD/Progetto/matlab/codice_ipgas/sector_datasets/"
os.makedirs(output_folder, exist_ok=True)

# Columns that, together with the sector, identify one time series.
# NOTE(review): assumes each (geo, unit) pair within a sector is a single
# monthly series with no missing geo/unit labels - confirm against the data.
series_keys = ['geo', 'unit']


def _lag_and_fill(series):
    """Return `series` lagged by one row, with the resulting gaps filled.

    Same fill chain as before (nearest-neighbour interpolation, then forward
    and backward fill); the only systematic gap is the first observation,
    which has no predecessor and therefore takes the next available value.
    """
    return series.shift(1).interpolate(method="nearest").ffill().bfill()


# Loop through each sector.
# The per-sector frame gets its own name (`sector_df`): the loop used to
# rebind `df`, which left `df` holding only the last sector afterwards and
# made this cell (and anything after it that reads `df`) non-re-runnable.
for series_name, group in df.groupby('nace_r2', observed=True):
    sector_df = group.copy()

    # Rename columns
    sector_df = sector_df.rename(columns={"OBS_VALUE": "IP", "Prices": "GAS_PRICE"})

    # Coerce IP and GAS_PRICE to numeric (unparseable entries become NaN)
    sector_df["IP"] = pd.to_numeric(sector_df["IP"], errors="coerce")
    sector_df["GAS_PRICE"] = pd.to_numeric(sector_df["GAS_PRICE"], errors="coerce")

    # Sort by series and then time to ensure proper lag alignment. With a
    # single (geo, unit) series this is the same order as sorting by
    # TIME_PERIOD alone.
    sector_df = sector_df.sort_values(series_keys + ["TIME_PERIOD"])

    # Lagged variables, computed within each (geo, unit) series so that a lag
    # never picks up the value of a different country or unit.
    # transform() returns rows in sector_df's order, so the values are
    # assigned positionally (the row index may contain duplicates).
    by_series = sector_df.groupby(series_keys, observed=True, sort=False)
    ip_lag = by_series["IP"].transform(_lag_and_fill).to_numpy()
    gas_lag = by_series["GAS_PRICE"].transform(_lag_and_fill).to_numpy()
    sector_df["IP_t1"] = ip_lag
    sector_df["GAS_PRICE_t1"] = gas_lag

    # Clean filename
    safe_name = series_name.replace(" ", "_").replace(";", "").replace(",", "").replace("/", "_")
    filename = f"{safe_name}.csv"

    # Save to CSV
    sector_df.to_csv(os.path.join(output_folder, filename), index=False)
    print(f"Saved: {filename}")

Saved: MIG_-_capital_goods.csv
Saved: MIG_-_durable_consumer_goods.csv
Saved: MIG_-_energy_(except_section_E).csv
Saved: MIG_-_intermediate_goods.csv
Saved: MIG_-_non-durable_consumer_goods.csv
Saved: Manufacture_of_basic_metals_and_fabricated_metal_products_except_machinery_and_equipment.csv
Saved: Manufacture_of_chemicals_and_chemical_products_basic_pharmaceutical_products_and_pharmaceutical_preparations.csv
Saved: Manufacture_of_food_products_and_beverages.csv
Saved: Manufacture_of_rubber_and_plastic_products_and_other_non-metallic_mineral_products.csv
Saved: Manufacture_of_wood_paper_printing_and_reproduction.csv
Saved: Mining_and_quarrying_manufacturing_electricity_gas_steam_and_air_conditioning_supply.csv
