# Modeling and Regression Analysis

## Datasets

* [namq_10_gdp](https://doi.org/10.2908/NAMQ_10_GDP) - Eurostat: Quarterly GDP

* [prc_hicp_manr](https://doi.org/10.2908/PRC_HICP_MANR) - Eurostat: Monthly CPI inflation rate

* [lfsa_egan](https://doi.org/10.2908/LFSA_EGAN) - Eurostat: Annual employment


In [218]:
import os
from typing import Literal

import eurostat
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

In [219]:
COUNTRY_NAME = "Iceland"
COUNTRY_CODE = "IS"

ORIGINAL_DATA_DIR = "original_data"

In [220]:
def get_dataframe(eurostat_code: str, country_code: str = COUNTRY_CODE) -> pd.DataFrame:
    file_path = os.path.join(ORIGINAL_DATA_DIR, f"{eurostat_code}.csv")

    if os.path.exists(file_path):
        df = pd.read_csv(file_path)

    else:
        df = eurostat.get_data_df(eurostat_code)
        os.makedirs(ORIGINAL_DATA_DIR, exist_ok=True)
        df.to_csv(file_path, index=False)

    df = df[df["geo\\TIME_PERIOD"] == country_code]
    df.rename(columns={"geo\\TIME_PERIOD": "geo"}, inplace=True)

    return df


In [221]:
def fix_dates(src_df: pd.DataFrame) -> pd.DataFrame:
    df = src_df.copy()

    column_names = df.columns.tolist()
    time_period_index = column_names.index("geo")
    data_columns = column_names[time_period_index + 1 :]

    if df["freq"].iloc[0] == "M":
        df.rename(columns={col: f"{col[:4]}-{col[5:7]}-01" for col in data_columns}, inplace=True)
    elif df["freq"].iloc[0] == "Q":
        df.rename(columns={col: f"{col[:4]}-{(int(col[6]) - 1) * 3 + 1:02d}-01" for col in data_columns}, inplace=True)
    elif df["freq"].iloc[0] == "A":
        df.rename(columns={col: f"{col}-01-01" for col in data_columns}, inplace=True)

    return df


In [222]:
def remove_empty_columns(src_df: pd.DataFrame) -> pd.DataFrame:
    df = src_df.copy()
    df.dropna(axis=1, how="all", inplace=True)

    return df


In [223]:
def melt_dataframe(
    src_df: pd.DataFrame,
) -> pd.DataFrame:
    column_names = src_df.columns.tolist()
    time_period_index = column_names.index("geo")
    data_columns = column_names[time_period_index + 1 :]
    id_cols = column_names[: time_period_index + 1]

    df_melted = src_df.melt(id_vars=id_cols, value_vars=data_columns, var_name="time", value_name="value")
    df_melted = df_melted.drop(columns=["geo"])
    df_melted = df_melted.dropna(subset=["value"])
    df_melted["value"] = pd.to_numeric(df_melted["value"], errors="coerce")

    return df_melted


In [224]:
def drop_columns(src_df: pd.DataFrame, columns_to_drop: list[str]) -> pd.DataFrame:
    df = src_df.copy()
    df.drop(columns=columns_to_drop, inplace=True)

    return df


In [225]:
def filter_dataframe(src_df: pd.DataFrame, filter: dict[str, list[str]]) -> pd.DataFrame:
    df = src_df.copy()

    for column, values in filter.items():
        df = df[df[column].isin(values)]

    return df


In [226]:
def split_dataframe(
    df: pd.DataFrame,
    x_cols: list[str],
    y_col: str,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    data_train, data_test = train_test_split(df, test_size=0.2, shuffle=False)

    return data_train[x_cols], data_train[y_col], data_test[x_cols], data_test[y_col]


In [227]:
def create_model_from_data(
    x_df: pd.DataFrame,
    y_df: pd.DataFrame,
    model_type: Literal["linear_regression", "lasso_regression", "random_forest_regression"]
) -> object:
    # Train the model
    if model_type == "linear_regression":
        model = LinearRegression()
    elif model_type == "lasso_regression":
        model = Lasso()
    elif model_type == "random_forest_regression":
        model = RandomForestRegressor()
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    model.fit(x_df, y_df)
    return model


## namq_10_gdp -- Quarterly GDP

`CLV10_MEUR` - Chain linked volumes, base year 2010

`SCA` - Seasonally adjusted data

`B1GQ` - Gross domestic product at market prices


In [228]:
namq_10_gdp = get_dataframe("namq_10_gdp")
namq_10_gdp = fix_dates(namq_10_gdp)
namq_10_gdp = remove_empty_columns(namq_10_gdp)
namq_10_gdp = melt_dataframe(namq_10_gdp)
namq_10_gdp = drop_columns(namq_10_gdp, ["freq"])
namq_10_gdp = filter_dataframe(namq_10_gdp, {"unit": ["CLV10_MEUR"], "na_item": ["B1GQ"], "s_adj": ["SCA"]})

display(namq_10_gdp.head())


Unnamed: 0,unit,s_adj,na_item,time,value
95,CLV10_MEUR,SCA,B1GQ,1995-01-01,1578.1
1290,CLV10_MEUR,SCA,B1GQ,1995-04-01,1610.4
2485,CLV10_MEUR,SCA,B1GQ,1995-07-01,1625.4
3680,CLV10_MEUR,SCA,B1GQ,1995-10-01,1613.9
4875,CLV10_MEUR,SCA,B1GQ,1996-01-01,1671.2


### prc_hicp_manr -- Monthly CPI inflation rate

`RCH_A` - Rate of change (annual)

`CP00` - All-items index


In [229]:
prc_hicp_manr = get_dataframe("prc_hicp_manr")
prc_hicp_manr = fix_dates(prc_hicp_manr)
prc_hicp_manr = remove_empty_columns(prc_hicp_manr)
prc_hicp_manr = melt_dataframe(prc_hicp_manr)
prc_hicp_manr = drop_columns(prc_hicp_manr, ["freq"])
prc_hicp_manr = filter_dataframe(prc_hicp_manr, {"unit": ["RCH_A"], "coicop": ["CP00"]})

display(prc_hicp_manr.head())


Unnamed: 0,unit,coicop,time,value
0,RCH_A,CP00,1997-01-01,2.0
364,RCH_A,CP00,1997-02-01,1.9
728,RCH_A,CP00,1997-03-01,1.7
1092,RCH_A,CP00,1997-04-01,2.2
1456,RCH_A,CP00,1997-05-01,1.5


### lfsa_egan -- Anual employment

`THS_A` - Thousands of persons

`T` - Total

`Y15-64` - Age group 15-64 years

`TOTAL` - Total economic activity


In [230]:
lfsa_egan = get_dataframe("lfsa_egan")
lfsa_egan = fix_dates(lfsa_egan)
lfsa_egan = remove_empty_columns(lfsa_egan)
lfsa_egan = melt_dataframe(lfsa_egan)
lfsa_egan = drop_columns(lfsa_egan, ["freq"])
lfsa_egan = filter_dataframe(lfsa_egan, {"unit": ["THS_PER"], "age": ["Y15-64"], "sex": ["T"], "citizen": ["TOTAL"]})

display(lfsa_egan.head())


Unnamed: 0,unit,sex,age,citizen,time,value
439,THS_PER,T,Y15-64,TOTAL,1995-01-01,132.5
1058,THS_PER,T,Y15-64,TOTAL,1996-01-01,133.0
1677,THS_PER,T,Y15-64,TOTAL,1997-01-01,132.2
2296,THS_PER,T,Y15-64,TOTAL,1998-01-01,137.7
2915,THS_PER,T,Y15-64,TOTAL,1999-01-01,143.7


In [231]:
namq_10_gdp_r = namq_10_gdp[["time", "value"]].rename(columns={"value": "gdp_volume"})
prc_hicp_manr_r = prc_hicp_manr[["time", "value"]].rename(columns={"value": "cpi_inflation_rate"})
lfsa_egan_r = lfsa_egan[["time", "value"]].rename(columns={"value": "employment_thousands"})

data_r = pd.merge(
    namq_10_gdp_r,
    prc_hicp_manr_r,
    on="time"
)

data = pd.merge(
    data_r,
    lfsa_egan_r,
    on="time"
)

display(data.head())


Unnamed: 0,time,gdp_volume,cpi_inflation_rate,employment_thousands
0,1997-01-01,1728.8,2.0,132.2
1,1998-01-01,1724.7,2.2,137.7
2,1999-01-01,1952.6,0.4,143.7
3,2000-01-01,1996.8,4.6,150.2
4,2001-01-01,2066.4,3.4,151.8


## Training

### gdp_volume

#### Linear Regression

In [232]:
X_train, y_train, X_test, y_test = split_dataframe(data, ["cpi_inflation_rate", "employment_thousands"], "gdp_volume")
model_linear = create_model_from_data(X_train, y_train, model_type="linear_regression")

# Make predictions
y_pred = model_linear.predict(X_test)

# Evaluate the model
gdp_volume_linear_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {gdp_volume_linear_mse}")

gdp_volume_linear_r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {gdp_volume_linear_r2}")

gdp_volume_linear_mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {gdp_volume_linear_mae}")

# Display predictions vs actual values
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
display(results)


Mean Squared Error: 52936.37873689895
R^2 Score: 0.08078226440913361
Mean Absolute Error: 168.06982841281933


Unnamed: 0,Actual,Predicted
22,3461.2,3441.759801
23,3280.0,3242.97951
24,3209.5,3298.560943
25,3516.7,3718.641613
26,3850.8,4025.190816
27,3800.4,4286.964909


#### Lasso Regression

In [233]:
X_train, y_train, X_test, y_test = split_dataframe(data, ["cpi_inflation_rate", "employment_thousands"], "gdp_volume")
model_lasso = create_model_from_data(X_train, y_train, model_type="lasso_regression")

# Make predictions
y_pred = model_lasso.predict(X_test)

# Evaluate the model
gdp_volume_lasso_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {gdp_volume_lasso_mse}")

gdp_volume_lasso_r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {gdp_volume_lasso_r2}")

gdp_volume_lasso_mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {gdp_volume_lasso_mae}")

# Display predictions vs actual values
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
display(results)


Mean Squared Error: 52801.11121081329
R^2 Score: 0.08313112755380447
Mean Absolute Error: 167.8484407185962


Unnamed: 0,Actual,Predicted
22,3461.2,3441.621666
23,3280.0,3242.943165
24,3209.5,3298.389761
25,3516.7,3718.336984
26,3850.8,4024.658119
27,3800.4,4286.470611


#### Random Forest Regression

In [244]:
X_train, y_train, X_test, y_test = split_dataframe(data, ["cpi_inflation_rate", "employment_thousands", "gdp_volume"], "gdp_volume")
model_random_forest = create_model_from_data(X_train, y_train, model_type="random_forest_regression")

y_pred = model_random_forest.predict(X_test)

# Evaluate the model
gdp_volume_random_forest_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {gdp_volume_random_forest_mse}")

gdp_volume_random_forest_r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {gdp_volume_random_forest_r2}")

gdp_volume_random_forest_mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {gdp_volume_random_forest_mae}")

# Display predictions vs actual values
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
display(results)


Mean Squared Error: 263137.7510986699
R^2 Score: -3.5692752977224496
Mean Absolute Error: 452.55433333333673


Unnamed: 0,Actual,Predicted
22,3461.2,3068.458
23,3280.0,3078.449
24,3209.5,3059.313
25,3516.7,3068.536
26,3850.8,3064.259
27,3800.4,3064.259


### cpi_inflation_rate

#### Linear Regression

In [235]:
X_train, y_train, X_test, y_test = split_dataframe(data, ["gdp_volume", "employment_thousands"], "cpi_inflation_rate")
model_linear = create_model_from_data(X_train, y_train, model_type="linear_regression")

# Make predictions
y_pred = model_linear.predict(X_test)

# Evaluate the model
cpi_inflation_linear_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {cpi_inflation_linear_mse}")

cpi_inflation_linear_r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {cpi_inflation_linear_r2}")

cpi_inflation_linear_mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {cpi_inflation_linear_mae}")

# Display predictions vs actual values
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
display(results)


Mean Squared Error: 44.70186894807478
R^2 Score: -6.692114536258745
Mean Absolute Error: 5.319648849522637


Unnamed: 0,Actual,Predicted
22,1.9,2.661825
23,0.6,3.037441
24,3.5,1.215217
25,4.3,-1.208952
26,8.0,-0.56569
27,5.6,-6.759203


#### Lasso Regression

In [236]:
X_train, y_train, X_test, y_test = split_dataframe(data, ["gdp_volume", "employment_thousands"], "cpi_inflation_rate")
model_lasso = create_model_from_data(X_train, y_train, model_type="lasso_regression")

# Make predictions
y_pred = model_lasso.predict(X_test)

# Evaluate the model
cpi_inflation_lasso_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {cpi_inflation_lasso_mse}")

cpi_inflation_lasso_r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {cpi_inflation_lasso_r2}")

cpi_inflation_lasso_mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {cpi_inflation_lasso_mae}")

# Display predictions vs actual values
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
display(results)


Mean Squared Error: 35.73838628898397
R^2 Score: -5.149715149387804
Mean Absolute Error: 4.817707373724757


Unnamed: 0,Actual,Predicted
22,1.9,2.779047
23,0.6,3.111661
24,3.5,1.577127
25,4.3,-0.497709
26,8.0,0.018247
27,5.6,-5.213201


#### Random Forest Regression

In [237]:
X_train, y_train, X_test, y_test = split_dataframe(data, ["gdp_volume", "employment_thousands"], "cpi_inflation_rate")
model_random_forest = create_model_from_data(X_train, y_train, model_type="random_forest_regression")

y_pred = model_random_forest.predict(X_test)

# Evaluate the model
cpi_inflation_random_forest_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {cpi_inflation_random_forest_mse}")

cpi_inflation_random_forest_r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {cpi_inflation_random_forest_r2}")

cpi_inflation_random_forest_mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {cpi_inflation_random_forest_mae}")

# Display predictions vs actual values
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
display(results)


Mean Squared Error: 26.279667500000006
R^2 Score: -3.5220975574781326
Mean Absolute Error: 4.521166666666667


Unnamed: 0,Actual,Predicted
22,1.9,-0.545
23,0.6,-0.524
24,3.5,-0.523
25,4.3,-0.545
26,8.0,-0.545
27,5.6,-0.545


### employment_thousands

#### Linear Regression

In [238]:
X_train, y_train, X_test, y_test = split_dataframe(data, ["gdp_volume", "cpi_inflation_rate"], "employment_thousands")
model_linear = create_model_from_data(X_train, y_train, model_type="linear_regression")

# Make predictions
y_pred = model_linear.predict(X_test)

# Evaluate the model
employment_thousands_linear_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {employment_thousands_linear_mse}")

employment_thousands_linear_r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {employment_thousands_linear_r2}")

employment_thousands_linear_mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {employment_thousands_linear_mae}")

# Display predictions vs actual values
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
display(results)


Mean Squared Error: 75.62498472313304
R^2 Score: 0.4492657988362687
Mean Absolute Error: 6.408188051222187


Unnamed: 0,Actual,Predicted
22,190.6,189.686263
23,184.6,184.548337
24,185.6,181.617342
25,198.8,190.712183
26,207.6,199.829022
27,216.6,198.957725


#### Lasso Regression

In [239]:
X_train, y_train, X_test, y_test = split_dataframe(data, ["gdp_volume", "cpi_inflation_rate"], "employment_thousands")
model_lasso = create_model_from_data(X_train, y_train, model_type="lasso_regression")

# Make predictions
y_pred = model_lasso.predict(X_test)

# Evaluate the model
employment_thousands_lasso_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {employment_thousands_lasso_mse}")

employment_thousands_lasso_r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {employment_thousands_lasso_r2}")

employment_thousands_lasso_mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {employment_thousands_lasso_mae}")

# Display predictions vs actual values
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
display(results)


Mean Squared Error: 74.57820338657838
R^2 Score: 0.45688891817032373
Mean Absolute Error: 6.390411395189408


Unnamed: 0,Actual,Predicted
22,190.6,189.61762
23,184.6,184.423517
24,185.6,181.61045
25,198.8,190.742909
26,207.6,200.016811
27,216.6,199.046225


#### Random Forest Regression

In [240]:
X_train, y_train, X_test, y_test = split_dataframe(data, ["gdp_volume", "cpi_inflation_rate"], "employment_thousands")
model_random_forest = create_model_from_data(X_train, y_train, model_type="random_forest_regression")

y_pred = model_random_forest.predict(X_test)

# Evaluate the model
employment_thousands_random_forest_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {employment_thousands_random_forest_mse}")

employment_thousands_random_forest_r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {employment_thousands_random_forest_r2}")

employment_thousands_random_forest_mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {employment_thousands_random_forest_mae}")

# Display predictions vs actual values
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
display(results)


Mean Squared Error: 492.4415569999957
R^2 Score: -2.5861747080956112
Mean Absolute Error: 18.556333333333228


Unnamed: 0,Actual,Predicted
22,190.6,178.799
23,184.6,181.013
24,185.6,177.623
25,198.8,178.369
26,207.6,178.329
27,216.6,178.329


### Final Results

In [243]:
gdp_volume = {
    "linear_regression": {
        "mse": gdp_volume_linear_mse,
        "r2": gdp_volume_linear_r2,
        "mae": gdp_volume_linear_mae
    },
    "lasso_regression": {
        "mse": gdp_volume_lasso_mse,
        "r2": gdp_volume_lasso_r2,
        "mae": gdp_volume_lasso_mae
    },
    "random_forest_regression": {
        "mse": gdp_volume_random_forest_mse,
        "r2": gdp_volume_random_forest_r2,
        "mae": gdp_volume_random_forest_mae
    }
}

cpi_inflation_rate = {
    "linear_regression": {
        "mse": cpi_inflation_linear_mse,
        "r2": cpi_inflation_linear_r2,
        "mae": cpi_inflation_linear_mae
    },
    "lasso_regression": {
        "mse": cpi_inflation_lasso_mse,
        "r2": cpi_inflation_lasso_r2,
        "mae": cpi_inflation_lasso_mae
    },
    "random_forest_regression": {
        "mse": cpi_inflation_random_forest_mse,
        "r2": cpi_inflation_random_forest_r2,
        "mae": cpi_inflation_random_forest_mae
    }
}

employment_thousands = {
    "linear_regression": {
        "mse": employment_thousands_linear_mse,
        "r2": employment_thousands_linear_r2,
        "mae": employment_thousands_linear_mae
    },
    "lasso_regression": {
        "mse": employment_thousands_lasso_mse,
        "r2": employment_thousands_lasso_r2,
        "mae": employment_thousands_lasso_mae
    },
    "random_forest_regression": {
        "mse": employment_thousands_random_forest_mse,
        "r2": employment_thousands_random_forest_r2,
        "mae": employment_thousands_random_forest_mae
    }
}

display(pd.DataFrame(gdp_volume))
display(pd.DataFrame(cpi_inflation_rate))
display(pd.DataFrame(employment_thousands))


Unnamed: 0,linear_regression,lasso_regression,random_forest_regression
mse,52936.378737,52801.111211,249951.855737
r2,0.080782,0.083131,-3.340308
mae,168.069828,167.848441,439.100833


Unnamed: 0,linear_regression,lasso_regression,random_forest_regression
mse,44.701869,35.738386,26.279668
r2,-6.692115,-5.149715,-3.522098
mae,5.319649,4.817707,4.521167


Unnamed: 0,linear_regression,lasso_regression,random_forest_regression
mse,75.624985,74.578203,492.441557
r2,0.449266,0.456889,-2.586175
mae,6.408188,6.390411,18.556333
