In [2]:
import pandas as pd 
import numpy as np 

# Train test spliting 
from sklearn.model_selection import train_test_split

# Linear models 
from sklearn.linear_model import LinearRegression 

# Multilabel binarizer 
from sklearn.preprocessing import MultiLabelBinarizer

# Iteration tracking 
from tqdm import tqdm 

# Raw data reading 

In [3]:
d = pd.read_excel('input/residents_prognozavimui.xlsx') 

In [4]:
d.head()

Unnamed: 0,gender,dateOfBirth,countryOfBirth,registrationDate,maritalStatus,terRegCode,streetCode,building,house,apartment,eldership,eldershipId,subeldership,subeldershipId,streetname,postalcode
0,Vyras,1909-01-18,RUS,1989-12-12,Išsituokęs,G1,272.0,,97,44,Pašilaičiai,7.0,J. Baltrušaičio,704.0,Laisvės pr.,LT-06120
1,Vyras,1913-08-01,LTU,1996-05-09,Vedęs,G1,492.0,,28,35,Fabijoniškės,4.0,Jovaro,401.0,S. Stanevičiaus g.,LT-07102
2,Vyras,1916-07-01,LTU,1993-07-12,Vedęs,G1,549.0,,30,13,Senamiestis,16.0,Universiteto,1604.0,Totorių g.,LT-01144
3,Vyras,1916-09-12,LTU,1999-12-21,Našlys,G1,203.0,,76,41,Justiniškės,8.0,M. Biržiškos,804.0,Justiniškių g.,LT-05233
4,Vyras,1921-06-18,BLR,2017-11-02,Našlys,G1,233.0,,9,9,Žirmūnai,2.0,Kazliškių,202.0,Kazliškių g.,LT-09203


In [5]:
d.dropna(subset=['dateOfBirth', 'registrationDate'], inplace=True) 
d['ageAtRegistration'] = (d['registrationDate'] - d['dateOfBirth']).dt.days // 365

# Binning the ages to bins of 10 years
d['age_bin'] = pd.cut(d['ageAtRegistration'], bins=np.arange(-1, 120, 10))
d['age_bin'] = d['age_bin'].astype(str)

# Converting registrationDate to YYYY-MM format 
d['timestep'] = d['registrationDate'].dt.to_period('M')

# Converting the time steps to integers 
min_timestep = d['timestep'].min()
max_timestep = d['timestep'].max() 
timestep_sequence = pd.period_range(min_timestep, max_timestep, freq='M') 
timestep_df = pd.DataFrame({'timestep': timestep_sequence, 'timestep_int': np.arange(len(timestep_sequence))}) 
d = pd.merge(d, timestep_df, on='timestep', how='left') 

# Creating modeling data 

In [6]:
# Defining a list of dummy features 
dummy_features = ['age_bin', 'gender', 'eldership']

# Dropping the missing rows in the dummy features
d = d.dropna(subset=dummy_features)

dd = d[['timestep_int'] + dummy_features].copy() 
dd = dd.groupby(['timestep_int'] + dummy_features).size().reset_index(name='count')

# Creating the dummy data 
dd = pd.get_dummies(dd, columns=dummy_features)

# Spliting to train and test frames 
train, test = train_test_split(dd, test_size=0.2, random_state=42) 

# Creating the x, y pairs
X_train = train.drop('count', axis=1)
y_train = train['count']

X_test = test.drop('count', axis=1)
y_test = test['count']

print(train.shape, test.shape)

(98111, 36) (24528, 36)


## Linear regression model 

In [7]:
model = LinearRegression()
model.fit(X_train, y_train) 

# Saving the features 
features = X_train.columns

# Predicting 
yhat = model.predict(X_train)

# Calculating the metrics 
errors = y_train - yhat 
abs_errors = np.abs(errors)
sq_errors = errors ** 2 
print('Mean absolute error:', abs_errors.mean())
print('Mean squared error:', sq_errors.mean()) 

# Extracting the coefficients 
coefs = pd.Series(model.coef_, index=features) 
coefs = coefs.sort_values(ascending=False) 

# Adding the intercept
coefs['intercept'] = model.intercept_ 

Mean absolute error: 3.6177190560531662
Mean squared error: 41.4979368659019


In [8]:
# Predicting on the test set 
yhat_test = model.predict(X_test) 

# Calculating the metrics
errors_test = y_test - yhat_test
abs_errors_test = np.abs(errors_test)
sq_errors_test = errors_test ** 2
print('Mean absolute error (test):', abs_errors_test.mean())
print('Mean squared error (test):', sq_errors_test.mean())

Mean absolute error (test): 3.5523331822860436
Mean squared error (test): 38.81258623879962


# Forecasting 

The forecasting strategy is to create dummy presentations for each gender, age group and municipality and to create an integer timestep into the future. 

In [9]:
# Getting maximum date in the dataset 
max_date = d['registrationDate'].max() 

# Defining the number of maximum months to forecast ahead 
n_months = 24 

# Creating the future time steps 
future_timesteps = pd.period_range(max_date, periods=n_months, freq='M') 
future_timesteps_df = pd.DataFrame({'timestep': future_timesteps, 'timestep_int': np.arange(len(timestep_sequence), len(timestep_sequence) + n_months)})

# Extracting the unique timesteps 
unique_timesteps = future_timesteps_df['timestep_int'].unique()

In [10]:
# For each unique dummy feature, getting all the unique values 
dummy_features_values = {}
for dummy_feature in dummy_features:
    dummy_features_values[dummy_feature] = d[dummy_feature].unique().tolist()

# Creating the meshed grid of all possible combinations of the dummy features
from itertools import product
meshed_grid = list(product(*dummy_features_values.values()))

print('Number of possible combinations:', len(meshed_grid))

Number of possible combinations: 462


In [12]:
# Iterating over each timestep to the future and predicing the counts
future_predictions = []
for timestep_int in tqdm(unique_timesteps):
    # Iterating over all the tuples 
    for obs in meshed_grid:
        # Creating the prediction frame
        prediction_df = pd.DataFrame({'timestep_int': [timestep_int], **dict(zip(dummy_features, obs))})

        # Creating the dummy data
        prediction_df = pd.get_dummies(prediction_df, columns=dummy_features)

        # Ensuring the columns are the same as the training columns
        missing_columns = set(features) - set(prediction_df.columns)
        for column in missing_columns:
            prediction_df[column] = 0

        # Sorting the columns
        prediction_df = prediction_df[features]

        # Predicting
        prediction = model.predict(prediction_df) 

        # Appending the prediction
        future_predictions.append({'timestep_int': timestep_int, **dict(zip(dummy_features, obs)), 'count': prediction[0]})

100%|██████████| 24/24 [00:59<00:00,  2.48s/it]


In [13]:
# Creating a dataframe out of the predictions 
future_predictions_df = pd.DataFrame(future_predictions)

# Giving the timesteps the correct format
future_predictions_df = pd.merge(future_predictions_df, future_timesteps_df, on='timestep_int', how='left') 

In [14]:
future_predictions_df

Unnamed: 0,timestep_int,age_bin,gender,eldership,count,timestep
0,908,"(79, 89]",Vyras,Pašilaičiai,4.007964,2024-06
1,908,"(79, 89]",Vyras,Fabijoniškės,2.801377,2024-06
2,908,"(79, 89]",Vyras,Senamiestis,1.223508,2024-06
3,908,"(79, 89]",Vyras,Justiniškės,0.892856,2024-06
4,908,"(79, 89]",Vyras,Žirmūnai,3.174376,2024-06
...,...,...,...,...,...,...
11083,931,"(-1, 9]",Moteris,Verkiai,16.453900,2026-05
11084,931,"(-1, 9]",Moteris,Šnipiškės,11.255871,2026-05
11085,931,"(-1, 9]",Moteris,Naujininkai,13.574821,2026-05
11086,931,"(-1, 9]",Moteris,Vilkpėdė,10.978170,2026-05
