In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Read the competition's training and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv"),
    pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv"),
)

In [3]:
# Distinct country values present in the training split.
train["country"].unique()

array(['Finland', 'Norway', 'Sweden'], dtype=object)

In [4]:
# Distinct country values present in the test split, for comparison with train.
test["country"].unique()

array(['Finland', 'Norway', 'Sweden'], dtype=object)

In [5]:
# Earliest and latest date covered by the training split.
train["date"].min(), train["date"].max()

('2015-01-01', '2018-12-31')

In [6]:
# Earliest and latest date covered by the test split.
test["date"].min(), test["date"].max()

('2019-01-01', '2019-12-31')

# Festivities

Holidays and festivities are important in time series analysis. If the data is realistic, they should matter in this competition too. Let's look at how we can obtain them for each country.

In [7]:
# Accumulator for [date, holiday name, country] rows; later cells append to it.
holiday_list = []

In [8]:
import holidays
import dateutil.easter as easter

# Print the Finnish holidays returned by the holidays package for 2015-2019
# and record each one as a [date, name, "Finland"] row.
print("--- FINLAND ---")
finland_holidays = holidays.Finland(years=[2015, 2016, 2017, 2018, 2019], observed=True)
for day, name in finland_holidays.items():
    print(str(day), name)
    holiday_list.append([day, name, "Finland"])

--- FINLAND ---
2016-01-01 Uudenvuodenpäivä
2016-01-06 Loppiainen
2016-03-25 Pitkäperjantai
2016-03-27 Pääsiäispäivä
2016-03-28 2. pääsiäispäivä
2016-05-01 Vappu
2016-05-05 Helatorstai
2016-05-15 Helluntaipäivä
2016-06-25 Juhannuspäivä
2016-11-05 Pyhäinpäivä
2016-12-06 Itsenäisyyspäivä
2016-12-25 Joulupäivä
2016-12-26 Tapaninpäivä
2016-06-24 Juhannusaatto
2016-12-24 Jouluaatto
2017-01-01 Uudenvuodenpäivä
2017-01-06 Loppiainen
2017-04-14 Pitkäperjantai
2017-04-16 Pääsiäispäivä
2017-04-17 2. pääsiäispäivä
2017-05-01 Vappu
2017-05-25 Helatorstai
2017-06-04 Helluntaipäivä
2017-06-24 Juhannuspäivä
2017-11-04 Pyhäinpäivä
2017-12-06 Itsenäisyyspäivä
2017-12-25 Joulupäivä
2017-12-26 Tapaninpäivä
2017-06-23 Juhannusaatto
2017-12-24 Jouluaatto
2018-01-01 Uudenvuodenpäivä
2018-01-06 Loppiainen
2018-03-30 Pitkäperjantai
2018-04-01 Pääsiäispäivä
2018-04-02 2. pääsiäispäivä
2018-05-01 Vappu
2018-05-10 Helatorstai
2018-05-20 Helluntaipäivä
2018-06-23 Juhannuspäivä
2018-11-03 Pyhäinpäivä
2018-12-06 It

In [9]:
# Print the Norwegian holidays returned by the holidays package for 2015-2019
# and record each one as a [date, name, "Norway"] row.
print("--- NORWAY ---")
norway_holidays = holidays.Norway(years=[2015, 2016, 2017, 2018, 2019], observed=True)
for day, name in norway_holidays.items():
    print(str(day), name)
    holiday_list.append([day, name, "Norway"])

--- NORWAY ---
2016-01-01 Første nyttårsdag
2016-05-01 Arbeidernes dag
2016-05-17 Grunnlovsdag
2016-12-25 Første juledag
2016-12-26 Andre juledag
2016-03-24 Skjærtorsdag
2016-03-25 Langfredag
2016-03-27 Første påskedag
2016-03-28 Andre påskedag
2016-05-05 Kristi himmelfartsdag
2016-05-15 Første pinsedag
2016-05-16 Andre pinsedag
2017-01-01 Første nyttårsdag
2017-05-01 Arbeidernes dag
2017-05-17 Grunnlovsdag
2017-12-25 Første juledag
2017-12-26 Andre juledag
2017-04-13 Skjærtorsdag
2017-04-14 Langfredag
2017-04-16 Første påskedag
2017-04-17 Andre påskedag
2017-05-25 Kristi himmelfartsdag
2017-06-04 Første pinsedag
2017-06-05 Andre pinsedag
2018-01-01 Første nyttårsdag
2018-05-01 Arbeidernes dag
2018-05-17 Grunnlovsdag
2018-12-25 Første juledag
2018-12-26 Andre juledag
2018-03-29 Skjærtorsdag
2018-03-30 Langfredag
2018-04-01 Første påskedag
2018-04-02 Andre påskedag
2018-05-10 Kristi himmelfartsdag
2018-05-20 Første pinsedag
2018-05-21 Andre pinsedag
2019-01-01 Første nyttårsdag
2019-05-

In [10]:
# Print the Swedish holidays returned by the holidays package for 2015-2019
# and record each one as a [date, name, "Sweden"] row.
print("--- SWEDEN ---")
sweden_holidays = holidays.Sweden(years=[2015, 2016, 2017, 2018, 2019], observed=True)
for day, name in sweden_holidays.items():
    # Entries named exactly 'Söndag' (Swedish for Sunday) carry no holiday of
    # their own, so they are skipped.
    if name == 'Söndag':
        continue
    # Strip the ", Söndag" suffix so only the holiday's own name is kept.
    label = name.replace(", Söndag", "")
    print(str(day), label)
    holiday_list.append([day, label, "Sweden"])

--- SWEDEN ---
2016-03-27 Påskdagen
2016-05-01 Första maj
2016-05-15 Pingstdagen
2016-12-25 Juldagen
2016-01-01 Nyårsdagen
2016-01-06 Trettondedag jul
2016-06-06 Sveriges nationaldag
2016-12-24 Julafton
2016-12-26 Annandag jul
2016-12-31 Nyårsafton
2016-03-25 Långfredagen
2016-03-28 Annandag påsk
2016-05-05 Kristi himmelsfärdsdag
2016-06-24 Midsommarafton
2016-06-25 Midsommardagen
2016-11-05 Alla helgons dag
2017-01-01 Nyårsdagen
2017-04-16 Påskdagen
2017-06-04 Pingstdagen
2017-12-24 Julafton
2017-12-31 Nyårsafton
2017-01-06 Trettondedag jul
2017-05-01 Första maj
2017-06-06 Sveriges nationaldag
2017-12-25 Juldagen
2017-12-26 Annandag jul
2017-04-14 Långfredagen
2017-04-17 Annandag påsk
2017-05-25 Kristi himmelsfärdsdag
2017-06-23 Midsommarafton
2017-06-24 Midsommardagen
2017-11-04 Alla helgons dag
2018-04-01 Påskdagen
2018-05-20 Pingstdagen
2018-01-01 Nyårsdagen
2018-01-06 Trettondedag jul
2018-05-01 Första maj
2018-06-06 Sveriges nationaldag
2018-12-24 Julafton
2018-12-25 Juldagen
201

Let's add some special dates and events that are meaningful for the competition.

In [11]:
# Hand-picked special dates for the competition. Every entry is appended to
# holiday_list as a [date, label, country] row, the same layout used for the
# rows collected from the holidays package.
# NOTE(review): the DataFrame cell keeps only the first row per (date, country),
# so a row added here survives only where no earlier row covers that pair.
SPECIAL_YEARS = [2015, 2016, 2017, 2018, 2019]
ALL_COUNTRIES = ['Finland', 'Sweden', 'Norway']


def add_special_date(day, label, countries):
    """Append one [date, label, country] row to holiday_list per country.

    Args:
        day: Anything pd.to_datetime can parse (a date string or a date object).
        label: Name stored in the 'holiday' column.
        countries: Iterable of country names the date applies to.
    """
    parsed_day = pd.to_datetime(day).date()
    for country in countries:
        holiday_list.append([parsed_day, label, country])


# Last week of the year: December 24 through 31, numbered day 1 to day 8.
for year in SPECIAL_YEARS:
    for day_number, day in enumerate(range(24, 32), start=1):
        add_special_date(f"{year}-12-{day}",
                         f"Last week of the year (day {day_number})",
                         ALL_COUNTRIES)

# Swedish Rock Concert: (first day, last day) in June for each year.
# Every edition spans four days, Wednesday through Saturday. The 2018 entry
# ends on June 9; ending it on June 10 (a Sunday) would add a fifth day that
# no other year has.
SWEDISH_ROCK_CONCERT_JUNE_DAYS = {
    2015: (3, 6),
    2016: (8, 11),
    2017: (7, 10),
    2018: (6, 9),
    2019: (5, 8),
}
for year, (start, end) in SWEDISH_ROCK_CONCERT_JUNE_DAYS.items():
    for day_number, day in enumerate(range(start, end + 1), start=1):
        add_special_date(f"{year}-6-{day}",
                         f"Swedish Rock Concert (day {day_number})",
                         ["Sweden"])

# Last Wednesday of June
for date in ['2015-06-24', '2016-06-29', '2017-06-28', '2018-06-27', '2019-06-26']:
    add_special_date(date, "Last Wednesday of June", ALL_COUNTRIES)

# First Sunday of November
for date in ['2015-11-1', '2016-11-6', '2017-11-5', '2018-11-4', '2019-11-3']:
    add_special_date(date, "First Sunday of November", ALL_COUNTRIES)

# Independence Day of Finland (December 6), recorded for Finland only.
for year in SPECIAL_YEARS:
    add_special_date(f"{year}-12-6", "Independence Day of Finland", ['Finland'])

# Easter Sunday, computed by dateutil for each year.
easter_date = [easter.easter(year) for year in SPECIAL_YEARS]
for date in easter_date:
    add_special_date(date, "Easter", ALL_COUNTRIES)
    

Finally, let's turn all the dates into a pandas DataFrame.

In [12]:
# Turn the collected rows into a DataFrame and keep a single label per
# (date, country) pair; when several rows share a pair, the one appended
# first wins.
holidays_df = (
    pd.DataFrame(holiday_list, columns=['date', 'holiday', 'country'])
    .drop_duplicates(subset=['date', 'country'], keep='first')
)
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column; pass index=False if downstream readers do not expect it.
holidays_df.to_csv("nordic_holidays.csv")

## Happy Kaggling!