In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import json

# Load the raw holidays table from the working directory.
# No dtype override is passed: the former dtype={'column_name': 'str'} was a
# template placeholder naming a column that does not exist in this file
# (see the df.info() listing below), so it had no effect on the parsed frame.
df = pd.read_csv("holidays.csv")

In [2]:
# Preview the first rows of the raw frame as the cell's displayed output.
df.head()

Unnamed: 0,country,date_parsed,weekday,day,month,name_of_holiday,date_of_holiday,year_of_event,independence_from,event_commemorated_and_notes,year,date_mdy
0,Afghanistan,1919-08-19,Tuesday,19.0,Aug,Afghan Independence Day (Afghan Victory Day),August 19,1919.0,United Kingdom,Anglo-Afghan Treaty of 1919 or Treaty of Rawal...,1919.0,"August 19, 1919"
1,Albania,1912-11-28,Thursday,28.0,Nov,Flag Day (Dita e Flamurit),November 28,1912.0,Ottoman Empire,Albanian Declaration of Independence. The foll...,1912.0,"November 28, 1912"
2,Algeria,1962-07-05,Thursday,5.0,Jul,Independence Day,July 5,1962.0,France,Algeria gained independence following the Alge...,1962.0,"July 5, 1962"
3,Andorra,,,,,,,,,,,
4,Angola,1975-11-11,Tuesday,11.0,Nov,Independence Day,November 11,1975.0,Portugal,"The Alvor Agreement, signed on 15 January 1975...",1975.0,"November 11, 1975"


In [3]:
# Print column names, non-null counts and dtypes to see where data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       216 non-null    object 
 1   date_parsed                   189 non-null    object 
 2   weekday                       189 non-null    object 
 3   day                           189 non-null    float64
 4   month                         189 non-null    object 
 5   name_of_holiday               150 non-null    object 
 6   date_of_holiday               189 non-null    object 
 7   year_of_event                 174 non-null    float64
 8   independence_from             181 non-null    object 
 9   event_commemorated_and_notes  145 non-null    object 
 10  year                          189 non-null    float64
 11  date_mdy                      189 non-null    object 
dtypes: float64(3), object(9)
memory usage: 20.4+ KB


There are too many columns and a lot of repetition, so I am dropping the following columns for simplicity.

In [4]:
# Columns judged repetitive for this analysis; everything else is kept.
redundant_columns = [
    'weekday',
    'day',
    'date_mdy',
    'year_of_event',
    'event_commemorated_and_notes',
    'year',
]
df = df.drop(redundant_columns, axis=1)

# Show the last rows to confirm the remaining columns.
df.tail()

Unnamed: 0,country,date_parsed,month,name_of_holiday,date_of_holiday,independence_from
211,Venezuela,1811-07-05,Jul,Independence Day,July 5,Spanish Empire
212,Vietnam,1945-09-02,Sep,Independence Day,September 2,Empire of Japan and France
213,Yemen,1967-11-30,Nov,,November 30,United Kingdom
214,Zambia,1964-10-24,Oct,Independence Day,October 24,United Kingdom
215,Zimbabwe,1980-04-18,Apr,Independence Day,April 18,United Kingdom


In [5]:
# China appears twice in the set: drop the row labelled 42 and renumber the
# index from 0 so the hard-coded row labels used below line up.
# NOTE: not idempotent - re-running this cell drops whatever row is then at 42.
df = df.drop(index=42).reset_index(drop=True)

I then went through each row and checked it for completeness. Where I found NaNs, I researched the missing values and filled them in. I only retained code cells that were used to alter the data.

In [6]:
# Andorra (row label 3): fill in the fields that were missing in the raw data.
andorra_values = {
    'country': 'Andorra',
    'date_parsed': '1993-09-08',
    'month': 'September',
    'name_of_holiday': 'National Day',
    'date_of_holiday': 'September 8',
    'independence_from': 'France',
}
for column, value in andorra_values.items():
    df.loc[3, column] = value

In [7]:
# Australia (row label 9): fill in the fields that were missing in the raw data.
# 'None' is the literal string used throughout for "no former ruling power".
# NOTE(review): Australia Day is usually dated to 26 January 1788 - confirm
# that the year 1808 is intended.
australia_values = {
    'country': 'Australia',
    'date_parsed': '1808-01-26',
    'month': 'January',
    'name_of_holiday': 'Australia Day',
    'date_of_holiday': 'January 26',
    'independence_from': 'None',
}
for column, value in australia_values.items():
    df.loc[9, column] = value

In [8]:
# Austria (row label 10): fill in the missing 'independence_from' value.
df.at[10, 'independence_from'] = 'Allied occupying powers'

In [9]:
# The Bahamas (row label 13): set the country name.
df.at[13, 'country'] = 'The Bahamas'

In [10]:
# Bhutan (row label 21): fill in the fields that were missing in the raw data.
bhutan_values = {
    'country': 'Bhutan',
    'date_parsed': '1907-12-17',
    'month': 'December',
    'name_of_holiday': 'National Day',
    'date_of_holiday': 'December 17',
    'independence_from': 'None',
}
for column, value in bhutan_values.items():
    df.loc[21, column] = value

In [11]:
# Cameroon (row label 33): fill in the fields that were missing in the raw data.
cameroon_values = {
    'date_parsed': '1972-05-20',
    'month': 'May',
    'name_of_holiday': 'National Day',
    'date_of_holiday': 'May 20',
    'independence_from': 'France',
}
for column, value in cameroon_values.items():
    df.loc[33, column] = value

In [12]:
# Canada (row label 34): fill in the fields that were missing in the raw data.
# date_parsed previously read '1867-01-1', which contradicted the 'July' /
# 'July 1' values set alongside it; it is now the matching zero-padded
# YYYY-MM-DD date.
df.loc[34, 'date_parsed'] = '1867-07-01'
df.loc[34, 'month'] = 'July'
df.loc[34, 'name_of_holiday'] = 'Canada Day'
df.loc[34, 'date_of_holiday'] = 'July 1'
df.loc[34, 'independence_from'] = 'None'

In [13]:
# China (row label 41): fill in the fields that were missing in the raw data.
# date_parsed stays a plain string, so it is written zero-padded as YYYY-MM-DD
# to match the rest of the column (was '1949-10-1').
df.loc[41, 'date_parsed'] = '1949-10-01'
df.loc[41, 'month'] = 'October'
df.loc[41, 'name_of_holiday'] = 'China Day'
df.loc[41, 'date_of_holiday'] = 'October 1'
df.loc[41, 'independence_from'] = 'None'
df.loc[41, 'country'] = 'China'

In [14]:
# Two instances of Republic of Congo: drop the row labelled 46 and renumber
# the index so the row labels used in the following cells line up.
# NOTE: not idempotent - re-running this cell drops whatever row is then at 46.
df = df.drop(index=46).reset_index(drop=True)

In [15]:
# Denmark (row label 53): fill in the fields that were missing in the raw data.
# date_parsed is written zero-padded as YYYY-MM-DD to match the rest of the
# string column (was '1953-06-5').
df.loc[53, 'date_parsed'] = '1953-06-05'
df.loc[53, 'month'] = 'June'
df.loc[53, 'name_of_holiday'] = 'Constitution Day'
df.loc[53, 'date_of_holiday'] = 'June 5'
df.loc[53, 'independence_from'] = 'None'

In [16]:
# Dominican Republic (row label 56): set the holiday name.
df.at[56, 'name_of_holiday'] = "Independence Day"

In [17]:
# East Timor (row label 57): set the holiday name.
df.at[57, 'name_of_holiday'] = "Independence Day"

In [18]:
# Ecuador (row label 59): set the holiday name.
df.at[59, 'name_of_holiday'] = "Independence Day"

In [19]:
# Egypt (row label 60): set the date, month, holiday and former-power fields.
egypt_values = {
    'date_parsed': "1952-07-23",
    'month': "July",
    'name_of_holiday': "Revolution Day",
    'date_of_holiday': "July 23",
    'independence_from': "None",
}
for column, value in egypt_values.items():
    df.loc[60, column] = value

In [20]:
# Equatorial Guinea (row label 62): set the holiday name.
df.at[62, 'name_of_holiday'] = "Independence Day"

In [21]:
# Estonia (row label 65): set the holiday name and the former ruling power.
df.at[65, 'name_of_holiday'] = 'Day of Restoration of Independence'
df.at[65, 'independence_from'] = 'Soviet Union'

In [22]:
# Ethiopia (row label 67): set the date, month, holiday and former-power fields.
# date_parsed is written zero-padded as YYYY-MM-DD to match the rest of the
# string column (was "1941-05-5").
df.loc[67, 'date_parsed'] = "1941-05-05"
df.loc[67, 'month'] = "May"
df.loc[67, 'name_of_holiday'] = "Independence Day"
df.loc[67, 'date_of_holiday'] = "May 5"
df.loc[67, 'independence_from'] = "Italy"

In [23]:
# Gabon (row label 71): set the holiday name.
df.at[71, 'name_of_holiday'] = "Independence Day"

In [24]:
# Gambia (row label 72): set the holiday name.
df.at[72, 'name_of_holiday'] = "National Day"

In [25]:
# Germany (row label 75): set the date, month, holiday and former-power fields.
# date_parsed is written zero-padded as YYYY-MM-DD to match the rest of the
# string column (was "1990-10-3").
df.loc[75, 'date_parsed'] = "1990-10-03"
df.loc[75, 'month'] = "October"
df.loc[75, 'name_of_holiday'] = "Unity Day"
df.loc[75, 'date_of_holiday'] = "October 3"
df.loc[75, 'independence_from'] = "None"

In [26]:
# France (row label 70): record that there is no former ruling power.
df.at[70, 'independence_from'] = "None"

In [27]:
# Greece (row label 77): set the holiday name.
df.at[77, 'name_of_holiday'] = "Independence Day"

In [28]:
# Guinea (row label 80): set the holiday name.
df.at[80, 'name_of_holiday'] = "Independence Day"

In [29]:
# Guyana (row label 82): set the holiday name.
df.at[82, 'name_of_holiday'] = "Independence Day"

In [30]:
# Hungary (row label 85): set the former ruling power and the holiday date.
df.at[85, 'independence_from'] = "Soviet Union"
df.at[85, 'date_of_holiday'] = "June 19"

In [31]:
# India (row label 87): set the former ruling power.
df.at[87, 'independence_from'] = "United Kingdom"

In [32]:
# Iran (row label 90): no former ruling power; set the holiday name.
df.at[90, 'independence_from'] = "None"
df.at[90, 'name_of_holiday'] = "Islamic Republic Day"

In [33]:
# Iraq (row label 91): set the holiday name.
df.at[91, 'name_of_holiday'] = "National Day"

In [34]:
# Ireland (row label 92): set the former ruling power and the holiday name.
df.at[92, 'independence_from'] = "United Kingdom"
df.at[92, 'name_of_holiday'] = "Republic Day"

In [35]:
# Israel (row label 93): set the former ruling power.
df.at[93, 'independence_from'] = "United Kingdom"

In [36]:
# Italy (row label 94): set the date, month, holiday and former-power fields.
# date_parsed is written zero-padded as YYYY-MM-DD to match the rest of the
# string column (was "1947-06-2").
df.loc[94, 'date_parsed'] = "1947-06-02"
df.loc[94, 'month'] = "June"
df.loc[94, 'name_of_holiday'] = "Republic Day"
df.loc[94, 'date_of_holiday'] = "June 2"
df.loc[94, 'independence_from'] = "None"

In [37]:
# Ivory Coast (row label 95): set the holiday name.
df.at[95, 'name_of_holiday'] = "Independence Day"

In [38]:
# Jamaica (row label 96): set the holiday name.
df.at[96, 'name_of_holiday'] = "Independence Day"

In [39]:
# Japan (row label 97): set the date, month, holiday and former-power fields.
# 'date_of_holiday' is not set here; a later "Japan fix" cell assigns it.
japan_values = {
    'date_parsed': "1966-02-11",
    'month': "February",
    'name_of_holiday': "Foundation Day",
    'independence_from': "None",
}
for column, value in japan_values.items():
    df.loc[97, column] = value

In [40]:
# Kiribati (row label 101): set the holiday name.
df.at[101, 'name_of_holiday'] = "National Day"

In [41]:
# Kuwait (row label 105): set the holiday name.
df.at[105, 'name_of_holiday'] = "National Day"

In [42]:
# Laos (row label 107): set the date, month, holiday and former-power fields.
# date_parsed is written zero-padded as YYYY-MM-DD to match the rest of the
# string column (was "1975-12-2").
df.loc[107, 'date_parsed'] = "1975-12-02"
df.loc[107, 'month'] = "December"
df.loc[107, 'name_of_holiday'] = "National Day"
df.loc[107, 'date_of_holiday'] = "December 2"
df.loc[107, 'independence_from'] = "None"

In [43]:
# Latvia (row label 109): set the holiday name.
df.at[109, 'name_of_holiday'] = "Independence Day"

In [44]:
# Lesotho (row label 111): set the holiday name.
df.at[111, 'name_of_holiday'] = "Independence Day"

In [45]:
# Libya (row label 113): set the holiday name.
df.at[113, 'name_of_holiday'] = "Independence Day"

In [46]:
# Liechtenstein (row label 114): set the date, month, holiday and former-power fields.
liechtenstein_values = {
    'date_parsed': "1940-08-15",
    'month': "August",
    'name_of_holiday': "National Day",
    'date_of_holiday': "August 15",
    'independence_from': "None",
}
for column, value in liechtenstein_values.items():
    df.loc[114, column] = value

In [47]:
# Luxembourg (row label 117): set the date, month, holiday and former-power fields.
luxembourg_values = {
    'date_parsed': "1961-06-23",
    'month': "June",
    'name_of_holiday': "National Day",
    'date_of_holiday': "June 23",
    'independence_from': "None",
}
for column, value in luxembourg_values.items():
    df.loc[117, column] = value

In [48]:
# Madagascar (row label 118): set the holiday name.
df.at[118, 'name_of_holiday'] = "National Day"

In [49]:
# Malawi (row label 119): set the holiday name.
df.at[119, 'name_of_holiday'] = "Malawi Day"

In [50]:
# Maldives (row label 121): set the holiday name.
df.at[121, 'name_of_holiday'] = "National Day"

In [51]:
# Mali (row label 122): set the holiday name.
df.at[122, 'name_of_holiday'] = "Republic Day"

In [52]:
# Marshall Islands (row label 124): set the date, month, holiday and
# former-power fields. date_parsed is written zero-padded as YYYY-MM-DD to
# match the rest of the string column (was "1979-05-1").
# "America" is normalised to 'United States' by the later renaming cell.
df.loc[124, 'date_parsed'] = "1979-05-01"
df.loc[124, 'month'] = "May"
df.loc[124, 'name_of_holiday'] = "Constitution Day"
df.loc[124, 'date_of_holiday'] = "May 1"
df.loc[124, 'independence_from'] = "America"

In [53]:
# Mauritania (row label 125): set the holiday name.
df.at[125, 'name_of_holiday'] = "Independence Day"

In [54]:
# Mauritius (row label 126): set the holiday name.
# The original comment repeated "Mauritania"; the value set here is
# "Mauritius Day" and Mauritania is the preceding row (125).
df.at[126, 'name_of_holiday'] = "Mauritius Day"

In [55]:
# Monaco (row label 130): set the date, month, holiday and former-power fields.
monaco_values = {
    'date_parsed': "1979-11-19",
    'month': "November",
    'name_of_holiday': "National Day",
    'date_of_holiday': "November 19",
    'independence_from': "None",
}
for column, value in monaco_values.items():
    df.loc[130, column] = value

In [56]:
# Montenegro (row label 132): set the holiday name.
df.at[132, 'name_of_holiday'] = "Independence Day"

In [57]:
# Morocco (row label 133): set the former ruling power.
df.at[133, 'independence_from'] = "Spain"

In [58]:
# Namibia (row label 137): set the holiday name.
df.at[137, 'name_of_holiday'] = "Independence Day"

In [59]:
# Nauru (row label 138): set the holiday name and the former ruling power.
df.at[138, 'name_of_holiday'] = "Independence Day"
df.at[138, 'independence_from'] = "Australia"

In [60]:
# Nepal (row label 139): set the date, month, holiday and former-power fields.
nepal_values = {
    'date_parsed': "2008-05-28",
    'month': "May",
    'name_of_holiday': "Republic Day",
    'date_of_holiday': "May 28",
    'independence_from': "None",
}
for column, value in nepal_values.items():
    df.loc[139, column] = value

In [61]:
# Netherlands (row label 140): set the country name, holiday name and former ruling power.
netherlands_values = {
    'country': "Netherlands",
    'name_of_holiday': "King's Day",
    'independence_from': "Spain",
}
for column, value in netherlands_values.items():
    df.loc[140, column] = value

In [62]:
# New Zealand (row label 141): set the date, month, holiday and former-power
# fields. date_parsed is written zero-padded as YYYY-MM-DD to match the rest
# of the string column (was "1974-02-6").
df.loc[141, 'date_parsed'] = "1974-02-06"
df.loc[141, 'month'] = "February"
df.loc[141, 'name_of_holiday'] = "Waitangi Day"
df.loc[141, 'date_of_holiday'] = "February 6"
df.loc[141, 'independence_from'] = "None"

In [63]:
# North Macedonia (row label 145): set the holiday name.
df.at[145, 'name_of_holiday'] = "Independence Day"

In [64]:
# Palau (row label 150): set the date, month, holiday and former-power fields.
# date_parsed is written zero-padded as YYYY-MM-DD to match the rest of the
# string column (was "1994-10-1").
# "America" is normalised to 'United States' by the later renaming cell.
df.loc[150, 'date_parsed'] = "1994-10-01"
df.loc[150, 'month'] = "October"
df.loc[150, 'name_of_holiday'] = "Independence Day"
df.loc[150, 'date_of_holiday'] = "October 1"
df.loc[150, 'independence_from'] = "America"

In [65]:
# Panama (row labels 152 and 153): set the holiday name on both rows.
df.at[152, 'name_of_holiday'] = "National Day"
df.at[153, 'name_of_holiday'] = "National Day"

In [66]:
# Papua New Guinea (row label 154): set the holiday name and the former ruling power.
df.at[154, 'name_of_holiday'] = "Independence Day"
df.at[154, 'independence_from'] = "Australia"

In [67]:
# Peru (row label 156): set the former ruling power.
df.at[156, 'independence_from'] = "Spain"

In [68]:
# Russia (row label 163): set the date, month, holiday and former-power fields.
russia_values = {
    'date_parsed': "1992-06-12",
    'month': "June",
    'name_of_holiday': "National Day",
    'date_of_holiday': "June 12",
    'independence_from': "None",
}
for column, value in russia_values.items():
    df.loc[163, column] = value

In [69]:
# Samoa (row label 168): set the holiday name.
df.at[168, 'name_of_holiday'] = "Independence Day"

In [70]:
# San Marino (row label 169): set the date, month, holiday and former-power
# fields. date_parsed is written zero-padded as YYYY-MM-DD to match the rest
# of the string column (was "1992-09-3").
df.loc[169, 'date_parsed'] = "1992-09-03"
df.loc[169, 'month'] = "September"
df.loc[169, 'name_of_holiday'] = "Foundation Day"
df.loc[169, 'date_of_holiday'] = "September 3"
df.loc[169, 'independence_from'] = "None"

In [71]:
# Saudi Arabia (row label 171): record that there is no former ruling power.
df.at[171, 'independence_from'] = "None"

In [72]:
# Seychelles (row label 174): set the holiday name.
df.at[174, 'name_of_holiday'] = "National Day"

In [73]:
# Slovakia (row label 177): no former ruling power; set the holiday name.
df.at[177, 'independence_from'] = "None"
df.at[177, 'name_of_holiday'] = "National Day"

In [74]:
# Spain (row label 183): set the date, month, holiday and former-power fields.
# NOTE: the year 1492 is earlier than pd.Timestamp.min, so this value can only
# be kept as text, not as a datetime64[ns] timestamp.
spain_values = {
    'date_parsed': "1492-10-12",
    'month': "October",
    'name_of_holiday': "National Day",
    'date_of_holiday': "October 12",
    'independence_from': "None",
}
for column, value in spain_values.items():
    df.loc[183, column] = value

In [75]:
# Sweden (row label 187): record that there is no former ruling power.
df.at[187, 'independence_from'] = "None"

In [76]:
# Switzerland (row label 188): record that there is no former ruling power.
df.at[188, 'independence_from'] = "None"

In [77]:
# Tanzania (row label 191): set the holiday name.
df.at[191, 'name_of_holiday'] = "Independence Day"

In [78]:
# Thailand (row label 192): set the date, month, holiday and former-power
# fields. date_parsed is written zero-padded as YYYY-MM-DD to match the rest
# of the string column (was "1960-12-5").
df.loc[192, 'date_parsed'] = "1960-12-05"
df.loc[192, 'month'] = "December"
df.loc[192, 'name_of_holiday'] = "National Day"
df.loc[192, 'date_of_holiday'] = "December 5"
df.loc[192, 'independence_from'] = "None"

In [79]:
# Togo (row label 193): set the holiday name.
df.at[193, 'name_of_holiday'] = "Independence Day"

In [80]:
# Tonga (row label 194): set the holiday name.
df.at[194, 'name_of_holiday'] = "National Day"

In [81]:
# Turkey (row label 197): set the date, month, holiday and former-power fields.
turkey_values = {
    'date_parsed': "1923-10-29",
    'month': "October",
    'name_of_holiday': "Republic Day",
    'date_of_holiday': "October 29",
    'independence_from': "None",
}
for column, value in turkey_values.items():
    df.loc[197, column] = value

In [82]:
# Turkmenistan (row label 198): set the former ruling power.
df.at[198, 'independence_from'] = "Soviet Union"

In [83]:
# Vatican City (row label 208): set the date, month, holiday and former-power fields.
vatican_values = {
    'date_parsed': "1929-03-13",
    'month': "March",
    'name_of_holiday': "National Day",
    'date_of_holiday': "March 13",
    'independence_from': "None",
}
for column, value in vatican_values.items():
    df.loc[208, column] = value

In [84]:
# Vanuatu (row label 207): set the former ruling power.
df.at[207, 'independence_from'] = "United Kingdom"

In [85]:
# Yemen (row label 211): set the holiday name.
df.at[211, 'name_of_holiday'] = "National Day"

In [86]:
# Democratic Republic of the Congo (row label 44): normalise the country name.
df.at[44, 'country'] = "Democratic Republic of the Congo"

In [87]:
# Republic of Congo (row label 45): normalise the country name.
df.at[45, 'country'] = "Republic of Congo"

In [88]:
# Japan fix (row label 97): fill in the holiday date that the earlier Japan
# cell left unset. It must agree with date_parsed "1966-02-11" set there and
# use the full "Month D" format of the rest of the column; the former value
# "Feb 12" did neither.
df.loc[97, 'date_of_holiday'] = "February 11"

Code to check that all the NaNs have been dealt with above. 

In [89]:
# Share of missing values per column, as a percentage (0.0 everywhere means
# every NaN has been filled).
missing_fraction = df.isna().mean()
nan_percentage = 100 * missing_fraction
nan_percentage

country              0.0
date_parsed          0.0
month                0.0
name_of_holiday      0.0
date_of_holiday      0.0
independence_from    0.0
dtype: float64

Converting column types 

In [90]:
# Store the repeated-label text columns as pandas categoricals to be more efficient.
for column in ('month', 'independence_from', 'name_of_holiday'):
    df[column] = df[column].astype('category')

# Verification
print(df.dtypes)

# The date columns were left as text: per the original note, they could not be
# converted because San Marino's date is too old for pandas datetimes.
# NOTE(review): the "1492-10-12" entered for Spain above is also earlier than
# pd.Timestamp.min, so it would block a datetime64[ns] conversion as well.

country                object
date_parsed            object
month                category
name_of_holiday      category
date_of_holiday        object
independence_from    category
dtype: object


Standardising the month format used throughout dataset. 

In [92]:
# Full month names map to their three-letter abbreviations, which is the
# format the raw data already uses ('Aug', 'Nov', ...).
full_month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_mapping = {full_name: full_name[:3] for full_name in full_month_names}

# Apply the mapping so hand-entered full names match the abbreviated ones.
df['month'] = df['month'].replace(month_mapping)
print(df['month'].unique())

['Aug', 'Nov', 'Jul', 'Sep', 'May', ..., 'Dec', 'Mar', 'Feb', 'Jun', 'Apr']
Length: 12
Categories (12, object): ['Apr', 'Aug', 'Dec', 'Feb', ..., 'May', 'Nov', 'Oct', 'Sep']


Tidying up names used in the 'independence from' column 

In [93]:
# Print every distinct 'independence_from' label, one per line, so that
# inconsistent spellings and footnote markers are easy to spot.
independence_labels = df['independence_from'].unique()
unique_entries = independence_labels.tolist()
for entry in unique_entries:
    print(entry)


United Kingdom
Ottoman Empire
France
Portugal
Spanish Empire
Russian Soviet Federative Socialist Republic
Soviet Union
None
Allied occupying powers
Pakistan
Nazi Germany
United Netherlands
Spain
Socialist Federal Republic of Yugoslavia
United Kingdom of Portugal, Brazil and the Algarves
Belgium
SFR Yugoslavia
Austria-Hungary
Czechoslovakia
Ethiopia
Italy
Denmark
Netherlands and Japan
Empire of Japan
Russian Soviet Federative Socialist Republic and German Empire
American Colonization Society
America
United States
Qing China[65][66]
Serbia and Montenegro
France and Spain
South Africa
Australia
Sweden
Israel
Colombia
Spanish Empire[72]
New Zealand
Malaysia
Italy and United Kingdom
Sudan
Egypt and the United Kingdom
Netherlands
Kingdom of Great Britain
United Provinces of the Rio de la Plata
Empire of Japan and France


In [95]:
# Rename some 'independence_from' labels to make the column easier to read:
# historical names, footnote markers and compound entries are each collapsed
# onto a single label. Matching is exact (regex=False) and the renames are
# applied one at a time, in the order listed.
# NOTE(review): 'Soviet Union[80]' is not among the unique values printed by
# the preceding cell, so that entry looks like a no-op - confirm.
independence_renames = {
    'Kingdom of Great Britain': 'United Kingdom',
    'Empire of Japan and France': 'Japan',
    'Spanish Empire[72]': 'Spain',
    'Italy and United Kingdom': 'Italy',
    'SFR Yugoslavia': 'Yugoslavia',
    'America': 'United States',
    'United Netherlands': 'Netherlands',
    'Spanish Empire': 'Spain',
    'Empire of Japan': 'Japan',
    'Nazi Germany': 'Germany',
    'Qing China[65][66]': 'China',
    'France and Spain': 'Spain',
    'Allied occupying powers': 'United States',
    'Netherlands and Japan': 'Netherlands',
    'Egypt and the United Kingdom': 'Egypt',
    'Serbia and Montenegro': 'Serbia',
    'American Colonization Society': 'United States',
    'United Provinces of the Rio de la Plata': 'Portugal',
    'Socialist Federal Republic of Yugoslavia': 'Yugoslavia',
    'United Kingdom of Portugal, Brazil and the Algarves': 'Portugal',
    'Russian Soviet Federative Socialist Republic and German Empire': 'Soviet Union',
    'Russian Soviet Federative Socialist Republic': 'Soviet Union',
    'Soviet Union[80]': 'Soviet Union',
}
for old_label, new_label in independence_renames.items():
    df['independence_from'] = df['independence_from'].replace(old_label, new_label, regex=False)

In [96]:
# Percentage of rows per 'independence_from' label, most frequent first.
independence_share = df['independence_from'].value_counts(normalize=True)
percentage_counts = 100 * independence_share
print(percentage_counts)

independence_from
United Kingdom     26.168224
France             12.616822
None               12.616822
Spain              11.214953
Soviet Union       10.747664
Portugal            4.205607
Ottoman Empire      3.271028
United States       2.336449
Yugoslavia          1.869159
Japan               1.869159
Netherlands         1.401869
Belgium             1.401869
Italy               1.401869
Australia           0.934579
Denmark             0.934579
Germany             0.467290
Austria-Hungary     0.467290
Malaysia            0.467290
Israel              0.467290
Pakistan            0.467290
Ethiopia            0.467290
China               0.467290
Egypt               0.467290
Serbia              0.467290
South Africa        0.467290
Czechoslovakia      0.467290
Sudan               0.467290
Sweden              0.467290
Colombia            0.467290
New Zealand         0.467290
Name: proportion, dtype: float64


The individual countries from which independence was gained were not easy to read on the bar chart. For this reason, I decided to create an additional column that aggregates the less frequent ones into a single 'Other' category.

In [97]:
# Labels listed here are grouped under 'Other' in the new
# 'aggregate_colonisers' column; every other label is copied unchanged.
countries_to_map = [
    'Pakistan', 'Germany', 'Israel', 'Czechoslovakia', 'Ethiopia', 'Denmark',
    'Serbia', 'China', 'South Africa', 'Austria-Hungary', 'Yugoslavia',
    'Italy', 'Japan', 'Netherlands', 'Sweden', 'Australia', 'Belgium',
    'Colombia', 'Egypt', 'Sudan', 'New Zealand', 'Malaysia',
]


def _aggregate_coloniser(name):
    """Return 'Other' for a label listed in countries_to_map, else the label itself."""
    if name in countries_to_map:
        return 'Other'
    return name


df['aggregate_colonisers'] = df['independence_from'].apply(_aggregate_coloniser)

In [98]:
# Quick check: percentage of rows per aggregated label.
aggregate_share = df['aggregate_colonisers'].value_counts(normalize=True)
percentage_counts = 100 * aggregate_share
print(percentage_counts)

aggregate_colonisers
United Kingdom    26.168224
Other             16.822430
France            12.616822
None              12.616822
Spain             11.214953
Soviet Union      10.747664
Portugal           4.205607
Ottoman Empire     3.271028
United States      2.336449
Name: proportion, dtype: float64


Fixing the holiday names to attempt more standardisation

In [100]:
# Shorten holiday names that carry a translation, an alternative name or a
# footnote marker down to a single name. Matching is exact (regex=False) and
# the renames are applied one at a time, in the order listed.
holiday_renames = {
    'Independence Day (Araw ng Kalayaan or Araw ng Kasarinlan)': 'Independence Day',
    'State Independence Day of Romania(Ziua Independenței de Stat a României)': 'State Independence Day',
    'National Day or Bastille Day': 'Bastille Day',
    'Grito de DoloresDía de la Independéncia': 'Día de la Independéncia',
    'Independence Day or Fourth of July': 'Fourth of July',
    'Flag Day (Dita e Flamurit)': 'Flag Day',
    'Independence Day (Święto Niepodległości)': 'Independence Day',
    'Day of Hungarian Freedom (A magyar szabadság napja)': 'Day of Hungarian Freedom',
    'Afghan Independence Day (Afghan Victory Day)': 'Afghan Victory Day',
    'Independence DayDia da Independência Nacional': 'Dia da Independência Nacional',
    'Commencement of the Wars of Independence[46]': 'Commencement of the Wars of Independence',
    'Independence Day (Dia da Independência)': 'Dia da Independência',
    'Independence Day (Hari Kemerdekaan)': 'Hari Kemerdekaan',
    'Independence Day (Youm-e-Azadi)': 'Youm-e-Azadi',
    'Independence Day (part of Fiestas Patrias)': 'Fiestas Patrias',
}
for old_name, new_name in holiday_renames.items():
    df['name_of_holiday'] = df['name_of_holiday'].replace(old_name, new_name, regex=False)

In [101]:
# Percentage of rows per holiday name (the original comment said
# 'independence_from', but the column inspected here is 'name_of_holiday').
holiday_share = df['name_of_holiday'].value_counts(normalize=True)
percentage_counts = 100 * holiday_share
print(percentage_counts)

name_of_holiday
Independence Day                              53.271028
National Day                                  16.355140
Republic Day                                   4.672897
Constitution Day                               1.401869
Statehood Day                                  0.934579
Fiestas Patrias                                0.934579
Proclamation of Independence Day               0.934579
Foundation Day                                 0.934579
Afghan Victory Day                             0.467290
King's Day                                     0.467290
Liberation Day                                 0.467290
Malawi Day                                     0.467290
Mauritius Day                                  0.467290
Bastille Day                                   0.467290
Not a holiday                                  0.467290
Proclamation Day of the Republic of Latvia     0.467290
Restoration Day                                0.467290
Islamic Republic Day            

Another check of data 

In [102]:
# Display the first rows again to eyeball the cleaned and newly added columns.
df.head()

Unnamed: 0,country,date_parsed,month,name_of_holiday,date_of_holiday,independence_from,aggregate_colonisers
0,Afghanistan,1919-08-19,Aug,Afghan Victory Day,August 19,United Kingdom,United Kingdom
1,Albania,1912-11-28,Nov,Flag Day,November 28,Ottoman Empire,Ottoman Empire
2,Algeria,1962-07-05,Jul,Independence Day,July 5,France,France
3,Andorra,1993-09-08,Sep,National Day,September 8,France,France
4,Angola,1975-11-11,Nov,Independence Day,November 11,Portugal,Portugal


Adding additional rows to the dataset which are present in the GeoJSON. 

In [103]:
# Antarctica: extra row for a territory that is present in the GeoJSON.
# date_parsed is written zero-padded as YYYY-MM-DD to match the rest of the
# string column (was "1959-12-1").
antarctica_row = {
    'country': 'Antarctica',
    'date_parsed': "1959-12-01",
    'name_of_holiday': "Antarctica Day",
    'date_of_holiday': "December 1",
    'independence_from': "None",
    'month': "Dec",
    'aggregate_colonisers': "None",
}

# Append as a one-row frame; ignore_index renumbers the combined index.
df = pd.concat([df, pd.DataFrame([antarctica_row])], ignore_index=True)

In [104]:
# United Kingdom: extra row for a country that is present in the GeoJSON.
uk_row = {
    'country': 'United Kingdom',
    'date_parsed': "1948-06-25",
    'month': "Jun",
    'name_of_holiday': "King's Birthday",
    'date_of_holiday': "June 25",
    'independence_from': "None",
    'aggregate_colonisers': "None",
}

# Append as a one-row frame; ignore_index renumbers the combined index.
uk_frame = pd.DataFrame([uk_row])
df = pd.concat([df, uk_frame], ignore_index=True)

In [106]:
# Western Sahara: extra row for a territory that is present in the GeoJSON.
sahara_row = {
    'country': 'Western Sahara',
    'date_parsed': "1976-02-27",
    'month': "Feb",
    'name_of_holiday': "Liberation Day",
    'date_of_holiday': "February 27",
    'independence_from': "Spain",
    'aggregate_colonisers': "Spain",
}

# Append as a one-row frame; ignore_index renumbers the combined index.
sahara_frame = pd.DataFrame([sahara_row])
df = pd.concat([df, sahara_frame], ignore_index=True)

In [107]:
# Ivory Coast: extra row under the country name used in the GeoJSON.
ivorycoast_row = {
    'country': 'Ivory Coast',
    'date_parsed': "1960-08-07",
    'month': "Aug",
    'name_of_holiday': "Independence Day",
    'date_of_holiday': "August 7",
    'independence_from': "France",
    'aggregate_colonisers': "France",
}

# Append as a one-row frame; ignore_index renumbers the combined index.
ivorycoast_frame = pd.DataFrame([ivorycoast_row])
df = pd.concat([df, ivorycoast_frame], ignore_index=True)

In [108]:
# French Guiana: extra row for a territory that is present in the GeoJSON.
# date_parsed previously read "1960-08-07" - the Ivory Coast date from the
# cell above - which contradicted the "July 14" / "Jul" values of this row.
# NOTE(review): 1789-07-14 assumes the row should carry the Bastille Day
# date itself; confirm the intended year.
french_row = {
    'country': 'French Guiana',
    'date_parsed': "1789-07-14",
    'name_of_holiday': "Bastille Day",
    'date_of_holiday': "July 14",
    'independence_from': "None",
    'month': "Jul",
    'aggregate_colonisers': "None",
}

# Append as a one-row frame; ignore_index renumbers the combined index.
df = pd.concat([df, pd.DataFrame([french_row])], ignore_index=True)

In [109]:
# Syrian Arab Republic: extra row under the country name used in the GeoJSON.
sra_row = {
    'country': 'Syrian Arab Republic',
    'date_parsed': "1946-04-17",
    'month': "Apr",
    'name_of_holiday': "Evacuation Day",
    'date_of_holiday': "April 17",
    'independence_from': "France",
    'aggregate_colonisers': "France",
}

# Append as a one-row frame; ignore_index renumbers the combined index.
sra_frame = pd.DataFrame([sra_row])
df = pd.concat([df, sra_frame], ignore_index=True)

In [110]:
# Macedonia: extra row under the country name used in the GeoJSON.
# date_of_holiday previously read "September 9", which contradicted
# date_parsed "1991-09-08"; the two fields now agree on 8 September.
# 'Other' is the aggregate label because 'Yugoslavia' is in countries_to_map.
macedonia_row = {
    'country': 'Macedonia',
    'date_parsed': "1991-09-08",
    'name_of_holiday': "Independence Day",
    'date_of_holiday': "September 8",
    'independence_from': "Yugoslavia",
    'month': "Sep",
    'aggregate_colonisers': "Other",
}

# Append as a one-row frame; ignore_index renumbers the combined index.
df = pd.concat([df, pd.DataFrame([macedonia_row])], ignore_index=True)

In [111]:
# Taiwan: extra row for a territory that is present in the GeoJSON.
# 'Other' is the aggregate label because 'China' is in countries_to_map.
taiwan_row = {
    'country': 'Taiwan',
    'date_parsed': "1911-10-10",
    'month': "Oct",
    'name_of_holiday': "Double Ten Day",
    'date_of_holiday': "October 10",
    'independence_from': "China",
    'aggregate_colonisers': "Other",
}

# Append as a one-row frame; ignore_index renumbers the combined index.
taiwan_frame = pd.DataFrame([taiwan_row])
df = pd.concat([df, taiwan_frame], ignore_index=True)

Adding a 'Season' column for visual display 

In [112]:
# Months grouped by season; inverted below into a month -> season lookup.
season_months = {
    'Winter': ('Dec', 'Jan', 'Feb'),
    'Spring': ('Mar', 'Apr', 'May'),
    'Summer': ('Jun', 'Jul', 'Aug'),
    'Autumn': ('Sep', 'Oct', 'Nov'),
}
month_to_season = {
    month: season
    for season, months in season_months.items()
    for month in months
}

# Any month label missing from the lookup would come out as NaN.
df['season'] = df['month'].map(month_to_season)
df.head()

Unnamed: 0,country,date_parsed,month,name_of_holiday,date_of_holiday,independence_from,aggregate_colonisers,season
0,Afghanistan,1919-08-19,Aug,Afghan Victory Day,August 19,United Kingdom,United Kingdom,Summer
1,Albania,1912-11-28,Nov,Flag Day,November 28,Ottoman Empire,Ottoman Empire,Autumn
2,Algeria,1962-07-05,Jul,Independence Day,July 5,France,France,Summer
3,Andorra,1993-09-08,Sep,National Day,September 8,France,France,Autumn
4,Angola,1975-11-11,Nov,Independence Day,November 11,Portugal,Portugal,Autumn


Saving cleaned datafile 

In [114]:
# Write the cleaned table next to the notebook. The row index is written as
# well (the to_csv default, made explicit here), so the file starts with an
# unnamed index column.
df.to_csv('cleaned_holidays.csv', index=True)