In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import re

# Load the raw holiday table. The previous dtype={'column_name': 'str'} argument was a
# template placeholder: no column has that name, so it had no effect and is removed.
df = pd.read_csv("holidays.csv")

In [2]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,country,date_parsed,weekday,day,month,name_of_holiday,date_of_holiday,year_of_event,independence_from,event_commemorated_and_notes,year,date_mdy
0,Afghanistan,1919-08-19,Tuesday,19.0,Aug,Afghan Independence Day (Afghan Victory Day),August 19,1919.0,United Kingdom,Anglo-Afghan Treaty of 1919 or Treaty of Rawal...,1919.0,"August 19, 1919"
1,Albania,1912-11-28,Thursday,28.0,Nov,Flag Day (Dita e Flamurit),November 28,1912.0,Ottoman Empire,Albanian Declaration of Independence. The foll...,1912.0,"November 28, 1912"
2,Algeria,1962-07-05,Thursday,5.0,Jul,Independence Day,July 5,1962.0,France,Algeria gained independence following the Alge...,1962.0,"July 5, 1962"
3,Andorra,,,,,,,,,,,
4,Angola,1975-11-11,Tuesday,11.0,Nov,Independence Day,November 11,1975.0,Portugal,"The Alvor Agreement, signed on 15 January 1975...",1975.0,"November 11, 1975"


In [3]:
# Column names, non-null counts and dtypes: shows which columns contain gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       216 non-null    object 
 1   date_parsed                   189 non-null    object 
 2   weekday                       189 non-null    object 
 3   day                           189 non-null    float64
 4   month                         189 non-null    object 
 5   name_of_holiday               150 non-null    object 
 6   date_of_holiday               189 non-null    object 
 7   year_of_event                 174 non-null    float64
 8   independence_from             181 non-null    object 
 9   event_commemorated_and_notes  145 non-null    object 
 10  year                          189 non-null    float64
 11  date_mdy                      189 non-null    object 
dtypes: float64(3), object(9)
memory usage: 20.4+ KB


The dataset has more columns than needed and several of them repeat the same information, so the following columns are dropped for simplicity.

In [4]:
# Discard the columns listed below, then preview the last rows of the slimmer frame.
redundant_columns = ['weekday', 'day', 'date_mdy', 'year_of_event']
df = df.drop(columns=redundant_columns)

df.tail()

Unnamed: 0,country,date_parsed,month,name_of_holiday,date_of_holiday,independence_from,event_commemorated_and_notes,year
211,Venezuela,1811-07-05,Jul,Independence Day,July 5,Spanish Empire,Venezuelan Declaration of Independence,1811.0
212,Vietnam,1945-09-02,Sep,Independence Day,September 2,Empire of Japan and France,Proclamation of Independence of the Democratic...,1945.0
213,Yemen,1967-11-30,Nov,,November 30,United Kingdom,Declaration of independence as South Yemen,1967.0
214,Zambia,1964-10-24,Oct,Independence Day,October 24,United Kingdom,Effective date of the Zambia Independence Act ...,1964.0
215,Zimbabwe,1980-04-18,Apr,Independence Day,April 18,United Kingdom,Effective date of the Lancaster House Agreement,1980.0


In [5]:
# China is listed twice; discard the copy at label 42 and renumber the rows from 0.
# Every hard-coded row label used later in the notebook refers to this renumbered index.
df = df.drop(index=42).reset_index(drop=True)

I then went through each row and checked it for completeness. Wherever I found NaNs, I researched the country and filled in the missing data. Only the code cells that actually altered the data have been kept.

In [6]:
# Remove bracketed numeric markers such as "[12]" from the notes column, then show a sample.
notes_column = 'event_commemorated_and_notes'
citation_marker = r'\[\d+\]'
df[notes_column] = df[notes_column].str.replace(citation_marker, '', regex=True)
print(df[[notes_column]].head())

                        event_commemorated_and_notes
0  Anglo-Afghan Treaty of 1919 or Treaty of Rawal...
1  Albanian Declaration of Independence. The foll...
2  Algeria gained independence following the Alge...
3                                                NaN
4  The Alvor Agreement, signed on 15 January 1975...


In [7]:
# Andorra (row 3): the record arrives empty from the CSV, so every field is supplied here.
andorra_record = {
    'country': 'Andorra',
    'date_parsed': '1993-09-08',
    'month': 'September',
    'name_of_holiday': 'National Day',
    'date_of_holiday': 'September 8',
    'independence_from': 'France',
    'event_commemorated_and_notes': 'Andorra commemorate their patron saint (Lady of Meritxell) and their establishment on this day.',
    'year': 1993,
}
for column, value in andorra_record.items():
    df.loc[3, column] = value

In [8]:
# Australia (row 9): supply the full record. 'None' is stored as text to mean
# "no former ruling power", so it is not counted as a missing value later on.
australia_record = {
    'country': 'Australia',
    'date_parsed': '1808-01-26',
    'month': 'January',
    'name_of_holiday': 'Australia Day',
    'date_of_holiday': 'January 26',
    'independence_from': 'None',
    'event_commemorated_and_notes': 'Australia Day marks the landing of the first fleet of ships from Britain in 1788.',
    'year': 1808,
}
for column, value in australia_record.items():
    df.loc[9, column] = value

In [9]:
# Austria (row 10): set the former power and the event description.
df.loc[10, 'independence_from'] = 'Allied occupying powers'  
df.loc[10, 'event_commemorated_and_notes'] = 'Commemorates the day that Austria signed a declaration of neutrality post World War 2.' 

In [10]:
# The Bahamas (row 13): set the country name and the event description.
df.loc[13, 'country'] = 'The Bahamas'  
df.loc[13, 'event_commemorated_and_notes'] = 'Commemorates the day that the Bahamas became a fully independent nation.' 

In [11]:
# Belarus (row 17): set the event description.
df.loc[17, 'event_commemorated_and_notes'] = 'Remembers the liberation of Minsk after several years of German occupation in 1944.' 

In [12]:
# Bhutan (row 21): supply the full record; 'None' (text) means no former ruling power.
bhutan_record = {
    'country': 'Bhutan',
    'date_parsed': '1907-12-17',
    'month': 'December',
    'name_of_holiday': 'National Day',
    'date_of_holiday': 'December 17',
    'independence_from': 'None',
    'event_commemorated_and_notes': 'Commemorates the coronation of the first king of Bhutan in 1907.',
    'year': 1907,
}
for column, value in bhutan_record.items():
    df.loc[21, column] = value

In [13]:
# Brazil (row 25): set the event description.
df.loc[25, 'event_commemorated_and_notes'] = 'Declaration of independence by Pedro I of Brazil'

In [14]:
# Brunei (row 26): set the event description.
df.loc[26, 'event_commemorated_and_notes'] = 'Brunei gained independence on January 1, 1984 but the National Day celebrations were delayed until February 23.'

In [15]:
# Cameroon (row 33): supply the date, holiday and background fields.
cameroon_record = {
    'date_parsed': '1972-05-20',
    'month': 'May',
    'name_of_holiday': 'National Day',
    'date_of_holiday': 'May 20',
    'independence_from': 'France',
    'event_commemorated_and_notes': 'Cameroon celebrates the day of their 1972 constitutional referendum.',
    'year': 1972,
}
for column, value in cameroon_record.items():
    df.loc[33, column] = value

In [16]:
# Canada (row 34): supply the date, holiday and background fields.
# date_parsed previously read '1867-01-1' (January, not zero-padded), contradicting the
# 'July' / 'July 1' values below; it is now the matching ISO date in YYYY-MM-DD form.
df.loc[34, 'date_parsed'] = '1867-07-01'
df.loc[34, 'month'] = 'July'
df.loc[34, 'name_of_holiday'] = 'Canada Day'
df.loc[34, 'date_of_holiday'] = 'July 1'
# 'None' is stored as text to mean "no former ruling power".
df.loc[34, 'independence_from'] = 'None'
df.loc[34, 'event_commemorated_and_notes'] = 'Commemorates the establishment of the Dominion of Canada, uniting it into one nation.'
df.loc[34, 'year'] = 1867

In [17]:
# China (row 41): supply the date, holiday and background fields.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.
df.loc[41, 'date_parsed'] = '1949-10-01'
df.loc[41, 'month'] = 'October'
df.loc[41, 'name_of_holiday'] = 'China Day'
df.loc[41, 'date_of_holiday'] = 'October 1'
# 'None' is stored as text to mean "no former ruling power".
df.loc[41, 'independence_from'] = 'None'
df.loc[41, 'event_commemorated_and_notes'] = "Celebrates Mao's formal proclamation of the establishment of the nation."
df.loc[41, 'year'] = 1949

In [18]:
# The Republic of Congo is listed twice; discard the copy at label 46 and renumber.
# Rows 0-45 keep their labels; every later row moves up by one.
df = df.drop(index=46).reset_index(drop=True)

In [19]:
# China (row 41) again. This cell was labelled "Denmark" but writes the China values:
# dropping label 46 and renumbering leaves rows 0-45 untouched, so row 41 is still China.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.
df.loc[41, 'date_parsed'] = '1949-10-01'
df.loc[41, 'month'] = 'October'
df.loc[41, 'name_of_holiday'] = 'China Day'
df.loc[41, 'date_of_holiday'] = 'October 1'
df.loc[41, 'independence_from'] = 'None'
df.loc[41, 'event_commemorated_and_notes'] = "Celebrates Mao's formal proclamation of the establishment of the nation."
df.loc[41, 'year'] = 1949

In [20]:
# Denmark (row 53): supply the date, holiday and background fields.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.
df.loc[53, 'date_parsed'] = '1953-06-05'
df.loc[53, 'month'] = 'June'
df.loc[53, 'name_of_holiday'] = 'Constitution Day'
df.loc[53, 'date_of_holiday'] = 'June 5'
# 'None' is stored as text to mean "no former ruling power".
df.loc[53, 'independence_from'] = 'None'
df.loc[53, 'event_commemorated_and_notes'] = "This day honours the constitution of Denmark and is not a traditional National Day."
df.loc[53, 'year'] = 1953

In [21]:
# Djibouti (row 54): set the event description.

df.loc[54, 'event_commemorated_and_notes'] = "Marks the day of declaration of independence from France." 

In [22]:
# Dominica (row 55): set the event description.

df.loc[55, 'event_commemorated_and_notes'] = "Marks the day of independence from the UK and also recognition from the USA." 

In [23]:
# Dominican Republic (row 56): set the holiday name.

df.loc[56, 'name_of_holiday'] = "Independence Day" 

In [24]:
# East Timor (row 57): set the holiday name.

df.loc[57, 'name_of_holiday'] = "Independence Day" 

In [25]:
# Ecuador (row 59): set the holiday name.

df.loc[59, 'name_of_holiday'] = "Independence Day" 

In [26]:
# Egypt (row 60): supply the date, holiday and background fields;
# 'None' (text) means no former ruling power.
egypt_record = {
    'date_parsed': "1952-07-23",
    'month': "July",
    'name_of_holiday': "Revolution Day",
    'date_of_holiday': "July 23",
    'independence_from': "None",
    'event_commemorated_and_notes': "Anniversary of the Egyptian Revolution, which led to the modern republic of Egypt.",
    'year': 1952,
}
for column, value in egypt_record.items():
    df.loc[60, column] = value

In [27]:
# EG (row 62) - presumably Equatorial Guinea, given the note about Spanish colonisation;
# confirm against the 'country' column. Sets the holiday name and event description.

df.loc[62, 'name_of_holiday'] = "Independence Day" 
df.loc[62, 'event_commemorated_and_notes'] = 'Commemorates freedom from Spanish colonisation.'

In [28]:
# Eritrea (row 63): set the event description.
df.loc[63, 'event_commemorated_and_notes'] = 'Commemorates the Peoples Liberation Front moving into the capital and reinstating independence.'

In [29]:
# Estonia (row 65): set the holiday name and the former power.
df.loc[65, 'name_of_holiday'] = 'Day of Restoration of Independence'
df.loc[65, 'independence_from'] = 'Soviet Union'

In [30]:
# Ethiopia (row 67): supply the date, holiday and background fields.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.

df.loc[67, 'date_parsed'] = "1941-05-05"
df.loc[67, 'month'] = "May"
df.loc[67, 'name_of_holiday'] = "Independence Day"
df.loc[67, 'date_of_holiday'] = "May 5"
df.loc[67, 'independence_from'] = "Italy"
df.loc[67, 'event_commemorated_and_notes'] = "Celebration of regaining independence from Italy, following invasion by Mussolini."
df.loc[67, 'year'] = 1941

In [31]:
# Gabon (row 71): set the holiday name.
df.loc[71, 'name_of_holiday'] = "Independence Day" 

In [32]:
# Gambia (row 72): set the holiday name.
df.loc[72, 'name_of_holiday'] = "National Day" 

In [33]:
# Georgia (row 74): set the event description.
df.loc[74, 'event_commemorated_and_notes'] = 'Remembers the establishment of Georgia after the Russian Revolution.'

In [34]:
# Germany (row 75): supply the date and holiday fields.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.

df.loc[75, 'date_parsed'] = "1990-10-03"
df.loc[75, 'month'] = "October"
df.loc[75, 'name_of_holiday'] = "Unity Day"
df.loc[75, 'date_of_holiday'] = "October 3"
# 'None' is stored as text to mean "no former ruling power".
df.loc[75, 'independence_from'] = "None"
df.loc[75, 'year'] = 1990

In [36]:
# France (row 70): record that there is no former ruling power ('None' stored as text).

df.loc[70, 'independence_from'] = "None" 

In [37]:
# Ghana (row 76): set the event description.

df.loc[76, 'event_commemorated_and_notes'] = "Day to honour the people of the diaspora and within the country who led Ghana to independence" 

In [41]:
# Greece (row 77): set the holiday name.

df.loc[77, 'name_of_holiday'] = "Independence Day" 

In [43]:
# Grenada (row 78): set the event description.

df.loc[78, 'event_commemorated_and_notes'] = "Day to commemorate Grenadan independence from the United Kingdom." 

In [66]:
# Guinea (row 80): set the holiday name and the event description.

df.loc[80, 'name_of_holiday'] = "Independence Day" 
df.loc[80, 'event_commemorated_and_notes'] = "Marks achieving full independence from the French colonial empire." 

In [54]:
# Guyana (row 82): set the holiday name and the event description.

df.loc[82, 'name_of_holiday'] = "Independence Day" 
df.loc[82, 'event_commemorated_and_notes'] = "Remembers Guyana gaining independence, despite British resistance." 

In [61]:
# Hungary (row 85): set the former power and the holiday date.
# NOTE(review): only 'date_of_holiday' is written here; confirm that 'date_parsed' and
# 'month' for this row already agree with June 19.

df.loc[85, 'independence_from'] = "Soviet Union" 
df.loc[85, 'date_of_holiday'] = "June 19" 

In [70]:
# India (row 87): set the former power and the event description.
# The description previously read "adoptation"; the stored text now says "adoption".

df.loc[87, 'independence_from'] = "United Kingdom"
df.loc[87, 'event_commemorated_and_notes'] = "Marks the adoption of the Indian Constitution."

In [80]:
# Iran (row 90): no former ruling power ('None' stored as text); set the holiday name.

df.loc[90, 'independence_from'] = "None" 
df.loc[90, 'name_of_holiday'] = "Islamic Republic Day" 

In [81]:
# Iraq (row 91): set the event description and the holiday name.

df.loc[91, 'event_commemorated_and_notes'] = "Celebrates gaining independence from British rule." 
df.loc[91, 'name_of_holiday'] = "National Day" 

In [85]:
# Ireland (row 92): set the former power and the holiday name.

df.loc[92, 'independence_from'] = "United Kingdom" 
df.loc[92, 'name_of_holiday'] = "Republic Day" 

In [87]:
# Israel (row 93): set the former power.

df.loc[93, 'independence_from'] = "United Kingdom" 

In [97]:
# Italy (row 94): supply the date, holiday and background fields.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.

df.loc[94, 'date_parsed'] = "1947-06-02"
df.loc[94, 'month'] = "June"
df.loc[94, 'name_of_holiday'] = "Republic Day"
df.loc[94, 'date_of_holiday'] = "June 2"
df.loc[94, 'event_commemorated_and_notes'] = "Commemorates the day of the Italian constitutional referendum after World War 2."
# 'None' is stored as text to mean "no former ruling power".
df.loc[94, 'independence_from'] = "None"
df.loc[94, 'year'] = 1947

In [98]:
# Ivory Coast (row 95): set the holiday name.

df.loc[95, 'name_of_holiday'] = "Independence Day" 

In [101]:
# Jamaica (row 96): set the holiday name.

df.loc[96, 'name_of_holiday'] = "Independence Day" 

In [106]:
# Japan (row 97): supply the date, holiday and background fields;
# 'None' (text) means no former ruling power.
japan_record = {
    'date_parsed': "1966-02-11",
    'month': "February",
    'name_of_holiday': "Foundation Day",
    'date_of_holiday': "February 11",
    'event_commemorated_and_notes': "Celebrates the foundation of Japan and the ascension of the legendary first emperor.",
    'independence_from': "None",
    'year': 1966,
}
for column, value in japan_record.items():
    df.loc[97, column] = value

In [111]:
# Kazakhstan (row 99): set the event description.

df.loc[99, 'event_commemorated_and_notes'] = "Celebrates the day of declaration of state sovereignty."

In [114]:
# Kenya (row 100): set the event description.

df.loc[100, 'event_commemorated_and_notes'] = "Official day of Kenya self-ruling, six months after independence was granted."

In [118]:
# Kiribati (row 101): set the holiday name and the event description.

df.loc[101, 'name_of_holiday'] = "National Day"
df.loc[101, 'event_commemorated_and_notes'] = "Celebrates the official day of independence from the United Kingdom."

In [121]:
# Kuwait (row 105): set the holiday name and the event description.

df.loc[105, 'name_of_holiday'] = "Independence Day"
df.loc[105, 'event_commemorated_and_notes'] = "Commemorates the ascension of Sheikh Abdullah to rule."

In [125]:
# Kyrgyzstan (row 106): set the event description.

df.loc[106, 'event_commemorated_and_notes'] = "Commemorates the anniversary of the declaration of independence."

In [128]:
# Laos (row 107): supply the date, holiday and background fields.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.

df.loc[107, 'date_parsed'] = "1975-12-02"
df.loc[107, 'month'] = "December"
df.loc[107, 'name_of_holiday'] = "National Day"
df.loc[107, 'date_of_holiday'] = "December 2"
df.loc[107, 'event_commemorated_and_notes'] = "Marks the end of the monarchy and the establishment of a republic."
# 'None' is stored as text to mean "no former ruling power".
df.loc[107, 'independence_from'] = "None"
df.loc[107, 'year'] = 1975

In [132]:
# Latvia (row 109): set the holiday name.

df.loc[109, 'name_of_holiday'] = "Independence Day" 

In [136]:
# Lebanon (row 110): set the event description.

df.loc[110, 'event_commemorated_and_notes'] = "Remembers the end of French control after 23 years of rule."

In [140]:
# Lesotho (row 111): set the holiday name and the event description.

df.loc[111, 'name_of_holiday'] = "Independence Day" 
df.loc[111, 'event_commemorated_and_notes'] = "Marks the day the nation became a sovereign state after British control."

In [145]:
# Libya (row 113): set the holiday name.

df.loc[113, 'name_of_holiday'] = "Independence Day" 

In [148]:
# Liechtenstein (row 114): supply the date, holiday and background fields;
# 'None' (text) means no former ruling power.
liechtenstein_record = {
    'date_parsed': "1940-08-15",
    'month': "August",
    'name_of_holiday': "National Day",
    'date_of_holiday': "August 15",
    'event_commemorated_and_notes': "Marks both the Assumption of Mary and the birth of Prince Franz Josef 2.",
    'independence_from': "None",
    'year': 1940,
}
for column, value in liechtenstein_record.items():
    df.loc[114, column] = value

In [151]:
# Luxembourg (row 117): supply the date, holiday and background fields;
# 'None' (text) means no former ruling power.
luxembourg_record = {
    'date_parsed': "1961-06-23",
    'month': "June",
    'name_of_holiday': "National Day",
    'date_of_holiday': "June 23",
    'event_commemorated_and_notes': "Marks the Grand Duke's official birthday.",
    'independence_from': "None",
    'year': 1961,
}
for column, value in luxembourg_record.items():
    df.loc[117, column] = value

In [156]:
# Madagascar (row 118): set the holiday name and the event description.

df.loc[118, 'name_of_holiday'] = "National Day" 
df.loc[118, 'event_commemorated_and_notes'] = "Day of celebrating independence obtained after French rule."

In [159]:
# Malawi (row 119): set the holiday name and the event description.
# The description previously began "Commmemorates" (three m's); the stored text is corrected.

df.loc[119, 'name_of_holiday'] = "Independence Day"
df.loc[119, 'event_commemorated_and_notes'] = "Commemorates their date of independence from British rule."

In [162]:
# Maldives (row 121): set the holiday name and the event description.

df.loc[121, 'name_of_holiday'] = "National Day" 
df.loc[121, 'event_commemorated_and_notes'] = "Marks the victory of Mohamed Thakurufaanu over the Portuguese occupation."

In [165]:
# Mali (row 122): set the holiday name and the event description.

df.loc[122, 'name_of_holiday'] = "Republic Day" 
df.loc[122, 'event_commemorated_and_notes'] = "Celebrates the independence of Mali."

In [168]:
# Malta (row 123): set the event description.

df.loc[123, 'event_commemorated_and_notes'] = "Remembers Malta achieving independence from Britain after 150 years."

In [171]:
# Marshall Islands (row 124): supply the date, holiday and background fields.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.

df.loc[124, 'date_parsed'] = "1979-05-01"
df.loc[124, 'month'] = "May"
df.loc[124, 'name_of_holiday'] = "Constitution Day"
df.loc[124, 'date_of_holiday'] = "May 1"
df.loc[124, 'event_commemorated_and_notes'] = "Day that USA recognised the independence of the Marshall Islands."
# 'America' is folded into 'United States' by the later independence_from clean-up cell.
df.loc[124, 'independence_from'] = "America"
df.loc[124, 'year'] = 1979

In [175]:
# Mauritania (row 125): set the holiday name.
 
df.loc[125, 'name_of_holiday'] = "Independence Day" 

In [178]:
# Mauritius (row 126) - the original comment said "Mauritania", but the holiday name
# written below is "Mauritius Day"; confirm against the 'country' column.
# Sets the holiday name and the event description.
 
df.loc[126, 'name_of_holiday'] = "Mauritius Day" 
df.loc[126, 'event_commemorated_and_notes'] = "Celebrates independence from Britain and the establishment of a republic."

In [182]:
# Monaco (row 130): supply the date, holiday and background fields;
# 'None' (text) means no former ruling power.
monaco_record = {
    'date_parsed': "1979-11-19",
    'month': "November",
    'name_of_holiday': "National Day",
    'date_of_holiday': "November 19",
    'event_commemorated_and_notes': "Is determined by the current reigning prince.",
    'independence_from': "None",
    'year': 1979,
}
for column, value in monaco_record.items():
    df.loc[130, column] = value

In [186]:
# Montenegro (row 132): set the holiday name.
 
df.loc[132, 'name_of_holiday'] = "Independence Day" 

In [189]:
# Morocco (row 133): set the former power and the event description.
# Only Spain is recorded as the former power although the note mentions both France and Spain.

df.loc[133, 'independence_from'] = "Spain"
df.loc[133, 'event_commemorated_and_notes'] = "Celebrates recovery of independence and end of French and Spanish rule." 

In [192]:
# Namibia (row 137): set the holiday name and the event description.

df.loc[137, 'name_of_holiday'] = "Independence Day"
df.loc[137, 'event_commemorated_and_notes'] = "Celebrates the founding of Republic of Namibia from South African rule." 

In [195]:
# Nauru (row 138): set the holiday name, the former power and the event description.

df.loc[138, 'name_of_holiday'] = "Independence Day"
df.loc[138, 'independence_from'] = "Australia"
df.loc[138, 'event_commemorated_and_notes'] = "Celebrates the establishment of a constitution after two years of self-governance." 

In [198]:
# Nepal (row 139): supply the date, holiday and background fields;
# 'None' (text) means no former ruling power.
nepal_record = {
    'date_parsed': "2008-05-28",
    'month': "May",
    'name_of_holiday': "Republic Day",
    'date_of_holiday': "May 28",
    'event_commemorated_and_notes': "Date of the first meeting of the constituent assembly of Nepal.",
    'independence_from': "None",
    'year': 2008,
}
for column, value in nepal_record.items():
    df.loc[139, column] = value

In [325]:
# Netherlands (row 140): set the country name, the holiday name and the former power.

df.loc[140, 'country'] = "Netherlands" 
df.loc[140, 'name_of_holiday'] = "King's Day" 
df.loc[140, 'independence_from'] = "Spain" 

In [205]:
# New Zealand (row 141): supply the date, holiday and background fields.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.

df.loc[141, 'date_parsed'] = "1974-02-06"
df.loc[141, 'month'] = "February"
df.loc[141, 'name_of_holiday'] = "Waitangi Day"
df.loc[141, 'date_of_holiday'] = "February 6"
df.loc[141, 'event_commemorated_and_notes'] = "This marks the date the British sovereignty over the islands began."
# 'None' is stored as text to mean "no former ruling power".
df.loc[141, 'independence_from'] = "None"
df.loc[141, 'year'] = 1974

In [209]:
# Nigeria (row 144): set the event description.

df.loc[144, 'event_commemorated_and_notes'] = "Anniversary of declaration of independence from British rule."

In [212]:
# North Macedonia (row 145): set the holiday name and the event description.

df.loc[145, 'name_of_holiday'] = "Independence Day" 
df.loc[145, 'event_commemorated_and_notes'] = "After a referendum, they gained independence from Yugoslavia and became a sovereign democracy."

In [216]:
# Oman (row 148): set the event description.

df.loc[148, 'event_commemorated_and_notes'] = "Celebrates the birthday of Sultan Qaboos bin Said, who reigned until 2020."

In [219]:
# Palau (row 150): supply the date, holiday and former-power fields.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.

df.loc[150, 'date_parsed'] = "1994-10-01"
df.loc[150, 'month'] = "October"
df.loc[150, 'name_of_holiday'] = "Independence Day"
df.loc[150, 'date_of_holiday'] = "October 1"
# 'America' is folded into 'United States' by the later independence_from clean-up cell.
df.loc[150, 'independence_from'] = "America"
df.loc[150, 'year'] = 1994

In [231]:
# Panama (rows 152 and 153): give both rows the same holiday name.
# NOTE(review): presumably Panama has two entries in the list; confirm row 153 is not another country.

df.loc[152, 'name_of_holiday'] = "National Day" 
df.loc[153, 'name_of_holiday'] = "National Day" 

In [233]:
# Papua New Guinea (row 154): set the event description, the holiday name and the former power.

df.loc[154, 'event_commemorated_and_notes'] = "Day of officially terminating Australian control over the nation."  
df.loc[154, 'name_of_holiday'] = "Independence Day" 
df.loc[154, 'independence_from'] = "Australia" 

In [238]:
# Paraguay (row 155): set the event description.

df.loc[155, 'event_commemorated_and_notes'] = "Day of officially proclaiming independence from Spain."  

In [243]:
# Peru (row 156): set the former power and the event description.

df.loc[156, 'independence_from'] = "Spain" 
df.loc[156, 'event_commemorated_and_notes'] = "Commemorates the day Peruvian generals proclaimed independence." 

In [249]:
# Qatar (row 160): set the event description.

df.loc[160, 'event_commemorated_and_notes'] = "Commemorates Qatar being unified, also known as Founders Day." 

In [252]:
# Russia (row 163): supply the date, holiday and background fields;
# 'None' (text) means no former ruling power.
russia_record = {
    'date_parsed': "1992-06-12",
    'month': "June",
    'name_of_holiday': "National Day",
    'date_of_holiday': "June 12",
    'event_commemorated_and_notes': "Marks the establishment of the present day Russian Federation.",
    'independence_from': "None",
    'year': 1992,
}
for column, value in russia_record.items():
    df.loc[163, column] = value

In [256]:
# St Kitts (row 165): set the event description.

df.loc[165, 'event_commemorated_and_notes'] = "Celebrates achieving independence from Britain." 

In [260]:
# St Lucia (row 166): set the event description.

df.loc[166, 'event_commemorated_and_notes'] = "Celebrates achieving independence from Britain." 

In [263]:
# St Vincent (row 167): set the event description.

df.loc[167, 'event_commemorated_and_notes'] = "Celebrates achieving independence from Britain." 

In [274]:
# Samoa (row 168): set the holiday name and the event description.

df.loc[168, 'name_of_holiday'] = "Independence Day" 
df.loc[168, 'event_commemorated_and_notes'] = "First pacific island nation to achieve independence from colonialism." 

In [270]:
# San Marino (row 169): supply the date, holiday and background fields.
# date_parsed previously read "1992-09-3", which disagreed with year = 301 below and was
# not zero-padded. It now carries the same year as the 'year' column, in YYYY-MM-DD form.
# A year this early cannot be held in a pandas nanosecond Timestamp, which is why
# date_parsed stays a text column.

df.loc[169, 'date_parsed'] = "0301-09-03"
df.loc[169, 'month'] = "September"
df.loc[169, 'name_of_holiday'] = "Foundation Day"
df.loc[169, 'date_of_holiday'] = "September 3"
df.loc[169, 'event_commemorated_and_notes'] = "Marks the foundation of the republic, which is one of the world's oldest."
# 'None' is stored as text to mean "no former ruling power".
df.loc[169, 'independence_from'] = "None"
df.loc[169, 'year'] = 301

In [278]:
# Saudi Arabia (row 171): record that there is no former ruling power ('None' stored as text).

df.loc[171, 'independence_from'] = "None" 

In [282]:
# Senegal (row 172): set the event description.
# The stored sentence previously read "proclaimed independence from this on this day";
# the stray "from this" is removed so the text is grammatical.

df.loc[172, 'event_commemorated_and_notes'] = "National Assembly of Senegal proclaimed independence on this day."

In [289]:
# Seychelles (row 174): set the holiday name and the event description.

df.loc[174, 'name_of_holiday'] = "National Day"
df.loc[174, 'event_commemorated_and_notes'] = "Official last day of the British flag being flown in the country."

In [293]:
# Sierra Leone (row 175): set the event description.
 
df.loc[175, 'event_commemorated_and_notes'] = "Commemorates gaining independence from Britain."

In [301]:
# Slovakia (row 177): no former ruling power ('None' stored as text); set the holiday name.
 
df.loc[177, 'independence_from'] = "None"
df.loc[177, 'name_of_holiday'] = "National Day"

In [305]:
# Solomon Islands (row 179): set the event description.

df.loc[179, 'event_commemorated_and_notes'] = "Day of gaining self-governance and becoming a sovereign nation."

In [310]:
# South Sudan (row 182): set the event description.

df.loc[182, 'event_commemorated_and_notes'] = "After a referendum on the matter, South Sudan split from Sudan to become a sovereign nation."

In [314]:
# Spain (row 183): supply the date, holiday and background fields.
# The description previously spelled the name "Colombus"; the stored text now says "Columbus".
# The 1492 date is earlier than pandas' nanosecond Timestamp can represent, so date_parsed
# has to remain a text column.

df.loc[183, 'date_parsed'] = "1492-10-12"
df.loc[183, 'month'] = "October"
df.loc[183, 'name_of_holiday'] = "National Day"
df.loc[183, 'date_of_holiday'] = "October 12"
df.loc[183, 'event_commemorated_and_notes'] = "Commemorates Columbus discovering the Americas in 1492."
# 'None' is stored as text to mean "no former ruling power".
df.loc[183, 'independence_from'] = "None"
df.loc[183, 'year'] = 1492

In [318]:
# Sri Lanka (row 184): set the event description.

df.loc[184, 'event_commemorated_and_notes'] = "Celebrates gaining political independence from Britain." 

In [321]:
# Sudan (row 185): set the event description.

df.loc[185, 'event_commemorated_and_notes'] = "Declaration of independence by the Sudanese parliament." 

In [327]:
# Suriname (row 186): set the event description.

df.loc[186, 'event_commemorated_and_notes'] = "Marks the successful completion of independence negotiations with the Netherlands." 

In [330]:
# Sweden (row 187): record that there is no former ruling power ('None' stored as text).

df.loc[187, 'independence_from'] = "None" 

In [334]:
# Switzerland (row 188): record that there is no former ruling power ('None' stored as text).

df.loc[188, 'independence_from'] = "None" 

In [338]:
# Tajikistan (row 190): set the event description.

df.loc[190, 'event_commemorated_and_notes'] = "As the Soviet Union disintegrated, Tajikistan declared itself a sovereign nation." 

In [342]:
# Tanzania (row 191): set the holiday name.

df.loc[191, 'name_of_holiday'] = "Independence Day" 

In [347]:
# Thailand (row 192): supply the date, holiday and background fields.
# date_parsed is zero-padded (YYYY-MM-DD) to match the dates loaded from the CSV.

df.loc[192, 'date_parsed'] = "1960-12-05"
df.loc[192, 'month'] = "December"
df.loc[192, 'name_of_holiday'] = "National Day"
df.loc[192, 'date_of_holiday'] = "December 5"
df.loc[192, 'event_commemorated_and_notes'] = "Commemorates the birth of the late Thai king, their longest reigning monarch."
# 'None' is stored as text to mean "no former ruling power".
df.loc[192, 'independence_from'] = "None"
df.loc[192, 'year'] = 1960

In [350]:
# Togo (row 193): set the holiday name and the event description.

df.loc[193, 'name_of_holiday'] = "Independence Day" 
df.loc[193, 'event_commemorated_and_notes'] = "Commemorates the country's independence from French-administered UN trusteeship." 

In [355]:
# Tonga (row 194): set the holiday name.

df.loc[194, 'name_of_holiday'] = "National Day" 

In [359]:
# Tunisia (row 196): set the event description.

df.loc[196, 'event_commemorated_and_notes'] = "Celebrates the date of independence from France, becoming a republic a year later."

In [363]:
# Turkey (row 197): supply the date, holiday and background fields;
# 'None' (text) means no former ruling power.
turkey_record = {
    'date_parsed': "1923-10-29",
    'month': "October",
    'name_of_holiday': "Republic Day",
    'date_of_holiday': "October 29",
    'event_commemorated_and_notes': "Marks the proclamation of the republic of Turkey.",
    'independence_from': "None",
    'year': 1923,
}
for column, value in turkey_record.items():
    df.loc[197, column] = value

In [368]:
# Turkmenistan (row 198): set the event description and the former power.
 
df.loc[198, 'event_commemorated_and_notes'] = "After a failed coup, became one of the last Soviet nations to declare sovereignty." 
df.loc[198, 'independence_from'] = "Soviet Union" 

In [371]:
# Tuvalu (row 199): set the event description.
 
df.loc[199, 'event_commemorated_and_notes'] = "The day the nation became a sovereign state in the Commonwealth." 

In [375]:
# Uganda (row 200): set the event description.
 
df.loc[200, 'event_commemorated_and_notes'] = "Celebrates gaining independence from the United Kingdom." 

In [380]:
# United Arab Emirates (row 203): set the event description.
 
df.loc[203, 'event_commemorated_and_notes'] = "Commemorates the formation of the Emirates and the end of being a British protectorate." 

In [384]:
# Uzbekistan (row 206): set the event description.
 
df.loc[206, 'event_commemorated_and_notes'] = "The date of the official declaration of independence from the (already collapsed) USSR."

In [388]:
# Vatican City (row 208): supply the date, holiday and background fields;
# 'None' (text) means no former ruling power.
vatican_record = {
    'date_parsed': "1929-03-13",
    'month': "March",
    'name_of_holiday': "National Day",
    'date_of_holiday': "March 13",
    'event_commemorated_and_notes': "The Holy See's national holiday.",
    'independence_from': "None",
    'year': 1929,
}
for column, value in vatican_record.items():
    df.loc[208, column] = value

In [393]:
# Vanuatu (row 207): set the event description and the former power.

df.loc[207, 'event_commemorated_and_notes'] = "Celebrates exiting rule of Britain." 
df.loc[207, 'independence_from'] = "United Kingdom"

In [397]:
# Yemen (row 211): set the holiday name.

df.loc[211, 'name_of_holiday'] = "National Day" 

Code to check that all the NaNs have been dealt with above. 

In [400]:
# Percentage of missing values in each column; all entries should read 0.0 at this point.
nan_percentage = df.isna().mean().mul(100)
nan_percentage

country                         0.0
date_parsed                     0.0
month                           0.0
name_of_holiday                 0.0
date_of_holiday                 0.0
independence_from               0.0
event_commemorated_and_notes    0.0
year                            0.0
dtype: float64

Converting column types 

In [406]:
# Repetitive text columns become categoricals (more efficient) and 'year' goes from
# float to integer, which is safe now that no NaNs remain.
column_types = {
    'month': 'category',
    'independence_from': 'category',
    'name_of_holiday': 'category',
    'year': int,
}
df = df.astype(column_types)

# Verification
print(df.dtypes)

# date_parsed is left as text: the author could not convert it to datetime because of the
# extreme age of some entries (San Marino is cited), which fall outside what pandas
# datetime can represent.

country                           object
date_parsed                       object
month                           category
name_of_holiday                 category
date_of_holiday                   object
independence_from               category
event_commemorated_and_notes      object
year                               int64
dtype: object


Standardising the month format used throughout the dataset.

In [409]:
# Abbreviated month names (from the CSV) mapped to the full names used in the manual fixes.
month_mapping = {
    'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
    'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
    'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
}

# 'month' is already categorical here, and calling Series.replace on a categorical column is
# deprecated in pandas (its effect on the categories changes in a future version). The
# renaming is therefore done on plain Python strings and the result is re-cast, which gives
# a clean twelve-category column on any pandas version. Values that are not abbreviations
# (already-full names) pass through unchanged.
df['month'] = (
    df['month']
    .astype(object)
    .map(lambda month: month_mapping.get(month, month))
    .astype('category')
)
print(df['month'].unique())

['August', 'November', 'July', 'September', 'May', ..., 'December', 'March', 'February', 'June', 'April']
Length: 12
Categories (12, object): ['April', 'August', 'December', 'February', ..., 'May', 'November', 'October', 'September']


Tidying up names used in the 'independence from' column 

In [413]:
# Print every distinct 'independence_from' value, one per line, to spot names needing tidying.
unique_entries = list(df['independence_from'].unique())
for label in unique_entries:
    print(label)


United Kingdom
Ottoman Empire
France
Portugal
Spanish Empire
Russian Soviet Federative Socialist Republic
Soviet Union
None
Allied occupying powers
Pakistan
Nazi Germany
United Netherlands
Spain
Socialist Federal Republic of Yugoslavia
United Kingdom of Portugal, Brazil and the Algarves
Belgium
SFR Yugoslavia
Austria-Hungary
Czechoslovakia
Ethiopia
Italy
Denmark
Netherlands and Japan
Empire of Japan
Russian Soviet Federative Socialist Republic and German Empire
American Colonization Society
America
United States
Qing China[65][66]
Serbia and Montenegro
France and Spain
South Africa
Australia
Sweden
Israel
Colombia
Spanish Empire[72]
New Zealand
Malaysia
Italy and United Kingdom
Sudan
Egypt and the United Kingdom
Netherlands
Kingdom of Great Britain
United Provinces of the Rio de la Plata
Empire of Japan and France


In [458]:
# Changed some names in the dataset to make it clearer to understand.
# Each key is matched against the whole cell value (exact match, no regex) and replaced by
# the simpler name. Composite entries ("X and Y") are deliberately collapsed to one power.
# No replacement value is itself a key, so one pass is equivalent to the earlier
# call-by-call chain of replacements.
# NOTE(review): 'United Provinces of the Rio de la Plata' is mapped to 'Portugal', which looks
# historically doubtful (that state preceded Argentina); the mapping is kept as the author
# wrote it - please confirm.
independence_from_aliases = {
    'Kingdom of Great Britain': 'United Kingdom',
    'Empire of Japan and France': 'Japan',
    'Spanish Empire[72]': 'Spain',
    'Italy and United Kingdom': 'Italy',
    'SFR Yugoslavia': 'Yugoslavia',
    'America': 'United States',
    'United Netherlands': 'Netherlands',
    'Spanish Empire': 'Spain',
    'Empire of Japan': 'Japan',
    'Nazi Germany': 'Germany',
    'Qing China[65][66]': 'China',
    'France and Spain': 'Spain',
    'Allied occupying powers': 'United States',
    'Netherlands and Japan': 'Netherlands',
    'Egypt and the United Kingdom': 'Egypt',
    'Serbia and Montenegro': 'Serbia',
    'American Colonization Society': 'United States',
    'United Provinces of the Rio de la Plata': 'Portugal',
    'Socialist Federal Republic of Yugoslavia': 'Yugoslavia',
    'United Kingdom of Portugal, Brazil and the Algarves': 'Portugal',
    'Russian Soviet Federative Socialist Republic and German Empire': 'Soviet Union',
    'Russian Soviet Federative Socialist Republic': 'Soviet Union',
}

# The column is categorical, and Series.replace on categoricals is deprecated in pandas, so
# the renaming is done on plain strings and the result is re-cast. Re-casting also drops the
# old, now-unused category labels so they cannot appear as zero counts later.
df['independence_from'] = (
    df['independence_from']
    .astype(object)
    .map(lambda power: independence_from_aliases.get(power, power))
    .astype('category')
)

In [459]:
# Share of rows attributed to each 'independence_from' value, as percentages (largest first).
percentage_counts = df['independence_from'].value_counts(normalize=True).mul(100)
print(percentage_counts)

independence_from
United Kingdom     25.700935
None               13.084112
France             12.616822
Spain              11.214953
Soviet Union       10.747664
Portugal            4.205607
Ottoman Empire      3.271028
United States       2.336449
Yugoslavia          1.869159
Japan               1.869159
Netherlands         1.401869
Belgium             1.401869
Italy               1.401869
Australia           0.934579
Denmark             0.934579
Germany             0.467290
Austria-Hungary     0.467290
Malaysia            0.467290
Israel              0.467290
Pakistan            0.467290
Ethiopia            0.467290
China               0.467290
Egypt               0.467290
Serbia              0.467290
South Africa        0.467290
Czechoslovakia      0.467290
Sudan               0.467290
Sweden              0.467290
Colombia            0.467290
New Zealand         0.467290
Name: proportion, dtype: float64


Standardising the holiday names where possible

In [491]:
# Shorten verbose holiday names to a single label per entry. Keys are matched
# against the whole cell value (regex=False), so parentheses and brackets are
# taken literally. No replacement value is itself a key, so applying the
# mapping in one pass gives the same result as applying the entries one by one.
holiday_name_aliases = {
    # Parenthetical translation dropped, English name kept
    'Independence Day (Araw ng Kalayaan or Araw ng Kasarinlan)': 'Independence Day',
    'State Independence Day of Romania(Ziua Independenței de Stat a României)': 'State Independence Day',
    'Flag Day (Dita e Flamurit)': 'Flag Day',
    'Independence Day (Święto Niepodległości)': 'Independence Day',
    'Day of Hungarian Freedom (A magyar szabadság napja)': 'Day of Hungarian Freedom',
    # Alternative / local name kept instead of the generic one
    'National Day or Bastille Day': 'Bastille Day',
    'Grito de DoloresDía de la Independéncia': 'Día de la Independéncia',
    'Independence Day or Fourth of July': 'Fourth of July',
    'Afghan Independence Day (Afghan Victory Day)': 'Afghan Victory Day',
    'Independence DayDia da Independência Nacional': 'Dia da Independência Nacional',
    'Independence Day (Dia da Independência)': 'Dia da Independência',
    'Independence Day (Hari Kemerdekaan)': 'Hari Kemerdekaan',
    'Independence Day (Youm-e-Azadi)': 'Youm-e-Azadi',
    'Independence Day (part of Fiestas Patrias)': 'Fiestas Patrias',
    # Trailing footnote marker removed
    'Commencement of the Wars of Independence[46]': 'Commencement of the Wars of Independence',
}
df['name_of_holiday'] = df['name_of_holiday'].replace(holiday_name_aliases, regex=False)

In [492]:
# Exploring the spread of data in name_of_holiday: percentage share of each
# distinct holiday name after the standardisation step.
percentage_counts = df['name_of_holiday'].value_counts(normalize=True) * 100
print(percentage_counts)

name_of_holiday
Independence Day                              54.672897
National Day                                  15.420561
Republic Day                                   4.205607
Constitution Day                               1.401869
Statehood Day                                  0.934579
Fiestas Patrias                                0.934579
Islamic Republic Day                           0.934579
Proclamation of Independence Day               0.934579
Foundation Day                                 0.934579
Afghan Victory Day                             0.467290
King's Day                                     0.467290
Liberation Day                                 0.467290
Mauritius Day                                  0.467290
Bastille Day                                   0.467290
Not a holiday                                  0.467290
Proclamation Day of the Republic of Latvia     0.467290
Restoration of Independence Day                0.467290
Restoration Day                 

Final check of the data

In [494]:
# Preview the first five rows of the frame as a last sanity check.
df.head(n=5)

Unnamed: 0,country,date_parsed,month,name_of_holiday,date_of_holiday,independence_from,event_commemorated_and_notes,year
0,Afghanistan,1919-08-19,August,Afghan Victory Day,August 19,United Kingdom,Anglo-Afghan Treaty of 1919 or Treaty of Rawal...,1919
1,Albania,1912-11-28,November,Flag Day,November 28,Ottoman Empire,Albanian Declaration of Independence. The foll...,1912
2,Algeria,1962-07-05,July,Independence Day,July 5,France,Algeria gained independence following the Alge...,1962
3,Andorra,1993-09-08,September,National Day,September 8,France,Andorra commemorate their patron saint (Lady o...,1993
4,Angola,1975-11-11,November,Independence Day,November 11,Portugal,"The Alvor Agreement, signed on 15 January 1975...",1975


Saving the cleaned data file

In [497]:
# Write the frame to CSV in the current working directory. index=True is the
# to_csv default and is spelled out here: the row index is written as an
# unnamed leading column.
# NOTE(review): switch to index=False if that column is not wanted downstream.
df.to_csv('cleaned_holidays.csv', index=True)