In [1]:
# Dependencies
import pandas as pd
import csv, os

In [2]:
# File paths
data_path_1 = "Resources/WH_2015.csv"
data_path_2 = "Resources/WH_2016.csv"
data_path_3 = "Resources/WH_2017.csv"
data_path_4 = "Resources/WH_2018.csv"

In [3]:
# Read CSV
wh_2015_db = pd.read_csv(data_path_1)
wh_2016_db = pd.read_csv(data_path_2)
wh_2017_db = pd.read_csv(data_path_3)
wh_2018_db = pd.read_csv(data_path_4)

In [4]:
# Drop unwanted columns and merge
wh_2015_v1 = wh_2015_db.loc[:, ["Country",
                                "Happiness Rank",
                                "Happiness Score"]]

wh_2016_v1 = wh_2016_db.loc[:, ["Country",
                                "Happiness Rank",
                                "Happiness Score"]]

combined_df =  pd.merge(wh_2015_v1,wh_2016_v1, on = ["Country"], how = "outer")

In [5]:
# Rename columns
combined_df = combined_df.rename(columns = {"Happiness Rank_x": "Rank_2015",
                                            "Happiness Score_x": "Score_2015",
                                            "Happiness Rank_y": "Rank_2016",
                                            "Happiness Score_y":"Score_2016"})

combined_df.head()

Unnamed: 0,Country,Rank_2015,Score_2015,Rank_2016,Score_2016
0,Switzerland,1.0,7.587,2.0,7.509
1,Iceland,2.0,7.561,3.0,7.501
2,Denmark,3.0,7.527,1.0,7.526
3,Norway,4.0,7.522,4.0,7.498
4,Canada,5.0,7.427,6.0,7.404


In [6]:
# Drop unwanted columns and merge
wh_2017_v1 = wh_2017_db.loc[:, ["Country",
                                "Happiness.Rank",
                                "Happiness.Score"]]

combined_df =  pd.merge(combined_df,wh_2017_v1, on = ["Country"], how = "outer")
combined_df

Unnamed: 0,Country,Rank_2015,Score_2015,Rank_2016,Score_2016,Happiness.Rank,Happiness.Score
0,Switzerland,1.0,7.587,2.0,7.509,4.0,7.494
1,Iceland,2.0,7.561,3.0,7.501,3.0,7.504
2,Denmark,3.0,7.527,1.0,7.526,2.0,7.522
3,Norway,4.0,7.522,4.0,7.498,1.0,7.537
4,Canada,5.0,7.427,6.0,7.404,7.0,7.316
...,...,...,...,...,...,...,...
161,Somaliland Region,,,97.0,5.057,,
162,Namibia,,,113.0,4.574,111.0,4.574
163,South Sudan,,,143.0,3.832,147.0,3.591
164,Taiwan Province of China,,,,,33.0,6.422


In [7]:
# Rename columns
combined_df = combined_df.rename(columns = {"Happiness.Rank": "Rank_2017",
                                            "Happiness.Score": "Score_2017"})

combined_df.head()

Unnamed: 0,Country,Rank_2015,Score_2015,Rank_2016,Score_2016,Rank_2017,Score_2017
0,Switzerland,1.0,7.587,2.0,7.509,4.0,7.494
1,Iceland,2.0,7.561,3.0,7.501,3.0,7.504
2,Denmark,3.0,7.527,1.0,7.526,2.0,7.522
3,Norway,4.0,7.522,4.0,7.498,1.0,7.537
4,Canada,5.0,7.427,6.0,7.404,7.0,7.316


In [8]:
# Drop unwanted columns, rename and merge
wh_2018_v1 = wh_2018_db.loc[:, ["Country or region",
                                "Overall rank",
                                "Score"]]

wh_2018_v1 = wh_2018_v1.rename(columns = {"Country or region": "Country",
                                          "Overall rank": "Rank_2018",
                                          "Score": "Score_2018"})

combined_df =  pd.merge(combined_df,wh_2018_v1, on = ["Country"], how = "outer")

# Drop unwanted columns, rename and merge
wh_2019_v1 = wh_2019_db.loc[:, ["Country or region",
                                "Overall rank",
                                "Score"]]

wh_2019_v1 = wh_2019_v1.rename(columns = {"Country or region": "Country",
                                          "Overall rank": "Rank_2019",
                                          "Score": "Score_2019"})

combined_df =  pd.merge(combined_df,wh_2019_v1, on = ["Country"], how = "outer")
combined_df

In [9]:
# Convert Country column name 
combined_df = combined_df.rename(columns = {"Country" : "Country_Name"})
combined_df

Unnamed: 0,Country_Name,Rank_2015,Score_2015,Rank_2016,Score_2016,Rank_2017,Score_2017,Rank_2018,Score_2018
0,Switzerland,1.0,7.587,2.0,7.509,4.0,7.494,5.0,7.487
1,Iceland,2.0,7.561,3.0,7.501,3.0,7.504,4.0,7.495
2,Denmark,3.0,7.527,1.0,7.526,2.0,7.522,3.0,7.555
3,Norway,4.0,7.522,4.0,7.498,1.0,7.537,2.0,7.594
4,Canada,5.0,7.427,6.0,7.404,7.0,7.316,7.0,7.328
...,...,...,...,...,...,...,...,...,...
163,South Sudan,,,143.0,3.832,147.0,3.591,154.0,3.254
164,Taiwan Province of China,,,,,33.0,6.422,,
165,"Hong Kong S.A.R., China",,,,,71.0,5.472,,
166,Trinidad & Tobago,,,,,,,38.0,6.192


In [10]:
# Save to CSV
combined_df.to_csv("Resources/cleaned_WH_data.CSV", index = False)

In [11]:
country_df = pd.DataFrame(combined_df["Country_Name"])
country_df

Unnamed: 0,Country_Name
0,Switzerland
1,Iceland
2,Denmark
3,Norway
4,Canada
...,...
163,South Sudan
164,Taiwan Province of China
165,"Hong Kong S.A.R., China"
166,Trinidad & Tobago


In [12]:
volcano_path = "Resources/cleaned_volcano_data.CSV"
volcano_df= pd.read_csv(volcano_path)
volcano_df

Unnamed: 0,Year,Country_Name,Elevation,Total_Deaths,Total_Missing_Persons,Total_Injuries
0,2015,Indonesia,2460,,,
1,2015,Chile,2003,,,
2,2015,Indonesia,1784,,,
3,2015,Papua New Guinea,1807,,,2.0
4,2015,Indonesia,2460,1.0,,
5,2015,New Zealand,1111,1.0,,
6,2016,Indonesia,2460,1.0,,4.0
7,2016,Indonesia,2460,7.0,,3.0
8,2016,United States of America,2805,1.0,,
9,2016,Indonesia,3726,,44.0,


In [13]:
# Test for country column consistency
merged_df = country_df.merge(volcano_df, on="Country_Name", how="outer", indicator=True)
merged_df

Unnamed: 0,Country_Name,Year,Elevation,Total_Deaths,Total_Missing_Persons,Total_Injuries,_merge
0,Switzerland,,,,,,left_only
1,Iceland,,,,,,left_only
2,Denmark,,,,,,left_only
3,Norway,,,,,,left_only
4,Canada,,,,,,left_only
...,...,...,...,...,...,...,...
183,United States of America,2016.0,2805.0,1.0,,,right_only
184,United States of America,2018.0,1222.0,1.0,,3.0,right_only
185,United States of America,2018.0,1222.0,,,,right_only
186,Vanuatu,2017.0,1496.0,,,,right_only


In [14]:
merged_df[merged_df["_merge"] == "right_only"]

Unnamed: 0,Country_Name,Year,Elevation,Total_Deaths,Total_Missing_Persons,Total_Injuries,_merge
180,Papua New Guinea,2015.0,1807.0,,,2.0,right_only
181,Papua New Guinea,2018.0,365.0,,,,right_only
182,Papua New Guinea,2018.0,365.0,,,,right_only
183,United States of America,2016.0,2805.0,1.0,,,right_only
184,United States of America,2018.0,1222.0,1.0,,3.0,right_only
185,United States of America,2018.0,1222.0,,,,right_only
186,Vanuatu,2017.0,1496.0,,,,right_only
187,Vanuatu,2018.0,1496.0,4.0,,,right_only


In [15]:
country_path = "Resources/UNSD — Methodology.CSV"
country_df = pd.read_csv(country_path, error_bad_lines=False)
country_df



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 67: expected 16 fields, saw 17\nSkipping line 126: expected 16 fields, saw 17\nSkipping line 127: expected 16 fields, saw 17\n'


Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS),Developed / Developing Countries
0,1,World,2.0,Africa,15.0,Northern Africa,,,Algeria,12,DZ,DZA,,,,Developing
1,1,World,2.0,Africa,15.0,Northern Africa,,,Egypt,818,EG,EGY,,,,Developing
2,1,World,2.0,Africa,15.0,Northern Africa,,,Libya,434,LY,LBY,,,,Developing
3,1,World,2.0,Africa,15.0,Northern Africa,,,Morocco,504,MA,MAR,,,,Developing
4,1,World,2.0,Africa,15.0,Northern Africa,,,Sudan,729,SD,SDN,x,,,Developing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,1,World,9.0,Oceania,61.0,Polynesia,,,Samoa,882,WS,WSM,,,x,Developing
242,1,World,9.0,Oceania,61.0,Polynesia,,,Tokelau,772,TK,TKL,,,,Developing
243,1,World,9.0,Oceania,61.0,Polynesia,,,Tonga,776,TO,TON,,,x,Developing
244,1,World,9.0,Oceania,61.0,Polynesia,,,Tuvalu,798,TV,TUV,x,,x,Developing


In [16]:
# Dropping unwanted columns
cleaned_country_df = pd.DataFrame(country_df, columns = {"Country or Area", "M49 Code", "ISO-alpha3 Code", "Developed / Developing Countries"})
cleaned_country_df

Unnamed: 0,Country or Area,M49 Code,Developed / Developing Countries,ISO-alpha3 Code
0,Algeria,12,Developing,DZA
1,Egypt,818,Developing,EGY
2,Libya,434,Developing,LBY
3,Morocco,504,Developing,MAR
4,Sudan,729,Developing,SDN
...,...,...,...,...
241,Samoa,882,Developing,WSM
242,Tokelau,772,Developing,TKL
243,Tonga,776,Developing,TON
244,Tuvalu,798,Developing,TUV


In [17]:
# Checking for duplicates
cleaned_country_df.drop_duplicates(subset=['Country or Area'])
cleaned_country_df

Unnamed: 0,Country or Area,M49 Code,Developed / Developing Countries,ISO-alpha3 Code
0,Algeria,12,Developing,DZA
1,Egypt,818,Developing,EGY
2,Libya,434,Developing,LBY
3,Morocco,504,Developing,MAR
4,Sudan,729,Developing,SDN
...,...,...,...,...
241,Samoa,882,Developing,WSM
242,Tokelau,772,Developing,TKL
243,Tonga,776,Developing,TON
244,Tuvalu,798,Developing,TUV


In [18]:
cleaned_country_df = cleaned_country_df.rename(columns = {"Country or Area": "Country_Name",
                                                          "M49 Code":"Country_ID",
                                                          "ISO-alpha3 Code": "Country_Abbreviation",
                                                         "Developed / Developing Countries": "Developed_Or_Developing"})
cleaned_country_df

Unnamed: 0,Country_Name,Country_ID,Developed_Or_Developing,Country_Abbreviation
0,Algeria,12,Developing,DZA
1,Egypt,818,Developing,EGY
2,Libya,434,Developing,LBY
3,Morocco,504,Developing,MAR
4,Sudan,729,Developing,SDN
...,...,...,...,...
241,Samoa,882,Developing,WSM
242,Tokelau,772,Developing,TKL
243,Tonga,776,Developing,TON
244,Tuvalu,798,Developing,TUV


In [19]:
# Save to CSV
cleaned_country_df.to_csv("Resources/cleaned_country_data.CSV", index = False)

In [20]:
# Test for country column consistency
merged_df = cleaned_country_df.merge(volcano_df, on="Country_Name", how="outer", indicator=True)
merged_df

Unnamed: 0,Country_Name,Country_ID,Developed_Or_Developing,Country_Abbreviation,Year,Elevation,Total_Deaths,Total_Missing_Persons,Total_Injuries,_merge
0,Algeria,12,Developing,DZA,,,,,,left_only
1,Egypt,818,Developing,EGY,,,,,,left_only
2,Libya,434,Developing,LBY,,,,,,left_only
3,Morocco,504,Developing,MAR,,,,,,left_only
4,Sudan,729,Developing,SDN,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...
258,Samoa,882,Developing,WSM,,,,,,left_only
259,Tokelau,772,Developing,TKL,,,,,,left_only
260,Tonga,776,Developing,TON,,,,,,left_only
261,Tuvalu,798,Developing,TUV,,,,,,left_only


In [21]:
merged_df[merged_df["_merge"] == "right_only"]

Unnamed: 0,Country_Name,Country_ID,Developed_Or_Developing,Country_Abbreviation,Year,Elevation,Total_Deaths,Total_Missing_Persons,Total_Injuries,_merge


In [22]:
earthquake_path = "Resources/cleaned_worldwide_earthquake_data.CSV"
earthquake_df= pd.read_csv(earthquake_path)
earthquake_df

Unnamed: 0,Year,Magnitude,Country_Name,Total_Deaths,Total_Missing_Persons,Total_Injuries
0,2015,,China,,,11.0
1,2015,6.7,Japan,,,
2,2015,6.4,Vanuatu,,,
3,2015,,China,,,
4,2015,,Bosnia and Herzegovina,4.0,,1.0
...,...,...,...,...,...,...
224,2018,7.5,New Caledonia,,,
225,2018,5.6,Vanuatu,,,
226,2018,5.4,China,,,17.0
227,2018,5.5,Mozambique,,,10.0


In [23]:
merged_df2 = cleaned_country_df.merge(earthquake_df, on="Country_Name", how="outer", indicator=True)
merged_df2

Unnamed: 0,Country_Name,Country_ID,Developed_Or_Developing,Country_Abbreviation,Year,Magnitude,Total_Deaths,Total_Missing_Persons,Total_Injuries,_merge
0,Algeria,12,Developing,DZA,2016.0,5.4,,,28.0,both
1,Egypt,818,Developing,EGY,,,,,,left_only
2,Libya,434,Developing,LBY,,,,,,left_only
3,Morocco,504,Developing,MAR,2016.0,6.3,1.0,,26.0,both
4,Sudan,729,Developing,SDN,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...
413,Samoa,882,Developing,WSM,,,,,,left_only
414,Tokelau,772,Developing,TKL,,,,,,left_only
415,Tonga,776,Developing,TON,,,,,,left_only
416,Tuvalu,798,Developing,TUV,,,,,,left_only


In [24]:
merged_df2[merged_df2["_merge"] == "right_only"]

Unnamed: 0,Country_Name,Country_ID,Developed_Or_Developing,Country_Abbreviation,Year,Magnitude,Total_Deaths,Total_Missing_Persons,Total_Injuries,_merge


In [25]:
tsunami_path = "Resources/cleaned_tsunami_data.CSV"
tsunami_df= pd.read_csv(tsunami_path)
tsunami_df

Unnamed: 0,Year,Country_Name,Cause,Total_Deaths
0,2015,Japan,Earthquake,
1,2015,Papua New Guinea,Earthquake,
2,2015,Vanuatu,Earthquake,
3,2015,Japan,Earthquake,
4,2015,Papua New Guinea,Earthquake,
...,...,...,...,...
58,2017,New Caledonia,Earthquake,
59,2017,New Caledonia,Earthquake,
60,2018,New Caledonia,Earthquake,
61,2018,Russian Federation,Landslide,


In [26]:
merged_df3 = cleaned_country_df.merge(tsunami_df, on="Country_Name", how="outer", indicator=True)
merged_df3

Unnamed: 0,Country_Name,Country_ID,Developed_Or_Developing,Country_Abbreviation,Year,Cause,Total_Deaths,_merge
0,Algeria,12,Developing,DZA,,,,left_only
1,Egypt,818,Developing,EGY,,,,left_only
2,Libya,434,Developing,LBY,,,,left_only
3,Morocco,504,Developing,MAR,,,,left_only
4,Sudan,729,Developing,SDN,,,,left_only
...,...,...,...,...,...,...,...,...
278,Samoa,882,Developing,WSM,,,,left_only
279,Tokelau,772,Developing,TKL,,,,left_only
280,Tonga,776,Developing,TON,,,,left_only
281,Tuvalu,798,Developing,TUV,,,,left_only


In [27]:
merged_df3[merged_df3["_merge"] == "right_only"]

Unnamed: 0,Country_Name,Country_ID,Developed_Or_Developing,Country_Abbreviation,Year,Cause,Total_Deaths,_merge


In [28]:
# 