# Happiness/GDP/Alcohol vs. Population Analysis

## Library Imports

In [None]:
#import dependencies 
from matplotlib import pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd
from scipy.stats import linregress

## Importing Happiness, GDP, and Alcohol Consumption Information

In [None]:
# Load the happiness / GDP / alcohol-consumption CSV and preview its first rows.
data_df=pd.read_csv("Data/HappinessAlcoholConsumption.csv")
data_df.head()

## Dataframe Information
We use this section to look at the information this dataframe holds as well as clean it before use.

In [None]:
# Drop every row that has at least one missing value.
data_df = data_df.dropna()

# Show the dtype pandas inferred for each column.
data_df.dtypes

While doing an initial analysis, we noticed that GDP_PerCapita didn't match what we were seeing elsewhere: many countries had a GDP per capita that was too small by a factor of 1000. We suspect the data originated somewhere outside the US, where a period rather than a comma is used as the thousands separator. In the original data, the remaining rows have no decimal point in their values, which further suggested a thousands-separator problem. Below, we find the rows whose GDP_PerCapita contains a period and multiply only those values by 1000.

In [None]:
# Preview the rows whose GDP_PerCapita has a fractional part; these are the
# values suspected of using "." as a thousands separator.
data_df.loc[
    data_df["GDP_PerCapita"] % 1 > 0, :
].head()

In [None]:
# A fractional part marks a value read with "." as the thousands separator
# (e.g. 1.029 stands for 1,029), so only those rows are scaled by 1000.
has_fraction = data_df["GDP_PerCapita"] % 1 > 0

# Round rather than truncate: 1000 * x can land a hair below the intended
# whole number because of floating-point error, and int() would then drop 1.
data_df.loc[has_fraction, "GDP_PerCapita"] = (
    data_df.loc[has_fraction, "GDP_PerCapita"] * 1000
).round()

data_df.head()

We can see that there are no other rows with incorrect GDP_PerCapita data.

In [None]:
# Re-run the fractional-part filter; an empty preview means no GDP_PerCapita
# value still carries decimals.
data_df.loc[
    data_df["GDP_PerCapita"] % 1 > 0, :
].head()

In [None]:
# Shape of the dataframe as (rows, columns)
data_df.shape

In [None]:
# Number of countries recorded under each distinct Hemisphere label
data_df.groupby("Hemisphere")["Country"].count()

As you can see above, there is a misprint in the data: "noth" appears instead of "north." The cells below update the dataframe to fix that issue.

In [None]:
# Rows whose Hemisphere value is misspelled as 'noth'
data_df.loc[data_df["Hemisphere"] == 'noth',:]

In [None]:
# Correct the misspelled hemisphere label in one vectorised assignment
# instead of mutating the frame row by row while iterating over it.
data_df.loc[data_df["Hemisphere"] == "noth", "Hemisphere"] = "north"

data_df.head()

As you can see below, there are no more rows with that issue.

In [None]:
# Same 'noth' filter as before the fix; the result should now be empty.
data_df.loc[data_df["Hemisphere"] == 'noth',:]

In [None]:
# Recount the countries per Hemisphere label to confirm the correction.
data_df.groupby("Hemisphere")["Country"].count()

In [None]:
# Distinct Region values, in order of first appearance
data_df["Region"].unique()

In [None]:
# Distinct Country values, in order of first appearance
data_df["Country"].unique()

## Importing Population Data

We want to compute the GDP per capita of each region, which requires the population of each country. Our dataset above is from 2016, but the closest population data we could find is from 2018. The population dataset has other columns as well, but we restrict it to the total population for that year.

In [None]:
# Load the population dataset and keep only the 2018 total per country.
pop_data = pd.read_csv("Data/Population Data/Global Population Trends(2016-2022).csv")

# .copy() makes this an independent frame, so assigning to its columns later
# does not write into a slice of pop_data (SettingWithCopyWarning).
pop_data_2018 = pop_data.loc[pop_data["Year"] == 2018, ["Country", "Total Population"]].copy()
print(pop_data_2018.dtypes)
pop_data_2018.head()

Above, it is clear that the population is not a number as it is an "object" type. This means the values must be turned into numerical values instead. To do this, we remove the commas from the number, then update it to an integer type.

In [None]:
# Strip the thousands commas from the population strings, then store the
# column as 64-bit integers.
population_text = pop_data_2018["Total Population"].str.replace(",", "")
pop_data_2018["Total Population"] = population_text.astype("int64")
pop_data_2018.dtypes

Merging the two dataframes together... I opted to use a right join to see all of the countries in the gdp/happiness/alcohol file.

In [None]:
# Right join on Country: every row of data_df is kept, and the "_merge"
# indicator column records whether a population row was found for it.
merged_data = pd.merge(
    pop_data_2018,
    data_df,
    how="right",
    left_on="Country",
    right_on="Country",
    suffixes=("_pop", "_data"),
    indicator=True,
)
merged_data

In the merged file, I wasn't able to show the "Country" column from both dataframes, which is what I wanted in order to find the countries in one file that had no match in the other. However, the "indicator" argument of pandas.merge() let me find them.

In the cells below, I plan to programmatically update each dataframe to match the country in the other file with a more appropriate name. I'll show those countries below.

In [None]:
# Countries present in the Happiness / GDP / Alcohol data for which the merge
# found no population row. They are looked up by hand in the population data.
without_population = merged_data["_merge"] == "right_only"
missed_countries = merged_data.loc[without_population, "Country"].sort_values().unique()
missed_countries

In [None]:
# Sorted list of the distinct country names in the 2018 population data,
# for comparison against the unmatched names above.
pop_data_2018.loc[:,"Country"].sort_values().unique()

Comparing the two lists, the table below shows, for each unmatched country, its name in the happiness / GDP / alcohol dataset, the corresponding name in the population dataset, and which side will be adjusted:

| 'Happiness / GDP / Alcohol' Name | Population Name | Where to Fix? |
| :------------------------------- | :-------------- | :------------ |
| Bosnia and Herzegovina | Bosnia | Left |
| Cote d'Ivoire | *Missing* | *N/A* |
| Dem. Rep. Congo | Democratic Republic of Congo | Left | 
| Kyrgyzstan | Kyrgyz Republic | Right |
| Macedonia | North Macedonia | Left |
| Rep. Congo | Republic of Congo | Left |
| Russian Federation | Russia | Left |
| Slovakia | Slovak Republic | Right | 
| Syria | Syrian Arab Republic | Right |
| Trinidad and Tobago | Trinidad And Tobago | Right |
| United Arab Emirates | UAE | Right |

In [None]:
# Names as they appear in the happiness / GDP / alcohol data.
left_list = [
    "Bosnia and Herzegovina", "Dem. Rep. Congo", "Kyrgyzstan", "Macedonia", "Rep. Congo",
    "Russian Federation", "Slovakia", "Syria", "Trinidad and Tobago", "United Arab Emirates"
]

# The matching names as they appear in the population data (same order).
right_list = [
    "Bosnia", "Democratic Republic of Congo", "Kyrgyz Republic", "North Macedonia",
    "Republic of Congo", "Russia", "Slovak Republic", "Syrian Arab Republic",
    "Trinidad And Tobago", "UAE"
]

# "L": the final name is the population spelling (right_list entry).
# "R": the final name is the happiness-data spelling (left_list entry).
fix_list = [
    "L", "L", "R", "L", "L", "L", "R", "R", "R", "R"
]

# Renaming "Country" in merged_data after the merge only relabels rows; their
# population stays missing because the join has already happened. Instead,
# harmonise the names on copies of both inputs and redo the merge so these
# countries pick up their population.
happiness_renames = {}
population_renames = {}
for happiness_name, population_name, keep in zip(left_list, right_list, fix_list):
    if keep == "L":
        happiness_renames[happiness_name] = population_name
    else:
        population_renames[population_name] = happiness_name

# data_df and pop_data_2018 themselves are left untouched (assign returns new
# frames). "_merge" is kept so the following cell can still drop it.
merged_data = pop_data_2018.assign(
    Country=pop_data_2018["Country"].replace(population_renames)
).merge(
    data_df.assign(Country=data_df["Country"].replace(happiness_renames)),
    how="right",
    left_on="Country",
    right_on="Country",
    suffixes=("_pop", "_data"),
    indicator=True,
)

Dropping the "_merge" column

In [None]:
# The merge indicator has served its purpose; remove that column.
merged_data = merged_data.drop("_merge", axis=1)
merged_data.head()

In [None]:
# Column dtypes of the merged frame
merged_data.dtypes

In [None]:
# Turn each per-capita measure into a country total by multiplying it with
# the country's population. The results go into new columns on a copy, so
# merged_data itself is unchanged.
merged_data_complete = merged_data.copy()

for measure in ("GDP", "Beer", "Spirit", "Wine"):
    merged_data_complete[measure] = (
        merged_data_complete["Total Population"] * merged_data_complete[f"{measure}_PerCapita"]
    )

merged_data_complete

In [None]:
# Sum population and the country totals within each region, then divide the
# regional totals by the regional population to get per-capita figures.
summed_measures = ["GDP", "Beer", "Spirit", "Wine"]

region_data = merged_data_complete.groupby(["Region"])
per_capita_data = region_data[["Total Population", *summed_measures]].sum()

for measure in summed_measures:
    per_capita_data[f"{measure}_per"] = (
        per_capita_data[measure] / per_capita_data["Total Population"]
    )

per_capita_data

In [None]:
# Stacked bar chart of the regional beer / spirit / wine per-capita values.
# reset_index() returns a new frame with "Region" as a regular column, which
# the plot uses for the x axis.
per_capita_data_copy = per_capita_data.reset_index()

stacked_columns = ["Region", "Beer_per", "Spirit_per", "Wine_per"]
per_capita_data_copy[stacked_columns].plot.bar(
    x="Region",
    stacked=True,
    title="Total Alcohol Consumption By Region",
    color=["goldenrod", "darkred", "slategray"],
)
plt.tight_layout()
plt.grid(axis="y")

# Write the chart to disk before showing it.
plt.savefig("output_data/Total Alcohol Consumption By Region.png")
plt.show()

## GDP Analysis
first by regional averages, then by country

In [None]:
#Analysing GDP
# Independent copy holding only the columns used by the GDP charts.
gdp_df=data_df[["Country","Region","GDP_PerCapita"]].copy()
gdp_df

In [None]:
# Average GDP per capita by region.
# NOTE: this is the unweighted mean of the countries' per-capita values. To get
# a true regional figure, GDP per capita would first have to be converted to
# total GDP (multiplied by each country's population) before aggregating.
regional_mean_gdp = gdp_df.groupby('Region')['GDP_PerCapita'].mean()

plt.figure(figsize=(15, 6))
regional_mean_gdp.plot(kind='bar', color='lightblue')
plt.title('Average GDP Per Capita by Region', fontsize=20)
plt.xlabel('Region')
plt.ylabel('GDP Per Capita')
plt.xticks(rotation=90)
plt.show()

In [None]:
# GDP per capita plotted against region. One translucent bar is drawn per
# country at its region's x position, so bars of the same region overlap.
region = gdp_df["Region"]
gdp = gdp_df["GDP_PerCapita"]

plt.bar(region, gdp, color="r", alpha=0.5, align="center")
tick_locations = list(region)
plt.xticks(tick_locations, region, rotation=90)

plt.title("GDP by Region")
plt.xlabel("Region")
plt.ylabel("GDP Per Capita")


In [None]:
# Top 25 countries by GDP per capita (sorted highest first)
gdp=gdp_df.sort_values(["GDP_PerCapita"],ascending=False)
gdp.head(25)

In [None]:
# Horizontal bar chart of the 25 countries with the highest GDP per capita.
top_25_countries = gdp_df.nlargest(25, 'GDP_PerCapita')

# DataFrame.plot opens its own figure, so the size is passed to it directly.
# A separate plt.figure(figsize=...) call would only leave an empty figure
# behind while the chart itself used the default size.
top_25_countries.plot(kind='barh', x='Country', y='GDP_PerCapita', color='lightblue',
                      legend=False, figsize=(15, 10))
plt.title('GDP Per Capita of Top 25 Countries', fontsize=20)
plt.xlabel('GDP Per Capita')
plt.ylabel('Country')
plt.show()


In [None]:
# Bottom 25 countries by GDP per capita (sorted lowest first)
gdp=gdp_df.sort_values(["GDP_PerCapita"],ascending=True)
gdp.head(25)

In [None]:
# Horizontal bar chart of the 25 countries with the lowest GDP per capita.
smallest_25_countries = gdp_df.nsmallest(25, 'GDP_PerCapita')

# DataFrame.plot opens its own figure, so the size is passed to it directly.
# A separate plt.figure(figsize=...) call would only leave an empty figure
# behind while the chart itself used the default size.
smallest_25_countries.plot(kind='barh', x='Country', y='GDP_PerCapita', color='lightcoral',
                           legend=False, figsize=(15, 10))
plt.title('Lowest GDP Per Capita of 25 Countries', fontsize=20)
plt.xlabel('GDP Per Capita')
plt.ylabel('Country')
plt.show()

## Human Development Index (HDI)
first by regional averages, then by country

In [None]:
#Analysing for HDI by Region
# Independent copy holding only the columns used by the HDI charts.
hdi_df=data_df[["Country","Region","HDI"]].copy()
hdi_df

In [None]:
# Unweighted mean HDI of the countries in each region, as a bar chart.
regional_mean_hdi = hdi_df.groupby('Region')['HDI'].mean()

plt.figure(figsize=(15, 6))
regional_mean_hdi.plot(kind='bar', color='lightblue')
plt.title('Average HDI by Region', fontsize=20)
plt.xlabel('Region')
plt.ylabel('Human Development Index')
plt.xticks(rotation=90)
plt.show()

In [None]:
# HDI plotted against region. One translucent bar is drawn per country at its
# region's x position, so bars of the same region overlap.
hdi = hdi_df["HDI"]
# Read the regions from hdi_df itself so the x values and bar heights come
# from the same frame rather than depending on gdp_df.
region = hdi_df["Region"]

plt.bar(region, hdi, color="r", alpha=0.5, align="center")
tick_locations = list(region)
plt.xticks(tick_locations, region, rotation=90)

plt.title("HDI by Region")
plt.xlabel("Region")
plt.ylabel("Human Development Index")

In [None]:
# Top 25 countries by HDI (sorted highest first)
hdi=hdi_df.sort_values(["HDI"],ascending=False)
hdi.head(25)

In [None]:
# Horizontal bar chart of the 25 countries with the highest HDI.
top_25_countries = hdi_df.nlargest(25, 'HDI')

# DataFrame.plot opens its own figure, so the size is passed to it directly.
# A separate plt.figure(figsize=...) call would only leave an empty figure
# behind while the chart itself used the default size.
top_25_countries.plot(kind='barh', x='Country', y='HDI', color='lightblue',
                      legend=False, figsize=(15, 10))
plt.title('HDI of Top 25 Countries', fontsize=20)
plt.xlabel('Human Development Index')
plt.ylabel('Country')
plt.show()


In [None]:
# Bottom 25 countries by HDI (sorted lowest first)
hdi=hdi_df.sort_values(["HDI"],ascending=True)
hdi.head(25)

In [None]:
# Horizontal bar chart of the 25 countries with the lowest HDI.
# Named for what it holds, so the top-25 frame is no longer overwritten with
# bottom-25 data.
bottom_25_hdi_countries = hdi_df.nsmallest(25, 'HDI')

# DataFrame.plot opens its own figure, so the size is passed to it directly.
# A separate plt.figure(figsize=...) call would only leave an empty figure
# behind while the chart itself used the default size.
bottom_25_hdi_countries.plot(kind='barh', x='Country', y='HDI', color='lightblue',
                             legend=False, figsize=(15, 10))
plt.title('HDI of Bottom 25 Countries', fontsize=20)
plt.xlabel('Human Development Index')
plt.ylabel('Country')
plt.show()


## Happiness Analysis

In [None]:
#Happy Places
#Happiest places (Top 10)
# Rank by score explicitly instead of trusting the CSV's row order. The stable
# sort keeps the file order among equal scores.
ranked_by_happiness = data_df.sort_values("HappinessScore", ascending=False, kind="mergesort")
most_happiest_df = ranked_by_happiness.head(10)[["Country","HappinessScore"]]
most_happiest_df

In [None]:
# Least happy places (Bottom 10), listed from higher to lower score.
# Rank by score explicitly instead of trusting the CSV's row order. The stable
# sort keeps the file order among equal scores.
ranked_by_happiness = data_df.sort_values("HappinessScore", ascending=False, kind="mergesort")
least_happiest_df = ranked_by_happiness.tail(10)[["Country","HappinessScore"]]
least_happiest_df

In [None]:
# Mean happiness score per region (the result is a Series indexed by Region)
happy_region_df = data_df.groupby(["Region"])["HappinessScore"].mean()
happy_region_df 

In [None]:
# Mean happiness score per hemisphere (the result is a Series indexed by Hemisphere)
happy_hemisphere_df = data_df.groupby(["Hemisphere"])["HappinessScore"].mean()
happy_hemisphere_df 

In [None]:
# Bar chart of the happiness score for the ten countries in most_happiest_df.
country = most_happiest_df["Country"]
happy = most_happiest_df["HappinessScore"]

plt.bar(country, happy, color="r", alpha=0.5, align="center")
# One tick per country, labelled with the country name and rotated to fit.
tick_locations = list(country)
plt.xticks(tick_locations, country, rotation=90)

plt.title("Top 10 Happy Countries")
plt.xlabel("Country")
plt.ylabel("Happiness Score")

In [None]:
# Bar chart of the happiness score for the ten countries in least_happiest_df.
country = least_happiest_df["Country"]
happy = least_happiest_df["HappinessScore"]

plt.bar(country, happy, color="r", alpha=0.5, align="center")
# One tick per country, labelled with the country name and rotated to fit.
tick_locations = list(country)
plt.xticks(tick_locations, country, rotation=90)

plt.title("Bottom 10 Least Happy Countries")
plt.xlabel("Country")
plt.ylabel("Happiness Score")

In [None]:
# Happiness level per Region: bar chart of the regional mean scores.
bar_plot = happy_region_df.plot.bar(width=0.7, zorder=3)

# Set labels for axes (the y label previously read "# Average Happines Score")
bar_plot.set_xlabel("Region")
bar_plot.set_ylabel("Average Happiness Score")
bar_plot.set_title("Happiness level per Region")

# Create a grid - using 'zorder' places grid behind the bars since we set their z-order higher
bar_plot.grid(zorder=0)

In [None]:
# Happiness level per Hemisphere: bar chart of the hemisphere mean scores.
bar_plot = happy_hemisphere_df.plot.bar(width=0.7, zorder=3)

# Set labels for axes. The x axis shows hemispheres, not regions.
bar_plot.set_xlabel("Hemisphere")
bar_plot.set_ylabel("Average Happiness Score")
bar_plot.set_title("Happiness level per Hemisphere")

# Create a grid - using 'zorder' places grid behind the bars since we set their z-order higher
bar_plot.grid(zorder=0)

## Alcohol Consumption and Happiness

In [None]:
# Independent copy with just the happiness score and the three per-capita
# alcohol measures for each country; the cell displays its row count.
alcohol_happiness_columns = [
    "Country",
    "HappinessScore",
    "Beer_PerCapita",
    "Spirit_PerCapita",
    "Wine_PerCapita",
]
OH_happiness = merged_data_complete[alcohol_happiness_columns].copy()
len(OH_happiness)

In [None]:
# function to create scatter and linear regression plots
def lin_regress(x_vals, y_vals, color, x_ann, y_ann):
    """Scatter ``y_vals`` against ``x_vals`` and overlay the least-squares line.

    Everything is drawn on the current matplotlib figure, so repeated calls
    layer several regressions on one chart. Each call also (re)writes
    ``output_data/Happiness by Beverage Consumption.png`` with the figure as
    it stands after that call.

    Parameters
    ----------
    x_vals : explanatory values; its ``name`` attribute, when set, labels the
        printed r-value.
    y_vals : response values (happiness scores in this notebook).
    color : matplotlib colour used for the points, the line and the equation.
    x_ann, y_ann : data coordinates at which the line equation is written.

    Returns
    -------
    The result object of ``scipy.stats.linregress`` (slope, intercept,
    rvalue, pvalue, stderr).
    """
    regression = linregress(x_vals, y_vals)
    slope, intercept, rval = regression.slope, regression.intercept, regression.rvalue

    # A Series built by arithmetic has no name; avoid printing "None".
    label = getattr(x_vals, "name", None)
    if label is None:
        label = "x"
    print(f"The {label} r-value is: {rval}")

    regress_vals = x_vals * slope + intercept
    line_eq = f"y = {round(slope,5)} x + {round(intercept,2)}"

    plt.scatter(x_vals, y_vals, color=color)
    # Pass the colour by keyword so it is never parsed as a format string.
    plt.plot(x_vals, regress_vals, color=color)
    plt.annotate(line_eq, (x_ann, y_ann), fontsize=15, color=color)
    plt.title("Happiness by Beverage Consumption")
    plt.xlabel("Beverage Consumption per Capita")
    plt.ylabel("Happiness")
    # Start the y axis at zero and leave one unit of headroom above the data.
    plt.ylim(0, y_vals.max()+1)
    plt.savefig("output_data/Happiness by Beverage Consumption.png")
    return regression



In [None]:

# Draw the beer, wine and spirit regressions on one shared figure. The line
# equations are annotated at x=150, stacked at y=3, 2 and 1.
plt.figure(figsize=(10, 5))

happiness_scores = OH_happiness["HappinessScore"]
beer_scat = lin_regress(OH_happiness["Beer_PerCapita"], happiness_scores, "goldenrod", 150, 3)
wine_scat = lin_regress(OH_happiness["Wine_PerCapita"], happiness_scores, "darkred", 150, 2)
spirit_scat = lin_regress(OH_happiness["Spirit_PerCapita"], happiness_scores, "slategray", 150, 1)

plt.show()

In [None]:
# Regress happiness on the combined beer + wine + spirit per-capita value.
plt.figure(figsize=(15, 5))

# The sum of three Series has no name, and lin_regress prints x_vals.name,
# so name the combined Series explicitly.
combined_per_capita = (
    OH_happiness["Beer_PerCapita"]
    + OH_happiness["Wine_PerCapita"]
    + OH_happiness["Spirit_PerCapita"]
).rename("Alcohol_PerCapita")

all_bevs_scat = lin_regress(
    combined_per_capita,
    OH_happiness["HappinessScore"], "green", 150, 0)