In [None]:
#import dependencies 
from matplotlib import pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd
from scipy.stats import linregress
#import scipy.stats as st

In [None]:
# Load the happiness / alcohol-consumption dataset and preview the first rows
data_df = pd.read_csv("Data/HappinessAlcoholConsumption.csv")
data_df.head()

# Dataframe Information


In [None]:
# Drop every row that contains at least one missing value, then inspect the
# dtype pandas inferred for each column.
data_df = data_df.dropna()

data_df.dtypes

In [None]:
# Values of GDP_PerCapita that carry a fractional part are multiplied by 1000;
# whole-number values are left untouched.
# NOTE(review): presumably the fractional entries were recorded in thousands
# (e.g. 53.579 meaning 53,579) - confirm against the data source.
# A boolean mask replaces the row-by-row iterrows()/.loc loop: same result,
# one vectorised assignment.
needs_scaling = data_df["GDP_PerCapita"] % 1 > 0
data_df.loc[needs_scaling, "GDP_PerCapita"] = 1000 * data_df.loc[needs_scaling, "GDP_PerCapita"]

data_df.head()

In [None]:
# Shape of the DataFrame as (rows, columns)
data_df.shape

In [None]:
# Distinct values found in the Region column
data_df["Region"].unique()

In [None]:
# Distinct values found in the Country column
data_df["Country"].unique()

In [None]:
# Column labels of the happiness / alcohol DataFrame
data_df.columns

# Importing Population Data
Used to weight each country by its population when averaging GDP and alcohol consumption per capita by region.

In [None]:
# Load the population trends file and keep only the 2018 rows, restricted to
# the two columns needed for the merge.
pop_data = pd.read_csv("Data/Population Data/Global Population Trends(2016-2022).csv")

# .copy() makes the 2018 selection an independent frame. The "Total Population"
# column is overwritten in a following cell, and assigning into a selection of
# another frame can raise pandas' SettingWithCopyWarning.
pop_data_2018 = pop_data.loc[pop_data["Year"] == 2018, ["Country", "Total Population"]].copy()
pop_data_2018.head()

In [None]:
# Strip the thousands separators and convert the population to integers in a
# single assignment.
# - astype(str) first keeps the cell re-runnable: once the column is numeric the
#   .str accessor no longer exists, so the original two-step version failed on
#   a second execution.
# - regex=False treats "," as a literal on every pandas version.
# - int64 is requested explicitly because "int" can resolve to a 32-bit type on
#   some platforms, which is too small for summed regional populations.
pop_data_2018["Total Population"] = (
    pop_data_2018["Total Population"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype("int64")
)
pop_data_2018.dtypes

For both datasets, I looked to see which countries showed up in both datasets, but with different names, e.g. "Republic of Congo" in one set was called "Rep. Congo" in the other. I updated the naming to be consistent in both files to ensure the inner join below would guarantee the maximum number of countries upon joining.

In [None]:
# Inner join on Country: only countries present in both the 2018 population
# table and the happiness data survive.
merged_data = pop_data_2018.merge(data_df, how="inner", on="Country")
merged_data

In [None]:
# Column dtypes of the merged population + happiness frame
merged_data.dtypes

In [None]:
# Turn each per-capita figure into a country total (per-capita value times
# population) on a copy of the merged frame.
merged_data_complete = merged_data.copy()

total_from_per_capita = (
    ("GDP", "GDP_PerCapita"),
    ("Beer", "Beer_PerCapita"),
    ("Spirit", "Spirit_PerCapita"),
    ("Wine", "Wine_PerCapita"),
)
for total_col, per_capita_col in total_from_per_capita:
    merged_data_complete[total_col] = (
        merged_data_complete["Total Population"] * merged_data_complete[per_capita_col]
    )

merged_data_complete

In [None]:
# Sum population and country totals per region, then divide each regional total
# by the regional population to get population-weighted per-capita figures.
region_data = merged_data_complete.groupby(["Region"])
summed_columns = ["Total Population", "GDP", "Beer", "Spirit", "Wine"]
per_capita_data = region_data[summed_columns].sum()

ratio_from_total = (
    ("GDP_per", "GDP"),
    ("Beer_per", "Beer"),
    ("Spirit_per", "Spirit"),
    ("Wine_per", "Wine"),
)
for ratio_col, total_col in ratio_from_total:
    per_capita_data[ratio_col] = per_capita_data[total_col] / per_capita_data["Total Population"]

per_capita_data

In [None]:
# Move Region out of the index so it can be used as the x column.
per_capita_data_copy = per_capita_data.copy().reset_index()

# One stacked bar per region: beer, spirit and wine per-capita values.
beverage_columns = ["Region", "Beer_per", "Spirit_per", "Wine_per"]
per_capita_data_copy[beverage_columns].plot.bar(
    x="Region",
    stacked=True,
    title="Total Alcohol Consumption By Region",
    color=["goldenrod", "darkred", "slategray"],
)
plt.grid(axis="y")

plt.show()

# GDP Analysis
first by regional averages, then by country

In [None]:
# Columns needed for the GDP analysis, copied out of the merged frame
gdp_columns = ["Country", "Region", "GDP", "Hemisphere"]
gdp_df = merged_data_complete.loc[:, gdp_columns].copy()
gdp_df

In [None]:
# Mean of the country-level GDP values within each region
avg_gdp_by_region = gdp_df.groupby("Region")["GDP"].mean()

fig, ax = plt.subplots(figsize=(18, 12))
avg_gdp_by_region.plot(kind="bar", color="turquoise", ax=ax)
ax.set_title("Average GDP by Region", fontsize=20)
ax.set_xlabel("Region")
ax.set_ylabel("GDP")
plt.xticks(rotation=45, ha="right", fontsize=10)

# Write the figure to disk before displaying it
plt.savefig("output_data/GDP_HDI/Average_GDP_by_Region.png", bbox_inches="tight")

plt.show()

In [None]:
# Chart: GDP by region
gdp = gdp_df["GDP"]
region = gdp_df["Region"]

# NOTE(review): one bar is drawn per row, so all countries of a region land on
# the same x position and overlap; the visible bar height is the largest value
# in that region, not a sum or mean. Confirm this is the intended chart.
plt.bar(region, gdp, color="teal", alpha=0.5, align="center")
tick_locations = list(region)
plt.xticks(tick_locations, region, rotation=45)

plt.title("GDP by Region")
plt.xlabel("Region")
plt.ylabel("GDP")

plt.savefig("output_data/GDP_HDI/GDP_by_Region.png", bbox_inches="tight")

plt.show()

In [None]:
# Chart: GDP by hemisphere
gdp = gdp_df["GDP"]
region = gdp_df["Hemisphere"]  # holds hemisphere labels despite the name

# NOTE(review): one bar is drawn per row, so all countries of a hemisphere
# overlap on the same x position; the visible height is the largest value.
plt.bar(region, gdp, color="teal", alpha=0.5, align="center")
tick_locations = list(region)
plt.xticks(tick_locations, region, rotation=90)

plt.title("GDP by Hemisphere")
plt.xlabel("Hemisphere")
plt.ylabel("GDP")

plt.savefig("output_data/GDP_HDI/GDP_by_hemisphere.png", bbox_inches="tight")

plt.show()

In [None]:
# The 25 countries with the largest GDP
gdp = gdp_df.sort_values(by="GDP", ascending=False)
gdp.head(25)

In [None]:
# Chart: top 25 countries by GDP
top_25_countries = gdp_df.nlargest(25, "GDP")

# Create the axes explicitly and hand them to pandas. DataFrame.plot opens its
# own figure when no `ax` is given, so a preceding plt.figure(figsize=...) is
# left blank and the requested size never reaches the chart.
fig, ax = plt.subplots(figsize=(15, 10))
top_25_countries.plot(
    kind="barh", x="Country", y="GDP", color="paleturquoise", legend=False, ax=ax
)
ax.set_title("Top 25 Countries by GDP", fontsize=20)
ax.set_xlabel("GDP")
ax.set_ylabel("Country")

fig.savefig("output_data/GDP_HDI/Top_GDP.png", bbox_inches="tight")
plt.show()


In [None]:
# The 25 countries with the smallest GDP
gdp = gdp_df.sort_values(by="GDP", ascending=True)
gdp.head(25)

In [None]:
# Chart: bottom 25 countries by GDP
smallest_25_countries = gdp_df.nsmallest(25, "GDP")

# Create the axes explicitly and hand them to pandas. DataFrame.plot opens its
# own figure when no `ax` is given, so a preceding plt.figure(figsize=...) is
# left blank and the requested size never reaches the chart.
fig, ax = plt.subplots(figsize=(15, 10))
smallest_25_countries.plot(
    kind="barh", x="Country", y="GDP", color="teal", legend=False, ax=ax
)
ax.set_title("Bottom 25 Countries by GDP", fontsize=20)
ax.set_xlabel("GDP")
ax.set_ylabel("Country")

fig.savefig("output_data/GDP_HDI/Bottom_GDP.png", bbox_inches="tight")
plt.show()

# Human Development Index (HDI)
first by regional averages, then by country

In [None]:
# Columns needed for the HDI analysis, copied out of the happiness data
hdi_columns = ["Country", "Region", "HDI", "Hemisphere"]
hdi_df = data_df.loc[:, hdi_columns].copy()
hdi_df

In [None]:
# Mean HDI of the countries within each region
avg_hdi_by_region = hdi_df.groupby("Region")["HDI"].mean()

fig, ax = plt.subplots(figsize=(15, 6))
avg_hdi_by_region.plot(kind="bar", color="hotpink", ax=ax)
ax.set_title("Average HDI by Region", fontsize=20)
ax.set_xlabel("Region")
ax.set_ylabel("Human Development Index")
plt.xticks(rotation=45, ha="right")

plt.savefig("output_data/GDP_HDI/Average_HDI_by_Region.png", bbox_inches="tight")
plt.show()

In [None]:
# Chart: HDI by region
hdi = hdi_df["HDI"]
region = hdi_df["Region"]

# NOTE(review): one bar is drawn per row, so all countries of a region overlap
# on the same x position; the visible height is the largest HDI in the region.
plt.bar(region, hdi, color="hotpink", alpha=0.5, align="center")
tick_locations = list(region)
plt.xticks(tick_locations, region, rotation=45)

plt.title("HDI by Region")
plt.xlabel("Region")
plt.ylabel("Human Development Index")

plt.savefig("output_data/GDP_HDI/HDI_by_Region.png", bbox_inches="tight")
plt.show()

In [None]:
# Chart: HDI by hemisphere
hdi = hdi_df["HDI"]
region = hdi_df["Hemisphere"]  # holds hemisphere labels despite the name

# NOTE(review): one bar is drawn per row, so all countries of a hemisphere
# overlap on the same x position; the visible height is the largest HDI.
plt.bar(region, hdi, color="hotpink", alpha=0.5, align="center")
tick_locations = list(region)
plt.xticks(tick_locations, region, rotation=90)

plt.title("HDI by Hemisphere")
plt.xlabel("Hemisphere")
plt.ylabel("Human Development Index")

plt.savefig("output_data/GDP_HDI/HDI_by_Hemisphere.png", bbox_inches="tight")
plt.show()

In [None]:
# The 25 countries with the highest HDI
hdi = hdi_df.sort_values(by="HDI", ascending=False)
hdi.head(25)

In [None]:
# Chart: top 25 countries by HDI
top_25_countries = hdi_df.nlargest(25, "HDI")

# Create the axes explicitly and hand them to pandas. DataFrame.plot opens its
# own figure when no `ax` is given, so a preceding plt.figure(figsize=...) is
# left blank and the requested size never reaches the chart.
fig, ax = plt.subplots(figsize=(15, 10))
top_25_countries.plot(
    kind="barh", x="Country", y="HDI", color="pink", legend=False, ax=ax
)
ax.set_title("Top 25 Countries by HDI", fontsize=20)
ax.set_xlabel("Human Development Index")
ax.set_ylabel("Country")

fig.savefig("output_data/GDP_HDI/Top_HDI.png", bbox_inches="tight")
plt.show()

In [None]:
# The 25 countries with the lowest HDI
hdi = hdi_df.sort_values(by="HDI", ascending=True)
hdi.head(25)

In [None]:
# Chart: bottom 25 countries by HDI
bottom_25_countries = hdi_df.nsmallest(25, "HDI")

# Create the axes explicitly and hand them to pandas. DataFrame.plot opens its
# own figure when no `ax` is given, so a preceding plt.figure(figsize=...) is
# left blank and the requested size never reaches the chart.
fig, ax = plt.subplots(figsize=(15, 10))
bottom_25_countries.plot(
    kind="barh", x="Country", y="HDI", color="palevioletred", legend=False, ax=ax
)
ax.set_title("Bottom 25 Countries by HDI", fontsize=20)
ax.set_xlabel("Human Development Index")
ax.set_ylabel("Country")

fig.savefig("output_data/GDP_HDI/Bottom_HDI.png", bbox_inches="tight")
plt.show()

In [None]:
# GDP and HDI correlation
# Both columns are taken from the merged frame so that every GDP value is
# paired with the HDI of the same country. gdp_df is built from the merge
# result while hdi_df is built from data_df, so their row indexes refer to
# different countries; Series.corr aligns the two Series on index labels and
# would therefore correlate GDP and HDI of unrelated rows.
correlation = merged_data_complete["GDP"].corr(merged_data_complete["HDI"])

print(f"The correlation coefficient between GDP and HDI is: {correlation}")

## Happy Places

In [None]:
# Happy Places
# Happiest places (top 10)
# Rank by HappinessScore explicitly instead of relying on the row order of the
# CSV. mergesort is a stable sort, so an already ranked file gives the same
# ten rows in the same order as before.
happiness_ranked = data_df.sort_values("HappinessScore", ascending=False, kind="mergesort")
most_happiest_df = happiness_ranked.head(10)[["Country", "HappinessScore"]]
most_happiest_df

In [None]:
# Least happy places (bottom 10)
# Rank by HappinessScore explicitly instead of relying on the row order of the
# CSV; the last ten rows of the descending ranking are the lowest scores.
# mergesort is a stable sort, so an already ranked file gives the same rows in
# the same order as before.
happiness_ranked = data_df.sort_values("HappinessScore", ascending=False, kind="mergesort")
least_happiest_df = happiness_ranked.tail(10)[["Country", "HappinessScore"]]
least_happiest_df

In [None]:
# Mean happiness score per region
happy_region_df = data_df.groupby("Region")["HappinessScore"].mean()
happy_region_df

In [None]:
# Mean happiness score per hemisphere
happy_hemisphere_df = data_df.groupby("Hemisphere")["HappinessScore"].mean()
happy_hemisphere_df

In [None]:
# Happiness level visualization: the ten happiest countries
happy = most_happiest_df["HappinessScore"]
country = most_happiest_df["Country"]

plt.bar(country, happy, color="r", alpha=0.5, align="center")
tick_locations = list(country)
plt.xticks(tick_locations, country, rotation=90)

plt.title("Top 10 Happy Countries")
plt.xlabel("Country")
plt.ylabel("Happiness Score")

In [None]:
# Happiness level visualization: the ten least happy countries
happy = least_happiest_df["HappinessScore"]
country = least_happiest_df["Country"]

plt.bar(country, happy, color="r", alpha=0.5, align="center")
tick_locations = list(country)
plt.xticks(tick_locations, country, rotation=90)

plt.title("Bottom 10 Least Happy Countries")
plt.xlabel("Country")
plt.ylabel("Happiness Score")

In [None]:
# Average happiness score per region
bar_plot = happy_region_df.plot.bar(width=0.7, zorder=3)

# Axis labels and title (the y label previously read "# Average Happines Score")
bar_plot.set_xlabel("Region")
bar_plot.set_ylabel("Average Happiness Score")
bar_plot.set_title("Happiness level per Region")

# The grid gets a lower zorder than the bars so it is drawn behind them
bar_plot.grid(zorder=0)

In [None]:
# Average happiness score per hemisphere
bar_plot = happy_hemisphere_df.plot.bar(width=0.7, zorder=3)

# Axis labels and title. The x axis shows hemispheres, so it is labelled
# "Hemisphere" (it was "Region", copied from the regional chart); the y label
# typo "Happines" and its stray "#" are fixed as well.
bar_plot.set_xlabel("Hemisphere")
bar_plot.set_ylabel("Average Happiness Score")
bar_plot.set_title("Happiness level per Hemisphere")

# The grid gets a lower zorder than the bars so it is drawn behind them
bar_plot.grid(zorder=0)

## Alcohol Consumption and Happiness

In [None]:
# Happiness score plus the three per-capita beverage columns, per country
happiness_columns = ["Country", "HappinessScore", "Beer_PerCapita", "Spirit_PerCapita", "Wine_PerCapita"]
OH_happiness = merged_data_complete.loc[:, happiness_columns].copy()
len(OH_happiness)

In [None]:
# Function to create scatter and linear-regression plots
def lin_regress(x_vals, y_vals, color, x_ann, y_ann,
                save_path="output_data/Happiness by Beverage Consumption.png"):
    """Scatter ``y_vals`` against ``x_vals`` and overlay the least-squares line.

    Everything is drawn on the current matplotlib figure, so successive calls
    add their series to the same axes. The r-value is printed, the fitted
    equation is annotated on the plot and the current figure is saved.

    Parameters
    ----------
    x_vals : array-like supporting arithmetic (e.g. a pandas Series)
        Independent variable. Its ``name`` attribute, when set, is used in the
        printed message.
    y_vals : array-like with a ``max()`` method (e.g. a pandas Series)
        Dependent variable; also used to set the upper y-axis limit.
    color : matplotlib colour
        Colour of the points, the regression line and the annotation.
    x_ann, y_ann : float
        Data coordinates at which the line equation is written.
    save_path : str or None, optional
        File the current figure is saved to. Defaults to the path that was
        previously hard-coded; pass a different path to avoid overwriting an
        earlier figure, or ``None`` to skip saving.

    Returns
    -------
    None
    """
    slope, intercept, rval, _pval, _stderr = linregress(x_vals, y_vals)

    # A Series built by combining differently named Series has name None (and
    # plain arrays have no name at all); use a readable placeholder instead of
    # printing "None" or raising AttributeError.
    label = getattr(x_vals, "name", None) or "unnamed series"
    print(f"The {label} r-value is: {rval}")

    regress_vals = x_vals * slope + intercept
    line_eq = f"y = {round(slope,5)} x + {round(intercept,2)}"

    plt.scatter(x_vals, y_vals, c=color)
    # Pass the colour by keyword: a third positional argument is parsed as a
    # format string, which only works for strings that are valid colour names.
    plt.plot(x_vals, regress_vals, color=color)
    plt.annotate(line_eq, (x_ann, y_ann), fontsize=15, color=color)

    plt.title("Happiness by Beverage Consumption")
    plt.xlabel("Beverage Consumption per Capita")
    plt.ylabel("Happiness")
    # Start the y axis at zero and leave one unit of headroom above the maximum
    plt.ylim(0, y_vals.max() + 1)

    if save_path is not None:
        plt.savefig(save_path)



In [None]:

# One shared figure: each call adds a scatter and a fitted line for a beverage,
# with the line equations annotated at x=150 and staggered y positions.
happiness_scores = OH_happiness["HappinessScore"]

plt.figure(figsize=(10, 5))
beer_scat = lin_regress(OH_happiness["Beer_PerCapita"], happiness_scores, "goldenrod", 150, 3)
wine_scat = lin_regress(OH_happiness["Wine_PerCapita"], happiness_scores, "darkred", 150, 2)
spirit_scat = lin_regress(OH_happiness["Spirit_PerCapita"], happiness_scores, "slategray", 150, 1)

plt.show()

In [None]:
plt.figure(figsize=(15, 5))

# Regress happiness on the combined per-capita consumption of all three
# beverages.
# NOTE(review): lin_regress saves to the same file as the per-beverage figure,
# so this call overwrites that image.
beer = OH_happiness["Beer_PerCapita"]
wine = OH_happiness["Wine_PerCapita"]
spirit = OH_happiness["Spirit_PerCapita"]
total_consumption = beer + wine + spirit

all_bevs_scat = lin_regress(total_consumption, OH_happiness["HappinessScore"], "green", 150, 0)