In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
# Raw input table for the whole notebook. Later cells read its `country`,
# `region`, `average_hours` and `gdp_per_capita` columns.
DATA_PATH = "avg_hours_vs_gdp_unfiltered.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Spot-check: show every row of the raw table whose country is Canada.
is_canada = df["country"] == "Canada"
display(df.loc[is_canada])

In [None]:
# Per-region Pearson correlation between hours worked and GDP per capita.
# It is built in this cell, ahead of the merge below, because the notebook
# otherwise defines `correlation_data` only in a later cell; on a fresh kernel
# (Restart & Run All) the merge would fail with a NameError.
# Only the two analysed columns are handed to the lambda, so the result does
# not depend on whether groupby.apply passes the grouping column along.
correlation_data = (
    df.groupby("region")[["average_hours", "gdp_per_capita"]]
    .apply(lambda grp: stats.pearsonr(grp["average_hours"], grp["gdp_per_capita"])[0])
    .rename("correlation")
    .reset_index()
)

# Summary statistics for each region: mean/median of hours worked and
# mean/median/standard deviation of GDP per capita.
region_summary = df.groupby("region").agg(
    avg_hours_mean=("average_hours", "mean"),
    avg_hours_median=("average_hours", "median"),
    gdp_mean=("gdp_per_capita", "mean"),
    gdp_median=("gdp_per_capita", "median"),
    gdp_std=("gdp_per_capita", "std")
).reset_index()

# GDP per hour worked, taken as the ratio of the two regional means
# (not the mean of per-row ratios).
region_summary["gdp_per_hour"] = region_summary["gdp_mean"] / region_summary["avg_hours_mean"]

# Attach the per-region correlation so the summary table carries every metric.
region_summary = region_summary.merge(correlation_data, on="region")

display(region_summary)

In [None]:
# Pearson correlation coefficient between hours worked and GDP per capita,
# computed separately for each region.
# The two analysed columns are selected before `.apply`, so the lambda never
# receives the grouping column; applying over the whole frame triggers a
# deprecation warning in recent pandas releases. The resulting table is the
# same: one row per region with columns `region` and `correlation`.
correlation_data = (
    df.groupby("region")[["average_hours", "gdp_per_capita"]]
    .apply(lambda grp: stats.pearsonr(grp["average_hours"], grp["gdp_per_capita"])[0])
    .rename("correlation")
    .reset_index()
)

correlation_data

In [None]:
# Ascending orderings of the per-region tables, one per metric that gets
# charted. They are all built here and reused by the plotting cells below.
gdp_sorted, hours_sorted, gdp_per_hour_sorted = (
    region_summary.sort_values(by=metric)
    for metric in ("gdp_median", "avg_hours_median", "gdp_per_hour")
)
correlation_sorted = correlation_data.sort_values(by="correlation")

# Plot 1: median GDP per capita for each region (regions on the y-axis).
# NOTE(review): newer seaborn releases warn when `palette` is given without
# `hue` — confirm against the installed seaborn version.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=gdp_sorted, x="gdp_median", y="region", palette="coolwarm", ax=ax)
ax.set_xlabel("GDP per Capita (normalized)")
ax.set_ylabel("Region")
ax.set_title("GDP per Capita")
fig.savefig('GDP by Region.png', dpi=300, bbox_inches='tight')
plt.show()



In [None]:
# Plot 2: median annual hours worked for each region (regions on the y-axis),
# drawn in the ascending order stored in `hours_sorted`.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=hours_sorted, x="avg_hours_median", y="region", palette="coolwarm", ax=ax)
ax.set_xlabel("Average Hours Worked, Per Year, Per Person")
ax.set_ylabel("Region")
ax.set_title("Average Annual Hours Worked")
fig.savefig('Hours by Region.png', dpi=300, bbox_inches='tight')
plt.show()



In [None]:
# Plot 3: per-region correlation coefficient between hours worked and GDP per
# capita (regions on the y-axis), in the ascending order of `correlation_sorted`.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=correlation_sorted, x="correlation", y="region", palette="coolwarm", ax=ax)
ax.set_xlabel("Correlation Coefficient")
ax.set_ylabel("Region")
ax.set_title("Correlation Between Hours Worked and GDP per Capita by Region")
fig.savefig('Correlation between hours gdp.png', dpi=300, bbox_inches='tight')
plt.show()



In [None]:
# GDP per hour worked for each region (regions on the y-axis), drawn in the
# ascending order stored in `gdp_per_hour_sorted`.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=gdp_per_hour_sorted, x="gdp_per_hour", y="region", palette="coolwarm", ax=ax)

# Axis labels and title, then write the figure to disk before showing it.
ax.set_xlabel("GDP per Hour Worked (normalized)")
ax.set_ylabel("Region")
ax.set_title("GDP per Hour Worked by Region (Ordered)")
fig.savefig('GDP per hour.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Vertical bar chart of mean annual hours worked per region, in the row order
# of `region_summary` (no extra sorting is applied here).
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=region_summary, x="region", y="avg_hours_mean", palette="coolwarm", ax=ax)
# Tilt the region tick labels and right-align them.
plt.xticks(rotation=45, ha="right")
ax.set_xlabel("Region")
ax.set_ylabel("Average Hours Worked per Year")
ax.set_title("Average Hours Worked per Year by Region")
# Dashed horizontal grid lines only.
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.savefig('Average Hours by Region.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Row-level scatter of hours worked against GDP per capita, with seaborn's
# fitted regression line overlaid in red.
point_style = {"alpha": 0.5}
line_style = {"color": "red"}

fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df,
    x="average_hours",
    y="gdp_per_capita",
    scatter_kws=point_style,
    line_kws=line_style,
    ax=ax,
)

# Replace the default column-name axis labels with readable ones.
ax.set_xlabel("Average Hours Worked per Year")
ax.set_ylabel("GDP per Capita (WHR Score)")
ax.set_title("Relationship Between Hours Worked and GDP per Capita")
ax.grid(True)

# Save to disk first, then render inline.
fig.savefig('Scatter plot relationship between hours and gdp.png', dpi=300, bbox_inches='tight')
plt.show()