In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Plot style: matplotlib 3.6 renamed the bundled seaborn styles to
# "seaborn-v0_8-*" and later releases dropped the old names, so prefer the
# new name and fall back to the legacy one on older installations.
style_name = "seaborn-v0_8-whitegrid"
if style_name not in plt.style.available:
    style_name = "seaborn-whitegrid"
plt.style.use(style_name)

# Paths (relative to the working directory; both keep their trailing slash
# because later sections build output paths by string concatenation)
base_path = "../Data/"
graph_path = "../Graph/"

# Ensure the Graph directory exists
os.makedirs(graph_path, exist_ok=True)

# File paths
industry_group_file = os.path.join(
    base_path,
    "Chart 1. Percent of remote workers by major industry group, ranked from largest to smallest in 2021.csv",
)
productivity_file = os.path.join(
    base_path,
    "Chart 3. Relationship between remote work and total factor productivity across 61 industries, 2019–21.csv",
)
output_changes_file = os.path.join(
    base_path,
    "Chart 5. Output and labor input percent changes in the ten industries with the largest gains in remote work, 2019–22.csv",
)

# Load datasets
# NOTE(review): the files are read as latin1; presumably they are not UTF-8.
# Confirm the encoding if non-ASCII characters in headers look garbled.
industry_group = pd.read_csv(industry_group_file, encoding='latin1')
productivity = pd.read_csv(productivity_file, encoding='latin1')
output_changes = pd.read_csv(output_changes_file, encoding='latin1')

# Question 1: Percentage of Remote Workers by Industry
# Grouped bar chart: one cluster per industry, one bar per year column.
industry_group.columns = ["Industry", "2019", "2021", "2022"]
plt.figure(figsize=(14, 7))
x = range(len(industry_group["Industry"]))
width = 0.25

# (year column, horizontal shift from the cluster centre, bar colour)
year_series = (
    ("2019", -width, 'blue'),
    ("2021", 0, 'green'),
    ("2022", width, 'red'),
)
for year, shift, colour in year_series:
    plt.bar([pos + shift for pos in x], industry_group[year], width,
            label=year, color=colour, alpha=0.7)

# Titles, axis labels and tick labels
plt.title("Percentage of Remote Workers by Industry (2019, 2021, 2022)")
plt.xlabel("Industry")
plt.ylabel("Percentage of Remote Workers")
plt.xticks(x, industry_group["Industry"], rotation=45, ha="right")
plt.legend(title="Year")

# Write the figure to disk and release it
plt.tight_layout()
plt.savefig(graph_path + "percentage_remote_workers_by_industry.png", bbox_inches='tight')
plt.close()

# Question 4: Growth in Remote Work vs. TFP Growth by Industry
# Scatter plot of each industry's remote-work increase against its excess
# TFP growth, coloured by excess TFP growth, with outlying points labelled.

# The plotting code addresses columns by short names. If the loaded frame
# does not carry them, rename positionally (the same approach Question 1
# uses) instead of failing later with an opaque lookup error.
# NOTE(review): the positional rename assumes the CSV column order is
# industry, remote-work increase, excess TFP growth -- confirm against the file.
expected_cols = ["Industry", "RemoteWorkIncrease", "ExcessTFPGrowth"]
if not set(expected_cols).issubset(productivity.columns):
    if len(productivity.columns) != len(expected_cols):
        raise KeyError(
            f"Expected columns {expected_cols} in the Chart 3 data, "
            f"found {list(productivity.columns)}"
        )
    productivity.columns = expected_cols

plt.figure(figsize=(12, 8), facecolor='white')
scatter = plt.scatter(productivity["RemoteWorkIncrease"],
                      productivity["ExcessTFPGrowth"],
                      c=productivity["ExcessTFPGrowth"],
                      cmap='viridis',
                      alpha=0.7)
plt.colorbar(scatter, label='Excess TFP Growth')
plt.title("Growth in Remote Work vs. TFP Growth by Industry")
plt.xlabel("Percentage Point Increase in Remote Work")
plt.ylabel("Excess TFP Growth")
plt.grid(True, linestyle='--', alpha=0.7)

# Annotate some interesting points: label only industries whose excess TFP
# growth exceeds 0.5 in magnitude or whose remote-work increase exceeds 5
# in magnitude, so the plot is not covered in text.
notable = (
    (productivity["ExcessTFPGrowth"].abs() > 0.5)
    | (productivity["RemoteWorkIncrease"].abs() > 5)
)
for name, remote_gain, tfp_growth in productivity.loc[notable, expected_cols].itertuples(
        index=False, name=None):
    plt.annotate(name,
                 (remote_gain, tfp_growth),
                 xytext=(5, 5),
                 textcoords='offset points',
                 fontsize=8)

plt.tight_layout()
plt.savefig(graph_path + "growth_remote_work_vs_tfp.png", bbox_inches='tight')
plt.close()

# Question 5: Output and Labor Input Changes
# Grouped bar chart with two bars per industry: output change and labor
# input change.

# The plotting code addresses columns by short names. If the loaded frame
# does not carry them, rename positionally (the same approach Question 1
# uses) instead of failing with a bare KeyError on the first lookup.
# NOTE(review): the positional rename assumes the CSV column order is
# industry, output change, labor input change -- confirm against the file.
expected_cols = ["Industry", "OutputChange", "LaborInputChange"]
if not set(expected_cols).issubset(output_changes.columns):
    if len(output_changes.columns) != len(expected_cols):
        raise KeyError(
            f"Expected columns {expected_cols} in the Chart 5 data, "
            f"found {list(output_changes.columns)}"
        )
    output_changes.columns = expected_cols

plt.figure(figsize=(14, 7))
x = range(len(output_changes["Industry"]))
width = 0.35

# Shift each bar half a bar-width either side of the tick so the pair is
# centred on its industry label.
plt.bar([i-width/2 for i in x], output_changes["OutputChange"], width,
        color="steelblue", alpha=0.7, label="Output Change")
plt.bar([i+width/2 for i in x], output_changes["LaborInputChange"], width,
        color="orange", alpha=0.7, label="Labor Input Change")

plt.title("Output and Labor Input Changes in Industries with Most Remote Work Growth (2019-22)")
plt.xticks(x, output_changes["Industry"], rotation=45, ha="right")
plt.ylabel("Percent Change")
plt.xlabel("Industry")
plt.legend(title="Metric")
plt.tight_layout()
plt.savefig(graph_path + "output_labor_changes.png", bbox_inches='tight')
plt.close()



ValueError: Could not interpret value `RemoteWorkIncrease` for parameter `x`

<Figure size 720x432 with 0 Axes>