In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns

# Function to load and combine data
def load_data(file_paths, start_year=2021):
    """Read several CSV files and stack them into one DataFrame with a ``Year`` column.

    The files are assumed to be given in chronological order, one per year:
    the file at position ``i`` is tagged with ``start_year + i``.

    Args:
        file_paths: Iterable of CSV paths, ordered from the earliest year on.
        start_year: Year assigned to the first file (defaults to 2021, the
            value that used to be hard-coded).

    Returns:
        A single DataFrame with a fresh 0..n-1 index and an added ``Year`` column.

    Raises:
        ValueError: Propagated from ``pd.concat`` when ``file_paths`` is empty.
    """
    frames = []
    for offset, path in enumerate(file_paths):
        frame = pd.read_csv(path)
        # Tag every row with the year implied by the file's position.
        frame['Year'] = start_year + offset
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)

# Load only general payments data (one cleaned CSV per year, oldest first)
general_csv_paths = [
    'cleaned_dataset/general_2021.csv',
    'cleaned_dataset/general_2022.csv',
    'cleaned_dataset/general_2023.csv',
]
general_data = load_data(general_csv_paths)

  dfs = [pd.read_csv(path) for path in file_paths]
  dfs = [pd.read_csv(path) for path in file_paths]
  dfs = [pd.read_csv(path) for path in file_paths]


In [None]:
# Function to calculate percentage change
def calculate_percentage_change(current, previous):
    """Return the percentage change from ``previous`` to ``current``.

    The change is measured relative to the magnitude of ``previous`` so that
    the sign of the result always follows the direction of the move, even
    when ``previous`` is negative (as a gender gap can be).

    Args:
        current: The newer value.
        previous: The baseline value.

    Returns:
        The change in percent. When ``previous`` is zero the result is
        ``inf`` for a positive ``current``, ``-inf`` for a negative one and
        ``0`` when both are zero.
    """
    if previous == 0:
        if current > 0:
            return float('inf')
        if current < 0:
            return float('-inf')
        return 0
    # abs() keeps "went up" positive when the baseline itself is negative.
    return ((current - previous) / abs(previous)) * 100
    
def shorten_category_value(value):
    """Map a known long category description to its short label.

    Values without a registered short label are handed back unchanged.
    """
    long_to_short = dict([
        ("Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program",
         "Non_Consult_Faculty_Speaker"),
        ("Compensation for serving as faculty or as a speaker for a medical education program",
         "Med_Ed_Faculty_Speaker"),
        # Add more shortenings as needed
    ])
    try:
        return long_to_short[value]
    except KeyError:
        return value

def create_line_graph(data, category, category_value, company, output_folder):
    """Plot yearly male vs. female payment totals for one category value and save a PNG.

    The figure has three stacked axes: a title/legend strip, a strip of
    per-year statistics text, and the main line chart of the summed
    ``Total_Amount_of_Payment_USDollars`` per gender for 2021-2023.

    Args:
        data: DataFrame with ``Year``, ``Gender`` and
            ``Total_Amount_of_Payment_USDollars`` columns, already filtered
            to the rows that should be plotted.
        category: Name of the category column; used in the title.
        category_value: Value of that category; shortened with
            ``shorten_category_value`` for the title and the file name.
        company: Accepted for call compatibility; not used inside the function.
        output_folder: Directory (created if missing) that receives the PNG.
    """
    # Apply the seaborn style before the figure exists: the style only
    # affects axes created after it has been set.
    sns.set_style("whitegrid")

    fig, (ax_top, ax_stats, ax_main) = plt.subplots(3, 1, figsize=(15, 12),
                                                    gridspec_kw={'height_ratios': [0.2, 0.2, 4]},
                                                    sharex=True)

    # Set custom colors
    male_color = '#0ab1ff'
    female_color = '#ff6161'

    # Prepare data for plotting
    years = [2021, 2022, 2023]
    male_values, female_values = [], []
    male_counts, female_counts = [], []

    # Collect the summed amount and the row count per gender for each year
    for year in years:
        year_data = data[data['Year'] == year]
        male_rows = year_data[year_data['Gender'] == 'M']
        female_rows = year_data[year_data['Gender'] == 'F']

        male_values.append(male_rows['Total_Amount_of_Payment_USDollars'].sum())
        female_values.append(female_rows['Total_Amount_of_Payment_USDollars'].sum())
        male_counts.append(male_rows.shape[0])
        female_counts.append(female_rows.shape[0])

    # Set title and legend in the top subplot
    shortened_value = shorten_category_value(category_value)
    ax_top.set_title(f"{category} - {shortened_value}", pad=20, fontsize=14)
    ax_top.axis('off')
    ax_top.legend([plt.Line2D([0], [0], color=male_color, lw=2),
                   plt.Line2D([0], [0], color=female_color, lw=2)],
                  ['Male', 'Female'],
                  loc='upper right', frameon=True)

    # Add statistics in the middle subplot
    ax_stats.axis('off')
    for i, year in enumerate(years):
        total_count = male_counts[i] + female_counts[i]
        male_percentage = (male_counts[i] / total_count * 100) if total_count > 0 else 0
        female_percentage = (female_counts[i] / total_count * 100) if total_count > 0 else 0

        if i > 0:
            # Year-over-year change of the summed amounts
            male_change = calculate_percentage_change(male_values[i], male_values[i-1])
            female_change = calculate_percentage_change(female_values[i], female_values[i-1])

            stats_text = (f"{year}:\n"
                          f"M:{male_counts[i]}({male_percentage:.1f}%) ${male_change:.1f}%\n"
                          f"F:{female_counts[i]}({female_percentage:.1f}%) ${female_change:.1f}%\n")
        else:
            stats_text = (f"{year}:\n"
                          f"M:{male_counts[i]}({male_percentage:.1f}%)\n"
                          f"F:{female_counts[i]}({female_percentage:.1f}%)\n")

        # Gender gap (male minus female), absolute and as a share of the total
        count_gap = male_counts[i] - female_counts[i]
        amount_gap = male_values[i] - female_values[i]
        total_amount = male_values[i] + female_values[i]

        count_gap_percentage = (count_gap / total_count * 100) if total_count > 0 else 0
        amount_gap_percentage = (amount_gap / total_amount * 100) if total_amount > 0 else 0

        stats_text += f"Gender Gap: {count_gap} ({count_gap_percentage:.1f}%) ($: {amount_gap:.2f}, {amount_gap_percentage:.1f}%)"

        # x = 0, 0.5, 1 in axes coordinates: one text block per year
        ax_stats.text(i/2, 0.5, stats_text,
                      horizontalalignment='center', verticalalignment='center',
                      transform=ax_stats.transAxes, fontsize=10)

    # Plot lines in the main subplot
    ax_main.plot(years, male_values, marker='o', color=male_color, linewidth=2, markersize=6)
    ax_main.plot(years, female_values, marker='o', color=female_color, linewidth=2, markersize=6)

    # Customize the main plot
    ax_main.set_xlabel("Year", labelpad=10)
    ax_main.set_ylabel("Total Amount of Payment (USD)", labelpad=10)
    ax_main.set_xticks(years)
    ax_main.grid(True, axis='y', color='#E6E6E6', alpha=0.8)
    ax_main.grid(False, axis='x')
    ax_main.set_ylim(bottom=0)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # str() keeps non-string category values (e.g. NaN) from breaking .replace()
    file_stem = str(shortened_value).replace('/', '_')

    # Save and close this specific figure rather than the implicit "current" one
    fig.tight_layout()
    fig.savefig(os.path.join(output_folder, f"{file_stem}.png"),
                bbox_inches='tight', dpi=300)
    plt.close(fig)

# Categories to analyze
categories = [
    'Covered_Recipient_Type',
    'Covered_Recipient_Primary_Type_1',
    'Nature_of_Payment_or_Transfer_of_Value',
    'Form_of_Payment_or_Transfer_of_Value',
]

# Generate graphs for each company and category: one PNG per
# (company, category, category value) combination.
company_column = 'Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name'
for company in general_data[company_column].unique():
    company_data = general_data[general_data[company_column] == company]

    for category in categories:
        category_series = company_data[category]
        for category_value in category_series.unique():
            category_data = company_data[category_series == category_value]
            output_folder = os.path.join('graphs_general', company, category)
            create_line_graph(category_data, category, category_value, company, output_folder)

print("Graph generation complete for general payments data.")

In [23]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Function to calculate percentage change
def calculate_percentage_change(current, previous):
    """Return the percentage change from ``previous`` to ``current``.

    The change is measured relative to the magnitude of ``previous`` so that
    the sign of the result always follows the direction of the move, even
    when ``previous`` is negative (as a gender gap can be).

    Args:
        current: The newer value.
        previous: The baseline value.

    Returns:
        The change in percent. When ``previous`` is zero the result is
        ``inf`` for a positive ``current``, ``-inf`` for a negative one and
        ``0`` when both are zero.
    """
    if previous == 0:
        if current > 0:
            return float('inf')
        if current < 0:
            return float('-inf')
        return 0
    # abs() keeps "went up" positive when the baseline itself is negative.
    return ((current - previous) / abs(previous)) * 100

def shorten_category_value(value):
    """Map a known long category description to its short label.

    Values without a registered short label are handed back unchanged.
    """
    long_to_short = dict([
        ("Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program",
         "Non_Consult_Faculty_Speaker"),
        ("Compensation for serving as faculty or as a speaker for a medical education program",
         "Med_Ed_Faculty_Speaker"),
        # Add more shortenings as needed
    ])
    try:
        return long_to_short[value]
    except KeyError:
        return value

def create_line_graph(data, category, category_value, output_folder):
    """Plot yearly male vs. female payment totals for one category value and save a PNG.

    The figure has three stacked axes: a title/legend strip, a strip of
    per-year statistics text, and the main line chart of the summed
    ``Total_Amount_of_Payment_USDollars`` per gender for 2021-2023.

    Args:
        data: DataFrame with ``Year``, ``Gender`` and
            ``Total_Amount_of_Payment_USDollars`` columns, already filtered
            to the rows that should be plotted.
        category: Name of the category column; used in the title.
        category_value: Value of that category; shortened with
            ``shorten_category_value`` for the title and the file name.
        output_folder: Directory (created if missing) that receives the PNG.
    """
    # Apply the seaborn style before the figure exists: the style only
    # affects axes created after it has been set.
    sns.set_style("whitegrid")

    fig, (ax_top, ax_stats, ax_main) = plt.subplots(3, 1, figsize=(15, 12),
                                                    gridspec_kw={'height_ratios': [0.2, 0.2, 4]},
                                                    sharex=True)

    # Set custom colors
    male_color = '#0ab1ff'
    female_color = '#ff6161'

    # Prepare data for plotting
    years = [2021, 2022, 2023]
    male_values, female_values = [], []
    male_counts, female_counts = [], []

    # Collect the summed amount and the row count per gender for each year
    for year in years:
        year_data = data[data['Year'] == year]
        male_rows = year_data[year_data['Gender'] == 'M']
        female_rows = year_data[year_data['Gender'] == 'F']

        male_values.append(male_rows['Total_Amount_of_Payment_USDollars'].sum())
        female_values.append(female_rows['Total_Amount_of_Payment_USDollars'].sum())
        male_counts.append(male_rows.shape[0])
        female_counts.append(female_rows.shape[0])

    # Set title and legend in the top subplot
    shortened_value = shorten_category_value(category_value)
    ax_top.set_title(f"{category} - {shortened_value}", pad=20, fontsize=14)
    ax_top.axis('off')
    ax_top.legend([plt.Line2D([0], [0], color=male_color, lw=2),
                   plt.Line2D([0], [0], color=female_color, lw=2)],
                  ['Male', 'Female'],
                  loc='upper right', frameon=True)

    # Add statistics in the middle subplot
    ax_stats.axis('off')
    for i, year in enumerate(years):
        total_count = male_counts[i] + female_counts[i]
        male_percentage = (male_counts[i] / total_count * 100) if total_count > 0 else 0
        female_percentage = (female_counts[i] / total_count * 100) if total_count > 0 else 0

        if i > 0:
            # Year-over-year change of the summed amounts
            male_change = calculate_percentage_change(male_values[i], male_values[i-1])
            female_change = calculate_percentage_change(female_values[i], female_values[i-1])

            stats_text = (f"{year}:\n"
                          f"M:{male_counts[i]}({male_percentage:.1f}%) ${male_change:.1f}%\n"
                          f"F:{female_counts[i]}({female_percentage:.1f}%) ${female_change:.1f}%\n")
        else:
            stats_text = (f"{year}:\n"
                          f"M:{male_counts[i]}({male_percentage:.1f}%)\n"
                          f"F:{female_counts[i]}({female_percentage:.1f}%)\n")

        # Gender gap (male minus female), absolute and as a share of the total
        count_gap = male_counts[i] - female_counts[i]
        amount_gap = male_values[i] - female_values[i]
        total_amount = male_values[i] + female_values[i]

        count_gap_percentage = (count_gap / total_count * 100) if total_count > 0 else 0
        amount_gap_percentage = (amount_gap / total_amount * 100) if total_amount > 0 else 0

        stats_text += f"Gender Gap: {count_gap} ({count_gap_percentage:.1f}%) ($: {amount_gap:.2f}, {amount_gap_percentage:.1f}%)"

        # x = 0, 0.5, 1 in axes coordinates: one text block per year
        ax_stats.text(i/2, 0.5, stats_text,
                      horizontalalignment='center', verticalalignment='center',
                      transform=ax_stats.transAxes, fontsize=10)

    # Plot lines in the main subplot
    ax_main.plot(years, male_values, marker='o', color=male_color, linewidth=2, markersize=6)
    ax_main.plot(years, female_values, marker='o', color=female_color, linewidth=2, markersize=6)

    # Customize the main plot
    ax_main.set_xlabel("Year", labelpad=10)
    ax_main.set_ylabel("Total Amount of Payment (USD)", labelpad=10)
    ax_main.set_xticks(years)
    ax_main.grid(True, axis='y', color='#E6E6E6', alpha=0.8)
    ax_main.grid(False, axis='x')
    ax_main.set_ylim(bottom=0)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # str() keeps non-string category values (e.g. NaN) from breaking .replace()
    file_stem = str(shortened_value).replace('/', '_')

    # Save and close this specific figure rather than the implicit "current" one
    fig.tight_layout()
    fig.savefig(os.path.join(output_folder, f"{file_stem}.png"),
                bbox_inches='tight', dpi=300)
    plt.close(fig)

# Categories to analyze
categories = [
    'Covered_Recipient_Type',
    'Covered_Recipient_Primary_Type_1',
    'Nature_of_Payment_or_Transfer_of_Value',
    'Form_of_Payment_or_Transfer_of_Value',
]

# Generate graphs for each category: one PNG per (category, category value),
# aggregated over all companies.
for category in categories:
    category_series = general_data[category]
    for category_value in category_series.unique():
        category_data = general_data[category_series == category_value]
        output_folder = os.path.join('graphs_general_aggregated', category)
        create_line_graph(category_data, category, category_value, output_folder)

print("Graph generation complete for aggregated general payments data.")

Graph generation complete for aggregated general payments data.


In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns

# Function to load and combine data
def load_data(file_paths, start_year=2021):
    """Read several CSV files and stack them into one DataFrame with a ``Year`` column.

    The files are assumed to be given in chronological order, one per year:
    the file at position ``i`` is tagged with ``start_year + i``.

    Args:
        file_paths: Iterable of CSV paths, ordered from the earliest year on.
        start_year: Year assigned to the first file (defaults to 2021, the
            value that used to be hard-coded).

    Returns:
        A single DataFrame with a fresh 0..n-1 index and an added ``Year`` column.

    Raises:
        ValueError: Propagated from ``pd.concat`` when ``file_paths`` is empty.
    """
    frames = []
    for offset, path in enumerate(file_paths):
        frame = pd.read_csv(path)
        # Tag every row with the year implied by the file's position.
        frame['Year'] = start_year + offset
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)

# Load only research payments data (one cleaned CSV per year, oldest first)
research_csv_paths = [
    'cleaned_dataset/research_2021.csv',
    'cleaned_dataset/research_2022.csv',
    'cleaned_dataset/research_2023.csv',
]
research_data = load_data(research_csv_paths)

  dfs = [pd.read_csv(path) for path in file_paths]


In [17]:
# Convert category columns to strings (all three columns in one assignment)
category_columns = ['Covered_Recipient_Type', 'Covered_Recipient_Primary_Type_1', 'Form_of_Payment_or_Transfer_of_Value']
research_data[category_columns] = research_data[category_columns].astype(str)

# Function to calculate percentage change
def calculate_percentage_change(current, previous):
    """Return the percentage change from ``previous`` to ``current``.

    The change is measured relative to the magnitude of ``previous`` so that
    the sign of the result always follows the direction of the move, even
    when ``previous`` is negative (as a gender gap can be).

    Args:
        current: The newer value.
        previous: The baseline value.

    Returns:
        The change in percent. When ``previous`` is zero the result is
        ``inf`` for a positive ``current``, ``-inf`` for a negative one and
        ``0`` when both are zero.
    """
    if previous == 0:
        if current > 0:
            return float('inf')
        if current < 0:
            return float('-inf')
        return 0
    # abs() keeps "went up" positive when the baseline itself is negative.
    return ((current - previous) / abs(previous)) * 100

def create_line_graph(data, category, company, output_folder):
    """Plot yearly male vs. female payment totals for one company/category and save a PNG.

    The figure has three stacked axes: a title/legend strip, a strip of
    per-year statistics text, and the main line chart of the summed
    ``Total_Amount_of_Payment_USDollars`` per gender for 2021-2023.

    Args:
        data: DataFrame with ``Year``, ``Gender`` and
            ``Total_Amount_of_Payment_USDollars`` columns, already filtered
            to the rows that should be plotted.
        category: Label shown first in the title and used (with ``/``
            replaced by ``_``) as the PNG file name.
        company: Label shown second in the title.
        output_folder: Directory (created if missing) that receives the PNG.
    """
    # Apply the seaborn style before the figure exists: the style only
    # affects axes created after it has been set.
    sns.set_style("whitegrid")

    fig, (ax_top, ax_stats, ax_main) = plt.subplots(3, 1, figsize=(15, 12),
                                                    gridspec_kw={'height_ratios': [0.2, 0.2, 4]},
                                                    sharex=True)

    # Set custom colors
    male_color = '#0ab1ff'
    female_color = '#ff6161'

    # Prepare data for plotting
    years = [2021, 2022, 2023]
    male_values, female_values = [], []
    male_counts, female_counts = [], []

    # Collect the summed amount and the row count per gender for each year
    for year in years:
        year_data = data[data['Year'] == year]
        male_rows = year_data[year_data['Gender'] == 'M']
        female_rows = year_data[year_data['Gender'] == 'F']

        male_values.append(male_rows['Total_Amount_of_Payment_USDollars'].sum())
        female_values.append(female_rows['Total_Amount_of_Payment_USDollars'].sum())
        male_counts.append(male_rows.shape[0])
        female_counts.append(female_rows.shape[0])

    # Set title and legend in the top subplot
    ax_top.set_title(f"{category} - {company}", pad=20, fontsize=14)
    ax_top.axis('off')
    ax_top.legend([plt.Line2D([0], [0], color=male_color, lw=2),
                   plt.Line2D([0], [0], color=female_color, lw=2)],
                  ['Male', 'Female'],
                  loc='upper right', frameon=True)

    # Add statistics in the middle subplot
    ax_stats.axis('off')
    for i, year in enumerate(years):
        total_count = male_counts[i] + female_counts[i]
        male_percentage = (male_counts[i] / total_count * 100) if total_count > 0 else 0
        female_percentage = (female_counts[i] / total_count * 100) if total_count > 0 else 0

        if i > 0:
            # Year-over-year change of the summed amounts
            male_change = calculate_percentage_change(male_values[i], male_values[i-1])
            female_change = calculate_percentage_change(female_values[i], female_values[i-1])

            stats_text = (f"{year}:\n"
                          f"M:{male_counts[i]}({male_percentage:.1f}%) ${male_change:.1f}%\n"
                          f"F:{female_counts[i]}({female_percentage:.1f}%) ${female_change:.1f}%\n")
        else:
            stats_text = (f"{year}:\n"
                          f"M:{male_counts[i]}({male_percentage:.1f}%)\n"
                          f"F:{female_counts[i]}({female_percentage:.1f}%)\n")

        # Gender gap (male minus female), absolute and as a share of the total
        count_gap = male_counts[i] - female_counts[i]
        amount_gap = male_values[i] - female_values[i]
        total_amount = male_values[i] + female_values[i]

        count_gap_percentage = (count_gap / total_count * 100) if total_count > 0 else 0
        amount_gap_percentage = (amount_gap / total_amount * 100) if total_amount > 0 else 0

        stats_text += f"Gender Gap: {count_gap} ({count_gap_percentage:.1f}%) ($: {amount_gap:.2f}, {amount_gap_percentage:.1f}%)"

        # x = 0, 0.5, 1 in axes coordinates: one text block per year
        ax_stats.text(i/2, 0.5, stats_text,
                      horizontalalignment='center', verticalalignment='center',
                      transform=ax_stats.transAxes, fontsize=10)

    # Plot lines in the main subplot
    ax_main.plot(years, male_values, marker='o', color=male_color, linewidth=2, markersize=6)
    ax_main.plot(years, female_values, marker='o', color=female_color, linewidth=2, markersize=6)

    # Customize the main plot
    ax_main.set_xlabel("Year", labelpad=10)
    ax_main.set_ylabel("Total Amount of Payment (USD)", labelpad=10)
    ax_main.set_xticks(years)
    ax_main.grid(True, axis='y', color='#E6E6E6', alpha=0.8)
    ax_main.grid(False, axis='x')
    ax_main.set_ylim(bottom=0)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Convert category to string before using it in the filename
    category_str = str(category).replace('/', '_')

    # Save and close this specific figure rather than the implicit "current" one
    fig.tight_layout()
    fig.savefig(os.path.join(output_folder, f"{category_str}.png"),
                bbox_inches='tight', dpi=300)
    plt.close(fig)

# Categories to analyze for research data
categories = [
    'Covered_Recipient_Type',
    'Covered_Recipient_Primary_Type_1',
    'Form_of_Payment_or_Transfer_of_Value',
]

# Generate graphs for each company and category: one PNG per
# (company, category, category value) combination.
company_column = 'Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name'
for company in research_data[company_column].unique():
    company_data = research_data[research_data[company_column] == company]

    for category in categories:
        category_series = company_data[category]
        for category_value in category_series.unique():
            category_data = company_data[category_series == category_value]
            output_folder = os.path.join('graphs_research', company, category)
            create_line_graph(category_data, category_value, company, output_folder)

print("Graph generation complete for research payments data.")

Graph generation complete for research payments data.


In [26]:
# Function to calculate percentage change
def calculate_percentage_change(current, previous):
    """Return the percentage change from ``previous`` to ``current``.

    The change is measured relative to the magnitude of ``previous`` so that
    the sign of the result always follows the direction of the move, even
    when ``previous`` is negative (as a gender gap can be).

    Args:
        current: The newer value.
        previous: The baseline value.

    Returns:
        The change in percent. When ``previous`` is zero the result is
        ``inf`` for a positive ``current``, ``-inf`` for a negative one and
        ``0`` when both are zero.
    """
    if previous == 0:
        if current > 0:
            return float('inf')
        if current < 0:
            return float('-inf')
        return 0
    # abs() keeps "went up" positive when the baseline itself is negative.
    return ((current - previous) / abs(previous)) * 100

def create_line_graph(data, category, category_value, output_folder):
    """Plot yearly male vs. female payment totals for one category value and save a PNG.

    The figure has three stacked axes: a title/legend strip, a strip of
    per-year statistics text, and the main line chart of the summed
    ``Total_Amount_of_Payment_USDollars`` per gender for 2021-2023.

    Args:
        data: DataFrame with ``Year``, ``Gender`` and
            ``Total_Amount_of_Payment_USDollars`` columns, already filtered
            to the rows that should be plotted.
        category: Name of the category column; used in the title.
        category_value: Value of that category; shown in the title and used
            (with ``/`` replaced by ``_``) as the PNG file name.
        output_folder: Directory (created if missing) that receives the PNG.
    """
    # Apply the seaborn style before the figure exists: the style only
    # affects axes created after it has been set.
    sns.set_style("whitegrid")

    fig, (ax_top, ax_stats, ax_main) = plt.subplots(3, 1, figsize=(15, 12),
                                                    gridspec_kw={'height_ratios': [0.2, 0.2, 4]},
                                                    sharex=True)

    # Set custom colors
    male_color = '#0ab1ff'
    female_color = '#ff6161'

    # Prepare data for plotting
    years = [2021, 2022, 2023]
    male_values, female_values = [], []
    male_counts, female_counts = [], []

    # Collect the summed amount and the row count per gender for each year
    for year in years:
        year_data = data[data['Year'] == year]
        male_rows = year_data[year_data['Gender'] == 'M']
        female_rows = year_data[year_data['Gender'] == 'F']

        male_values.append(male_rows['Total_Amount_of_Payment_USDollars'].sum())
        female_values.append(female_rows['Total_Amount_of_Payment_USDollars'].sum())
        male_counts.append(male_rows.shape[0])
        female_counts.append(female_rows.shape[0])

    # Set title and legend in the top subplot
    ax_top.set_title(f"{category} - {category_value}", pad=20, fontsize=14)
    ax_top.axis('off')
    ax_top.legend([plt.Line2D([0], [0], color=male_color, lw=2),
                   plt.Line2D([0], [0], color=female_color, lw=2)],
                  ['Male', 'Female'],
                  loc='upper right', frameon=True)

    # Add statistics in the middle subplot
    ax_stats.axis('off')
    for i, year in enumerate(years):
        total_count = male_counts[i] + female_counts[i]
        male_percentage = (male_counts[i] / total_count * 100) if total_count > 0 else 0
        female_percentage = (female_counts[i] / total_count * 100) if total_count > 0 else 0

        if i > 0:
            # Year-over-year change of the summed amounts
            male_change = calculate_percentage_change(male_values[i], male_values[i-1])
            female_change = calculate_percentage_change(female_values[i], female_values[i-1])

            stats_text = (f"{year}:\n"
                          f"M:{male_counts[i]}({male_percentage:.1f}%) ${male_change:.1f}%\n"
                          f"F:{female_counts[i]}({female_percentage:.1f}%) ${female_change:.1f}%\n")
        else:
            stats_text = (f"{year}:\n"
                          f"M:{male_counts[i]}({male_percentage:.1f}%)\n"
                          f"F:{female_counts[i]}({female_percentage:.1f}%)\n")

        # Gender gap (male minus female), absolute and as a share of the total
        count_gap = male_counts[i] - female_counts[i]
        amount_gap = male_values[i] - female_values[i]
        total_amount = male_values[i] + female_values[i]

        count_gap_percentage = (count_gap / total_count * 100) if total_count > 0 else 0
        amount_gap_percentage = (amount_gap / total_amount * 100) if total_amount > 0 else 0

        stats_text += f"Gender Gap: {count_gap} ({count_gap_percentage:.1f}%) ($: {amount_gap:.2f}, {amount_gap_percentage:.1f}%)"

        # x = 0, 0.5, 1 in axes coordinates: one text block per year
        ax_stats.text(i/2, 0.5, stats_text,
                      horizontalalignment='center', verticalalignment='center',
                      transform=ax_stats.transAxes, fontsize=10)

    # Plot lines in the main subplot
    ax_main.plot(years, male_values, marker='o', color=male_color, linewidth=2, markersize=6)
    ax_main.plot(years, female_values, marker='o', color=female_color, linewidth=2, markersize=6)

    # Customize the main plot
    ax_main.set_xlabel("Year", labelpad=10)
    ax_main.set_ylabel("Total Amount of Payment (USD)", labelpad=10)
    ax_main.set_xticks(years)
    ax_main.grid(True, axis='y', color='#E6E6E6', alpha=0.8)
    ax_main.grid(False, axis='x')
    ax_main.set_ylim(bottom=0)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Convert the category value to string before using it in the filename
    category_str = str(category_value).replace('/', '_')

    # Save and close this specific figure rather than the implicit "current" one
    fig.tight_layout()
    fig.savefig(os.path.join(output_folder, f"{category_str}.png"),
                bbox_inches='tight', dpi=300)
    plt.close(fig)

# Categories to analyze for research data
categories = [
    'Covered_Recipient_Type',
    'Covered_Recipient_Primary_Type_1',
    'Form_of_Payment_or_Transfer_of_Value',
]

# Generate graphs for each category value across all companies
for category in categories:
    category_series = research_data[category]
    for category_value in category_series.unique():
        category_data = research_data[category_series == category_value]
        output_folder = os.path.join('graphs_research_aggregated', category)
        create_line_graph(category_data, category, category_value, output_folder)

print("Graph generation complete for aggregated research payments data.")

Graph generation complete for aggregated research payments data.


In [19]:

def create_combined_gender_gap_graph(general_data, research_data, output_folder):
    """Plot the yearly male-minus-female payment gap for general vs. research payments.

    The figure has three stacked axes: a title/legend strip, a strip of
    per-year statistics text, and the main line chart with one line per
    dataset. The result is written to ``gender_gap_comparison.png`` inside
    ``output_folder``.

    Args:
        general_data: DataFrame of general payments with ``Year``, ``Gender``
            and ``Total_Amount_of_Payment_USDollars`` columns.
        research_data: DataFrame of research payments with the same columns.
        output_folder: Directory (created if missing) that receives the PNG.
    """
    # Apply the seaborn style before the figure exists: the style only
    # affects axes created after it has been set.
    sns.set_style("whitegrid")

    fig, (ax_top, ax_stats, ax_main) = plt.subplots(3, 1, figsize=(15, 12),
                                                    gridspec_kw={'height_ratios': [0.2, 0.3, 4]},
                                                    sharex=True)

    # Set custom colors
    general_color = '#FFD700'  # Yellow
    research_color = '#008000'  # Green

    # Prepare data for plotting
    years = [2021, 2022, 2023]
    general_gaps = []
    research_gaps = []
    general_counts = []
    research_counts = []

    amount_column = 'Total_Amount_of_Payment_USDollars'

    # Collect the gap (male total minus female total) and row count per year
    for year in years:
        general_year_data = general_data[general_data['Year'] == year]
        research_year_data = research_data[research_data['Year'] == year]

        general_gap = (general_year_data[general_year_data['Gender'] == 'M'][amount_column].sum()
                       - general_year_data[general_year_data['Gender'] == 'F'][amount_column].sum())
        research_gap = (research_year_data[research_year_data['Gender'] == 'M'][amount_column].sum()
                        - research_year_data[research_year_data['Gender'] == 'F'][amount_column].sum())

        general_gaps.append(general_gap)
        research_gaps.append(research_gap)

        # Counts are all rows of the year, regardless of gender
        general_counts.append(general_year_data.shape[0])
        research_counts.append(research_year_data.shape[0])

    # Set title and legend in the top subplot
    ax_top.set_title("Gender Gap: General vs Research Payments", pad=20, fontsize=14)
    ax_top.axis('off')
    ax_top.legend([plt.Line2D([0], [0], color=general_color, lw=2),
                   plt.Line2D([0], [0], color=research_color, lw=2)],
                  ['General', 'Research'],
                  loc='upper right', frameon=True)

    # Add statistics in the middle subplot
    ax_stats.axis('off')
    x_positions = [0.2, 0.5, 0.8]  # Left, center, right positions
    y_position = 0.5  # Vertical center

    for i, year in enumerate(years):
        if i > 0:
            general_amount_change = calculate_percentage_change(general_gaps[i], general_gaps[i-1])
            research_amount_change = calculate_percentage_change(research_gaps[i], research_gaps[i-1])
            general_count_change = calculate_percentage_change(general_counts[i], general_counts[i-1])
            research_count_change = calculate_percentage_change(research_counts[i], research_counts[i-1])

            stats_text = (f"{year}:\n"
                        f"General: Count: {general_counts[i]} ({general_count_change:.1f}%)\n"
                        f"Amount: ${general_gaps[i]:,.2f} ({general_amount_change:.1f}%)\n"
                        f"Research: Count: {research_counts[i]} ({research_count_change:.1f}%)\n"
                        f"Amount: ${research_gaps[i]:,.2f} ({research_amount_change:.1f}%)")
        else:
            stats_text = (f"{year}:\n"
                        f"General: Count: {general_counts[i]}\n"
                        f"Amount: ${general_gaps[i]:,.2f}\n"
                        f"Research: Count: {research_counts[i]}\n"
                        f"Amount: ${research_gaps[i]:,.2f}")

        ax_stats.text(x_positions[i], y_position, stats_text,
                    horizontalalignment='center',
                    verticalalignment='center',
                    transform=ax_stats.transAxes,
                    fontsize=9,
                    bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', pad=2))

    # Plot lines in the main subplot. This is done once, after the stats loop;
    # plotting inside the loop would stack identical copies of each line.
    ax_main.plot(years, general_gaps, marker='o', color=general_color, linewidth=2, markersize=6, label='General')
    ax_main.plot(years, research_gaps, marker='o', color=research_color, linewidth=2, markersize=6, label='Research')

    # Customize the main plot
    ax_main.set_xlabel("Year", labelpad=10)
    ax_main.set_ylabel("Gender Gap Amount (USD)", labelpad=10)
    ax_main.set_xticks(years)
    ax_main.grid(True, axis='y', color='#E6E6E6', alpha=0.8)
    ax_main.grid(False, axis='x')
    # A gap is negative when the female total exceeds the male total; only
    # pin the axis floor at zero when that would not hide any data point.
    if min(general_gaps + research_gaps) >= 0:
        ax_main.set_ylim(bottom=0)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Save and close this specific figure rather than the implicit "current" one
    fig.tight_layout()
    fig.savefig(os.path.join(output_folder, "gender_gap_comparison.png"),
                bbox_inches='tight', dpi=300)
    plt.close(fig)

# general_data and research_data must already be loaded by the earlier cells.

# Create the combined gender gap graph
output_folder = 'graphs_combined'
create_combined_gender_gap_graph(
    general_data=general_data,
    research_data=research_data,
    output_folder=output_folder,
)

print("Combined gender gap graph generation complete.")

Combined gender gap graph generation complete.


In [28]:
# Function to calculate records and gender counts
def calculate_records_and_gender(data, dataset_name):
    """Print and return the row count and the male/female row counts of a dataset.

    Args:
        data: DataFrame with a ``Gender`` column holding ``'M'`` / ``'F'``.
        dataset_name: Label printed in the summary header.

    Returns:
        Tuple ``(total_records, male_count, female_count)``.
    """
    gender = data['Gender']
    total_records = data.shape[0]
    male_count = int((gender == 'M').sum())
    female_count = int((gender == 'F').sum())

    summary_lines = [
        f"Dataset: {dataset_name}",
        f"Total Records: {total_records}",
        f"Total Males: {male_count}",
        f"Total Females: {female_count}",
        "-" * 40,
    ]
    print("\n".join(summary_lines))

    return total_records, male_count, female_count

# Calculate for each dataset
general_records, general_males, general_females = calculate_records_and_gender(
    general_data, "General Payments"
)
research_records, research_males, research_females = calculate_records_and_gender(
    research_data, "Research Payments"
)

# Calculate overall totals
total_records = general_records + research_records
total_males = general_males + research_males
total_females = general_females + research_females

# Display overall totals (one line per figure)
print(
    "Overall Totals:",
    f"Total Records: {total_records}",
    f"Total Males: {total_males}",
    f"Total Females: {total_females}",
    sep="\n",
)

Dataset: General Payments
Total Records: 2040194
Total Males: 1411686
Total Females: 628508
----------------------------------------
Dataset: Research Payments
Total Records: 10822
Total Males: 8836
Total Females: 1986
----------------------------------------
Overall Totals:
Total Records: 2051016
Total Males: 1420522
Total Females: 630494


In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def create_gender_gap_graph(data):
    """Build a figure of the yearly male-minus-female payment gap.

    The figure has a statistics strip on top and a line chart of the gap
    below. Years for which either gender has no aggregated row are skipped.

    Args:
        data: DataFrame with ``Year``, ``Gender`` and
            ``Total_Amount_of_Payment_USDollars`` columns.

    Returns:
        The matplotlib figure; the caller is responsible for saving and
        closing it.
    """
    # Set style
    plt.style.use('seaborn-v0_8-whitegrid')

    fig, (ax_stats, ax_main) = plt.subplots(2, 1, figsize=(12, 10),
                                          gridspec_kw={'height_ratios': [1, 4]})

    # Sum and non-null count of payment amounts by year and gender
    yearly_data = data.groupby(['Year', 'Gender'])['Total_Amount_of_Payment_USDollars'].agg([
        'sum', 'count'
    ]).reset_index()

    # One pass per year: record the gap and write the statistics text
    ax_stats.axis('off')
    gaps = []
    for year in sorted(yearly_data['Year'].unique()):
        year_data = yearly_data[yearly_data['Year'] == year]
        male_data = year_data[year_data['Gender'] == 'M']
        female_data = year_data[year_data['Gender'] == 'F']

        if male_data.empty or female_data.empty:
            continue

        male_amount = male_data['sum'].iloc[0]
        female_amount = female_data['sum'].iloc[0]
        male_count = male_data['count'].iloc[0]
        female_count = female_data['count'].iloc[0]

        # Guard the shares against a zero denominator
        total_count = male_count + female_count
        male_pct = (male_count / total_count * 100) if total_count > 0 else 0
        female_pct = (female_count / total_count * 100) if total_count > 0 else 0

        gap = male_amount - female_amount
        total_amount = male_amount + female_amount
        gap_pct = (gap / total_amount * 100) if total_amount != 0 else 0

        gaps.append({'Year': year, 'Gap': gap})

        stats_text = (f"{year}:\n"
                     f"M:{male_count}({male_pct:.1f}%)\n"
                     f"F:{female_count}({female_pct:.1f}%)\n"
                     f"Gap: {gap:,.0f} ({gap_pct:.1f}%)")

        # 2021 -> x=0, 2022 -> x=0.5, 2023 -> x=1 in axes coordinates
        ax_stats.text((year-2021)/2, 0.5, stats_text,
                     horizontalalignment='center',
                     verticalalignment='center',
                     transform=ax_stats.transAxes)

    # Explicit columns keep the lookups below valid when no year qualified
    gap_df = pd.DataFrame(gaps, columns=['Year', 'Gap'])

    # Plot gender gap line
    ax_main.plot(gap_df['Year'], gap_df['Gap'],
                marker='o', color='#0ab1ff', linewidth=2)

    # Customize main plot
    ax_main.set_title('Gender Payment Gap Over Time', pad=20)
    ax_main.set_xlabel('Year')
    ax_main.set_ylabel('Payment Gap Amount (Male - Female) in USD')
    ax_main.grid(True, alpha=0.3)
    ax_main.set_xticks([2021, 2022, 2023])

    fig.tight_layout()
    return fig

# Usage: build the figure for the research payments, write it to disk, release it
fig = create_gender_gap_graph(research_data)
fig.savefig('gender_payment_gap.png', bbox_inches='tight', dpi=300)
plt.close(fig)