In [None]:
import pandas as pd
import numpy as np

In [None]:
# Preview the sub-region labels that the rest of the notebook groups by.
# The path is passed straight to pandas (as clusterByRegion does for the same
# file) so the file is decoded with pandas' default encoding rather than the
# locale-dependent default of open(), and no file handle has to be managed.
regions = pd.read_csv('../data/continents2.csv')
print(regions['sub-region'].unique())

In [None]:
def clusterByRegion(Data, MeanAggregation=True):
    """
    Aggregate a country-level dataset to one row per (sub-region, year).

    Every row of ``Data`` is assigned a sub-region by left-joining its
    ``country_code`` against the ``alpha-3`` column of
    ``../data/continents2.csv``. The country identifier columns are then
    removed and the remaining columns are aggregated within each
    (sub-region, year) pair.

    Parameters:
        Data (pd.DataFrame): the dataset to be grouped. Must contain the
            columns ``country_code`` and ``year``; ``country_name`` is dropped
            when present. ``Data`` itself is not modified.
        MeanAggregation (bool): if True, takes the mean of Data's features;
            if False, takes the sum of the features.

    Returns:
        grouped (pd.DataFrame): a grouped dataframe with ``sub-region`` and
            ``year`` as regular columns followed by the aggregated features.

    Notes:
        Rows whose ``country_code`` has no match in the lookup table receive a
        missing ``sub-region`` and therefore do not appear in the result,
        because ``groupby`` discards missing group keys by default.
    """
    regions = pd.read_csv("../data/continents2.csv")[['alpha-3', 'sub-region']]

    merged = Data.merge(regions, how="left", left_on='country_code', right_on='alpha-3')

    # The identifiers are only needed for the join. errors='ignore' lets the
    # function accept datasets that carry no 'country_name' column.
    features = merged.drop(
        columns=['country_code', 'country_name', 'alpha-3'], errors='ignore'
    )

    aggregation = 'mean' if MeanAggregation else 'sum'
    grouped = features.groupby(['sub-region', 'year'], as_index=False).agg(aggregation)

    return grouped


In [None]:
# Load the country-level CO2 table and collapse it to one row per
# (sub-region, year); MeanAggregation=False selects the sum, not the mean.
data = pd.read_csv('../data/co2_emissions.csv')
df = clusterByRegion(data, MeanAggregation=False)
print(df)

For each `sub-region`: sort its rows by year, find the most likely change point with `compute_llr`, and estimate the p-value of that change point with a permutation test. Then keep only the regions whose change point is significant (p-value < 0.05) and sort them by LLR in descending order.

In [None]:
def compute_llr(x):
    """
    Find the single change point that best splits a 1D series into two segments.

    For every split position t (1 <= t < n) the series is divided into x[:t]
    and x[t:], each segment is fitted with its own mean, and the split is
    scored by how much that reduces the sum of squared errors (SSE) compared
    with fitting one mean to the whole series:

        score(t) = SSE(x) - (SSE(x[:t]) + SSE(x[t:]))

    The largest score is what this notebook reports as the LLR.

    Parameters:
        x (array-like): 1D numeric data (np.array, list, ...).

    Returns:
        best_llr (float): Maximum improvement in fit (LLR score).
        best_t (int): Index of the best change point, i.e. the first index of
            the second segment. Ties are resolved in favour of the earliest t.
        best_subset (np.array): The anomalous subset, i.e. x[:best_t].

        If x has fewer than two elements, (None, None, None) is returned.
        If no split scores above -inf (for example when x contains NaN, since
        every comparison with NaN is false), (-inf, None, None) is returned.
    """
    # Accept any array-like; an ndarray argument is passed through unchanged.
    x = np.asarray(x)

    n = len(x)
    if n < 2:
        return None, None, None

    # Baseline: error of describing the whole series with a single mean.
    sse_all = np.sum((x - np.mean(x)) ** 2)

    best_llr = -np.inf
    best_t = None
    best_subset = None

    for t in range(1, n):
        seg1 = x[:t]
        seg2 = x[t:]
        sse_split = (
            np.sum((seg1 - np.mean(seg1)) ** 2)
            + np.sum((seg2 - np.mean(seg2)) ** 2)
        )
        llr = sse_all - sse_split
        # Strict '>' keeps the earliest split among equally good ones.
        if llr > best_llr:
            best_llr = llr
            best_t = t
            best_subset = seg1

    return best_llr, best_t, best_subset

In [None]:
# Per-region change point analysis: for every sub-region, locate the most
# likely change point in its emissions series and estimate its significance
# with a permutation test.
results = {}

num_permutations = 1000  # size of the permutation null distribution
# Fixed seed so the p-values (and therefore the significance filter below)
# come out the same on every Restart & Run All.
rng = np.random.default_rng(42)

# Process the data on a per-region basis using 'sub-region'
for region, group in df.groupby('sub-region'):
    group_sorted = group.sort_values('year')  # chronological order
    emissions = group_sorted['value'].values
    if len(emissions) < 2:  # a split needs at least two points
        continue

    observed_llr, best_index, _ = compute_llr(emissions)
    if best_index is None:
        # compute_llr found no scorable split (e.g. NaN emissions), so there
        # is no change point to report for this region.
        continue

    # Years before the change point form the anomalous subset; the change
    # point itself is the first year of the second segment.
    region_years = group_sorted['year']
    anomalous_years = region_years.iloc[:best_index].tolist()
    best_year = region_years.iloc[best_index]

    # Null distribution: best LLR of the same values in random order.
    llr_permutations = np.array([
        compute_llr(rng.permutation(emissions))[0]
        for _ in range(num_permutations)
    ])

    # p-value: proportion of permuted LLRs >= the observed LLR.
    # NOTE: this proportion can be exactly 0; (count + 1) / (num_permutations + 1)
    # is the usual correction if a strictly positive p-value is required.
    p_value = np.mean(llr_permutations >= observed_llr)

    results[region] = {
        'observed_llr': observed_llr,  # Observed LLR score
        'best_change_point_index': best_index,  # Index of the most significant change point
        'best_year': best_year,  # Year of the most significant change point
        'anomalous_years': anomalous_years,  # Years corresponding to the anomalous subset
        'p_value': p_value,  # p-value from permutation testing
        'n_points': len(emissions)  # Number of data points in the region
    }

# Keep only regions whose change point is significant (p-value < 0.05) and put
# the largest observed LLR first. Sorting by p-value instead would prioritise
# the regions with the strongest statistical evidence.
filtered_sorted_results = sorted(
    [(region, res) for region, res in results.items() if res['p_value'] < 0.05],
    key=lambda item: item[1]['observed_llr'],
    reverse=True,
)

# Display the filtered and sorted results per region
for region, res in filtered_sorted_results:
    print(f"\nRegion: {region}")
    print(f"  Observed LLR: {res['observed_llr']}")
    print(f"  Best Change Point Index: {res['best_change_point_index']} (Year: {res['best_year']})")
    print(f"  Anomalous Years: {res['anomalous_years']}")
    print(f"  p-value from permutation testing: {res['p_value']}")
    print(f"  Number of Data Points: {res['n_points']}")

# Graphing Change Point Anomalies
## Emissions

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

def visualize_country_anomalies(df, results, filtered_sorted_results):
    """
    Plot the emissions series of each listed region with its change point marked.

    One figure is drawn per entry of ``filtered_sorted_results``: the region's
    emissions over time, a dashed vertical line at the change point year, and a
    shaded one-year-wide band around every anomalous year. The entry's summary
    statistics are printed below each figure.

    Parameters:
        df (pd.DataFrame): data with 'sub-region', 'year' and 'value' columns.
        results (dict): not used by this function; kept so that existing calls
            continue to work.
        filtered_sorted_results (iterable): (region, res) pairs, where res is a
            dict with the keys 'best_year', 'anomalous_years', 'observed_llr',
            'best_change_point_index', 'p_value' and 'n_points'.

    Returns:
        None
    """
    for region, res in filtered_sorted_results:
        group = df[df['sub-region'] == region].sort_values('year')

        fig, ax = plt.subplots(figsize=(10, 6))

        # Keep the artists so the legend is built from what is actually drawn;
        # hand-made proxy lines can drift from the plotted colours.
        (emissions_line,) = ax.plot(
            group['year'].values, group['value'].values,
            marker='o', label='CO₂ Emissions',
        )
        change_line = ax.axvline(
            x=res['best_year'], color='red', linestyle='--', label='Change Point'
        )

        # Highlight anomalous years
        for year in res['anomalous_years']:
            ax.axvspan(year - 0.5, year + 0.5, color='orange', alpha=0.3)

        # A single proxy patch stands in for all the shaded bands in the legend.
        anomaly_patch = Patch(facecolor='orange', edgecolor='orange', alpha=0.3, label='Anomalous Years')

        ax.set_title(f"CO₂ Emissions Change Point: {region}")
        ax.set_xlabel("Year")
        ax.set_ylabel("CO₂ Emissions (kt)")
        ax.legend(handles=[emissions_line, change_line, anomaly_patch])
        ax.grid(True)
        fig.tight_layout()
        plt.show()
        # Release the figure so looping over many regions does not pile up open figures.
        plt.close(fig)

        # Print metadata
        print(f"Region: {region}")
        print(f"  Observed LLR: {res['observed_llr']}")
        print(f"  Best Change Point Index: {res['best_change_point_index']} (Year: {res['best_year']})")
        print(f"  Anomalous Years: {res['anomalous_years']}")
        print(f"  p-value from permutation testing: {res['p_value']}")
        print(f"  Number of Data Points: {res['n_points']}")
        print("-" * 60)

# Plot the first five entries of filtered_sorted_results, which is ordered by
# observed LLR from largest to smallest.
visualize_country_anomalies(
    df=df,
    results=results,
    filtered_sorted_results=filtered_sorted_results[:5],
)



In [None]:
import matplotlib.pyplot as plt

def plot_emissions_by_region(df):
    """
    Draw one line per sub-region showing its total emissions for each year.

    Parameters:
        df (pd.DataFrame): data with 'sub-region', 'year' and 'value' columns.

    Returns:
        None
    """
    # Total 'value' for every (sub-region, year) pair, as a flat table
    totals = df.groupby(['sub-region', 'year'])['value'].sum().reset_index()

    fig, ax = plt.subplots(figsize=(14, 7))

    # One labelled line per sub-region
    for name in totals['sub-region'].unique():
        subset = totals.loc[totals['sub-region'] == name]
        ax.plot(subset['year'], subset['value'], label=name)

    ax.set_title("Emissions Over Time by Region")
    ax.set_xlabel("Year")
    ax.set_ylabel("Total Emissions")
    # Anchor the legend outside the axes so it does not cover the lines
    ax.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Overview figure: every sub-region's emissions on shared axes
plot_emissions_by_region(df=df)


## Global Change Point Analysis

In [None]:
# Aggregate global CO2 emissions (sum across all regions for each year)
global_emissions = df.groupby('year')['value'].sum().reset_index()

# Apply change point detection to global emissions
global_emissions_values = global_emissions['value'].values
observed_llr, best_index, _ = compute_llr(global_emissions_values)
if best_index is None:
    # Without this guard, years[None] below would silently return a reshaped
    # array instead of a year.
    raise ValueError(
        "compute_llr found no change point in the global series; "
        "at least two yearly totals are required."
    )

# The change point is the first year of the second segment; every year before
# it belongs to the anomalous subset.
years = global_emissions['year'].values
best_year = years[best_index]
anomalous_years = years[:best_index].tolist()

# Perform permutation testing to compute p-value
num_permutations = 1000
# Fixed seed so the reported p-value is the same on every Restart & Run All.
rng = np.random.default_rng(42)
llr_permutations = np.array([
    compute_llr(rng.permutation(global_emissions_values))[0]
    for _ in range(num_permutations)
])

# p-value: proportion of permuted LLRs >= the observed LLR.
# NOTE: this proportion can be exactly 0; (count + 1) / (num_permutations + 1)
# is the usual correction if a strictly positive p-value is required.
p_value = np.mean(llr_permutations >= observed_llr)

# Store global results
global_result = {
    'observed_llr': observed_llr,
    'best_change_point_index': best_index,
    'best_year': best_year,
    'anomalous_years': anomalous_years,
    'p_value': p_value,
    'n_points': len(global_emissions_values)
}

# Display the global results
print("Global Change Point Analysis:")
print(f"  Observed LLR: {global_result['observed_llr']}")
print(f"  Best Change Point Index: {global_result['best_change_point_index']} (Year: {global_result['best_year']})")
print(f"  Anomalous Years: {global_result['anomalous_years']}")
print(f"  p-value from permutation testing: {global_result['p_value']}")
print(f"  Number of Data Points: {global_result['n_points']}")

In [None]:
# Visualize global change point
fig, ax = plt.subplots(figsize=(12, 7))

# Emissions series, plus a dashed marker at the detected change point
ax.plot(years, global_emissions_values, marker='o', linewidth=2, color='blue', label='Global CO2 Emissions')
ax.axvline(x=best_year, color='red', linestyle='--', linewidth=2, label='Change Point')

# Shade a one-year-wide band around every year of the anomalous subset
for year in anomalous_years:
    ax.axvspan(year - 0.5, year + 0.5, color='orange', alpha=0.3)

# Proxy artist that represents all the shaded bands in the legend
anomaly_patch = Patch(facecolor='orange', edgecolor='orange', alpha=0.3, label='Anomalous Years')

ax.set_title("Global CO₂ Emissions Change Point", fontsize=16)
ax.set_xlabel("Year", fontsize=14)
ax.set_ylabel("CO₂ Emissions (kt)", fontsize=14)
ax.legend(
    handles=[
        plt.Line2D([0], [0], color='blue', marker='o', label='CO₂ Emissions'),
        plt.Line2D([0], [0], color='red', linestyle='--', label='Change Point'),
        anomaly_patch,
    ],
    fontsize=12,
)
ax.grid(True)
fig.tight_layout()
plt.show()

# Report whether the change point clears the 0.05 significance threshold
print(
    f"The global change point in {best_year} is statistically significant (p-value: {p_value:.4f})"
    if p_value < 0.05
    else f"The global change point in {best_year} is not statistically significant (p-value: {p_value:.4f})"
)