In [1]:
import pandas as pd
import numpy as np

In [2]:
with open('../data/co2_emissions.csv') as f:
    df = pd.read_csv(f)

In [3]:
# Define a function to calculate negative log-likelihood for a given segment
def negative_log_likelihood(data):
    if len(data) < 2:
        return -np.inf
    
    mean = np.mean(data)
    variance = np.var(data, ddof=1) if len(data) > 1 else 0.0001  # Avoid division by zero
    ll = -sum((data - mean) ** 2 / (2 * variance)) - len(data) * np.log(np.sqrt(2 * np.pi * variance))
    return -ll  # Return negative log-likelihood

In [4]:
# Define a function to find the most anomalous segment for a given country
def find_most_anomalous_segment(values, years):
    n = len(values)
    min_nll = np.inf
    best_q = None
    best_p = None
    
    # Consider all possible subsegments
    for q in range(n):
        for p in range(q + 1, n + 1):
            segment_values = values[q:p]
            nll = negative_log_likelihood(segment_values)
            
            if nll < min_nll and not np.isinf(nll):  # Skip segments with NLL = -inf
                min_nll = nll
                best_q = years[q]
                best_p = years[p - 1]
    
    return best_q, best_p, min_nll

In [5]:
# Group by country_code and find the most anomalous segment for each country
anomalies = []

for country, group in df.groupby('country_code'):
    values = group['value'].values
    years = group['year'].values
    
    if len(values) > 1:
        best_q, best_p, min_nll = find_most_anomalous_segment(values, years)
        
        if best_q is not None and best_p is not None:
            anomalies.append((country, best_q, best_p, min_nll))
        else:
            anomalies.append((country, None, None, None))
    else:
        anomalies.append((country, None, None, None))

# Sort segments by NLL in ascending order (most negative first)
anomalies.sort(key=lambda x: x[3], reverse=False)

# Display the sorted results
for country, q, p, nll in anomalies:
    print(f"Country: {country}")
    if q is not None and p is not None:
        print(f"  Most anomalous segment from year {q} to {p}:")
        print(f"  Negative Log-Likelihood: {nll}")
    else:
        print("  No clear anomaly detected.")

  ll = -sum((data - mean) ** 2 / (2 * variance)) - len(data) * np.log(np.sqrt(2 * np.pi * variance))
  ll = -sum((data - mean) ** 2 / (2 * variance)) - len(data) * np.log(np.sqrt(2 * np.pi * variance))


Country: TUV
  Most anomalous segment from year 2002 to 2019:
  Negative Log-Likelihood: -585.8004939316281
Country: BTN
  Most anomalous segment from year 1970 to 1976:
  Negative Log-Likelihood: -232.62941608415275
Country: SYC
  Most anomalous segment from year 1963 to 1969:
  Negative Log-Likelihood: -227.77738582023312
Country: TON
  Most anomalous segment from year 1960 to 1966:
  Negative Log-Likelihood: -227.77738582023312
Country: COM
  Most anomalous segment from year 1970 to 1974:
  Negative Log-Likelihood: -159.202771790078
Country: WSM
  Most anomalous segment from year 1982 to 1986:
  Negative Log-Likelihood: -152.27129998447853
Country: KIR
  Most anomalous segment from year 2007 to 2009:
  Negative Log-Likelihood: -93.36873919717603
Country: BLZ
  Most anomalous segment from year 1982 to 1984:
  Negative Log-Likelihood: -89.20985611381636
Country: VGB
  Most anomalous segment from year 2011 to 2013:
  Negative Log-Likelihood: -89.20985611381636
Country: PLW
  Most anoma