In [1]:
import pandas as pd

In [2]:
import os

In [3]:
os.chdir("D:\Courses\CE 7090 -Statistical and Econometric Methods in Civil Engineering II\HomeWork\HW-4")

In [4]:
df = pd.read_csv('HW4 Data.txt', delim_whitespace=True, header=None, na_values=-999)

In [5]:
# Basic inspection
print(df.shape)        # should be (999, 49)
print(df[[3,24,33]].head())  # peek at severity (X4), age (X25), speed limit (X34)

(999, 49)
   3   24  33
0   0  63  60
1   0   0  60
2   0  16  55
3   1  30  60
4   1  53  60


In [6]:
# Count missing values per column
missing_counts = df.isna().sum()
print(missing_counts[missing_counts > 0])

21    879
26     27
27     27
29     12
37    619
39    180
40    180
41    180
42    180
43    180
44    180
45    916
46    715
47    715
dtype: int64


In [7]:
# Drop columns with > 500 missing values (arbitrary threshold, e.g., X22, X46, X47, X48)
cols_to_drop = [col for col in df.columns if df[col].isna().sum() > 500]
df.drop(columns=cols_to_drop, inplace=True)

# Drop rows where age (X25) is missing or zero (assuming 0 means unknown age)
df = df[df[24] != 0]  # X25 (driver age) is at index 24

# Verify remaining shape after drops
print(df.shape)


(982, 44)


In [8]:
import pandas as pd
import numpy as np
from scipy.stats import norm
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

# -------------------------------
# Step 1: Data Preparation
# -------------------------------
# Load data as before, and create your new predictors.
# (For brevity, we reuse the alternative variables code from the previous example.)
# df = pd.read_csv('HW4 Data.txt', delim_whitespace=True, header=None, na_values=-999)

# Create outcome and some alternative predictors.
# Outcome: Injury severity (X4 is column 3, zero-indexed)
df['severity'] = df[3]
# Let’s assume we want to use these predictors (example alternatives):
df['adverse_weather'] = (df[19] != 1).astype(int)  # X20: adverse weather if not 1
df['dark'] = df[20].apply(lambda x: 1 if x in [4, 5, 6] else 0)  # X21: dark conditions
df['driver_age'] = df[24]   # X25: driver’s age
df['male'] = df[25]         # X26: gender (already coded 0/1)
df['ejected'] = df[26].apply(lambda x: 1 if x in [2, 3] else 0)  # X27: ejection status
df['no_restraint'] = (df[27] == 3).astype(int)  # X28: no restraint used

# IMPORTANT: For the segment-based model, we need a segment identifier.
# We'll use X3 (Link ID) as the segment identifier.
df['segment'] = df[2]

# Now, select the variables we need and drop missing values:
cols = ['segment', 'severity', 'adverse_weather', 'dark', 'driver_age', 
        'male', 'ejected', 'no_restraint']
df_model = df[cols].dropna()

# Define outcome and predictors:
y = df_model['severity']
X = df_model[['adverse_weather', 'dark', 'driver_age', 'male', 'ejected', 'no_restraint']]

# -------------------------------
# Step 2: Segment-Based EM Setup
# -------------------------------
# In a segment-based model, we treat all accidents on a given segment as a group.
# We'll compute the responsibilities (posterior probabilities) at the segment level.

# Get a list of unique segments and create a mapping: segment -> indices (rows)
segments = df_model['segment'].unique()
seg_groups = df_model.groupby('segment').indices

# For initialization, assign a random responsibility for each segment.
# For example, for a 2-class model:
np.random.seed(42)
K = 2
seg_resp = {}  # dictionary mapping segment -> array of responsibilities (length K)
for seg in segments:
    # Randomly initialize so they are not exactly 0.5
    r_val = np.random.uniform(0.4, 0.6)
    seg_resp[seg] = np.array([r_val, 1 - r_val])

# For individual observations, we later assign each observation the responsibility
# of its segment.
# Set up convergence parameters:
max_iter = 100
tol_ll = 1e-4
tol_params = 1e-4
prev_ll = -np.inf

# We'll store class-specific parameters from the weighted ordered probit models:
prev_class_params = [None] * K

# -------------------------------
# Step 3: EM Algorithm (Segment-Based)
# -------------------------------
# The idea is similar to the accident-based model but with responsibilities updated per segment.

for iteration in range(max_iter):
    # --- M-step: Fit class-specific ordered probit models using weights from segments ---
    # For each observation, assign its weight as the segment’s responsibility.
    # Create a weight array of shape (N, K) for each class.
    N = len(df_model)
    # For class c, weight for observation i is the responsibility of the segment that observation belongs to.
    weights_per_class = np.zeros((N, K))
    for seg, indices in seg_groups.items():
        for c in range(K):
            weights_per_class[indices, c] = seg_resp[seg][c]
    
    # Now, for each class, fit an ordered probit model using the corresponding weights.
    class_params = []
    class_thresholds = []
    for c in range(K):
        # Use the weight for class c for each observation
        weights = weights_per_class[:, c]
        model_c = OrderedModel(y, X, distr='probit')
        res_c = model_c.fit(method='bfgs', weights=weights, disp=False)
        n_thresholds = model_c.k_levels - 1
        params_c = res_c.params
        class_params.append(params_c[:-n_thresholds])
        class_thresholds.append(params_c[-n_thresholds:])
    
    # Update mixing proportions at the segment level:
    # For each segment, compute the segment likelihood under each class, then update responsibilities.
    # Also compute overall log-likelihood.
    pi = np.zeros(K)
    ll_total = 0
    for seg, indices in seg_groups.items():
        # For each segment, calculate the product of likelihoods for all observations in that segment,
        # separately for each class.
        seg_ll = np.zeros(K)
        for c in range(K):
            seg_ll_c = 0  # we'll work in log space to avoid underflow
            for i in indices:
                xb = np.dot(X.iloc[i], class_params[c])
                # Compute likelihood for observation i given its outcome.
                if y.iloc[i] == 0:
                    l_val = norm.cdf(class_thresholds[c][0] - xb)
                elif y.iloc[i] == 1:
                    l_val = norm.cdf(class_thresholds[c][1] - xb) - norm.cdf(class_thresholds[c][0] - xb)
                elif y.iloc[i] == 2:
                    l_val = norm.cdf(class_thresholds[c][2] - xb) - norm.cdf(class_thresholds[c][1] - xb)
                elif y.iloc[i] == 3:
                    l_val = 1 - norm.cdf(class_thresholds[c][-1] - xb)
                # Avoid zeros by taking max:
                l_val = max(l_val, 1e-8)
                seg_ll_c += np.log(l_val)
            seg_ll[c] = np.exp(seg_ll_c)  # Convert back from log-sum to product-like likelihood
        
        # Now update the responsibility for this segment.
        # Compute weighted likelihoods: pi_c (old mixing proportion) is not yet updated,
        # so for the first iteration we assume uniform mixing, or we can use current seg_resp as prior.
        # For updating, we use current mixing proportions computed over segments.
        # Here, we'll simply assume equal prior if iteration==0, otherwise average from previous seg_resp.
        if iteration == 0:
            prior = np.ones(K) / K
        else:
            # For simplicity, average the responsibilities over segments from last iteration
            prior = np.array([np.mean([seg_resp[s][c] for s in seg_groups]) for c in range(K)])
        weighted_ll = prior * seg_ll
        total_weighted = np.sum(weighted_ll)
        # Update responsibility for this segment:
        new_resp = weighted_ll / total_weighted
        seg_resp[seg] = new_resp
        # Add to overall log-likelihood (using weighted likelihood for the segment)
        ll_total += np.log(total_weighted)
    
    # Update overall mixing proportions from segments
    # For each class, take the average responsibility across segments weighted by number of accidents.
    counts = np.array([len(indices) for indices in seg_groups.values()])
    weighted_sum = np.zeros(K)
    total_obs = 0
    for seg, indices in seg_groups.items():
        weighted_sum += seg_resp[seg] * len(indices)
        total_obs += len(indices)
    pi = weighted_sum / total_obs
    
    # Convergence checks on log-likelihood and parameter changes:
    ll_change = np.abs(ll_total - prev_ll)
    param_change = 0
    if iteration > 0:
        for c in range(K):
            diff = np.abs(class_params[c] - prev_class_params[c])
            param_change = max(param_change, np.max(diff))
    
    print(f"Iteration {iteration}: Log-likelihood = {ll_total:.4f}, ll_change = {ll_change:.6f}, param_change = {param_change:.6f}")
    
    if ll_change < tol_ll and param_change < tol_params:
        print(f"Converged at iteration {iteration}")
        break
    
    prev_ll = ll_total
    prev_class_params = [cp.copy() for cp in class_params]

# -------------------------------
# Step 4: Displaying the Results
# -------------------------------
# Print final segment-based mixing proportions and class-specific parameters.
print("\nFinal Mixing Proportions (Segment-Based):", pi)
for c in range(K):
    print(f"\nClass {c+1} Coefficients:")
    print(class_params[c])
    print(f"Class {c+1} Thresholds:")
    print(class_thresholds[c])

# Optionally, assign each segment to a class (the one with highest posterior probability)
# and then propagate that to individual observations.
segment_assignments = {seg: np.argmax(resp) + 1 for seg, resp in seg_resp.items()}
df_model['predicted_class'] = df_model['segment'].map(segment_assignments)
print("\nDistribution of Observations by Predicted Segment Class:")
print(df_model['predicted_class'].value_counts())




Iteration 0: Log-likelihood = -1830.6116, ll_change = inf, param_change = 0.000000
Iteration 1: Log-likelihood = -1830.6116, ll_change = 0.000000, param_change = 0.000000
Converged at iteration 1

Final Mixing Proportions (Segment-Based): [0.5 0.5]

Class 1 Coefficients:
adverse_weather   -0.273485
dark              -0.047624
driver_age        -0.002171
male              -0.065744
ejected            2.884690
no_restraint       0.774311
dtype: float64
Class 1 Thresholds:
0/1    0.295718
1/2    0.320259
2/3   -0.380990
dtype: float64

Class 2 Coefficients:
adverse_weather   -0.273485
dark              -0.047624
driver_age        -0.002171
male              -0.065744
ejected            2.884690
no_restraint       0.774311
dtype: float64
Class 2 Thresholds:
0/1    0.295718
1/2    0.320259
2/3   -0.380990
dtype: float64

Distribution of Observations by Predicted Segment Class:
predicted_class
1    982
Name: count, dtype: int64
