In [1]:
import pandas as pd

In [2]:
import os

In [3]:
os.chdir("D:\Courses\CE 7090 -Statistical and Econometric Methods in Civil Engineering II\HomeWork\HW-4")

In [4]:
df = pd.read_csv('HW4 Data.txt', delim_whitespace=True, header=None, na_values=-999)

In [5]:
# Basic inspection
print(df.shape)        # should be (999, 49)
print(df[[3,24,33]].head())  # peek at severity (X4), age (X25), speed limit (X34)

(999, 49)
   3   24  33
0   0  63  60
1   0   0  60
2   0  16  55
3   1  30  60
4   1  53  60


In [6]:
# Count missing values per column
missing_counts = df.isna().sum()
print(missing_counts[missing_counts > 0])

21    879
26     27
27     27
29     12
37    619
39    180
40    180
41    180
42    180
43    180
44    180
45    916
46    715
47    715
dtype: int64


In [7]:
# Drop columns with > 500 missing values (arbitrary threshold, e.g., X22, X46, X47, X48)
cols_to_drop = [col for col in df.columns if df[col].isna().sum() > 500]
df.drop(columns=cols_to_drop, inplace=True)

# Drop rows where age (X25) is missing or zero (assuming 0 means unknown age)
df = df[df[24] != 0]  # X25 (driver age) is at index 24

# Verify remaining shape after drops
print(df.shape)


(982, 44)


In [8]:
import pandas as pd
import numpy as np
from scipy.stats import norm
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

# -------------------------------
# Step 1: Data Loading and Preprocessing
# -------------------------------

# Load the dataset. (Assumes 'HW4 Data.txt' is in the working directory.)
# We treat -999 as missing.


# For clarity, let’s define column indices (0-indexed) for the variables we need:
# X4 (injury severity) is column 3.
# X20 (weather conditions) is column 19.
# X21 (lighting condition) is column 20.
# X25 (driver age) is column 24.
# X26 (driver gender) is column 25.
# X27 (ejection status) is column 26.
# X28 (restraining system) is column 27.

# Create a new DataFrame with our variables of interest:
# Outcome: injury severity (0,1,2,3)
df['severity'] = df[3]

# Predictor 1: Adverse Weather (X20)
# Code: 1 means clear or partly cloudy. We define adverse weather as any condition not equal to 1.
df['adverse_weather'] = (df[19] != 1).astype(int)

# Predictor 2: Dark Conditions (X21)
# We assume lighting conditions 4, 5, 6 (dark scenarios) are "dark."
df['dark'] = df[20].apply(lambda x: 1 if x in [4, 5, 6] else 0)

# Predictor 3: Driver's Age (X25)
df['driver_age'] = df[24]

# Predictor 4: Driver's Gender (X26)
# Already coded: 0 = female, 1 = male.
df['male'] = df[25]

# Predictor 5: Ejection (X27)
# For simplicity, we define ejected = 1 if X27 is 2 (partially) or 3 (totally) ejected.
df['ejected'] = df[26].apply(lambda x: 1 if x in [2, 3] else 0)

# Predictor 6: No Restraint (X28)
# We define no_restraint = 1 if X28 equals 3 (no restraints used).
df['no_restraint'] = (df[27] == 3).astype(int)

# Drop any rows with missing values in these new variables or the outcome
df_model = df[['severity', 'adverse_weather', 'dark', 'driver_age', 'male', 'ejected', 'no_restraint']].dropna()

# Define the outcome and design matrix.
y = df_model['severity']
# We include a constant automatically via OrderedModel (or add it explicitly).
X = df_model[['adverse_weather', 'dark', 'driver_age', 'male', 'ejected', 'no_restraint']]

# -------------------------------
# Step 2: Fit a Single-Class Ordered Probit Model
# -------------------------------

# Fit the ordered probit model using the alternative predictors.
model_single = OrderedModel(y, X, distr='probit')
result_single = model_single.fit(method='bfgs')
print("Single-Class Ordered Probit Results:")
print(result_single.summary())

# -------------------------------
# Step 3: Outline of a Latent Class Ordered Probit Model via EM
# -------------------------------

# Here we present a skeleton for a latent class model with 2 classes.
# (A full implementation requires a careful likelihood calculation for ordinal data.)
max_iter = 100
tol_ll = 1e-4
tol_params = 1e-4

N = len(df_model)
K = 2  # We use 2 classes for this example

# Random initialization for responsibilities (avoid symmetric 0.5 exactly)
np.random.seed(42)
r = np.zeros((N, K))
for i in range(N):
    r[i, 0] = np.random.uniform(0.4, 0.6)
    r[i, 1] = 1 - r[i, 0]

prev_ll = -np.inf
prev_class_params = [None] * K

for iteration in range(max_iter):
    # --- M-step: Estimate class-specific parameters using weighted ordered probit ---
    class_params = []
    class_thresholds = []
    for c in range(K):
        weights = r[:, c]
        # Fit an ordered probit model with weights for class c
        model_c = OrderedModel(y, X, distr='probit')
        res_c = model_c.fit(method='bfgs', weights=weights, disp=False)
        n_thresholds = model_c.k_levels - 1
        params_c = res_c.params
        # Separate coefficients and thresholds (assuming thresholds are the last parameters)
        class_params.append(params_c[:-n_thresholds])
        class_thresholds.append(params_c[-n_thresholds:])
    
    # Update mixing proportions
    pi = r.mean(axis=0)
    
    # --- E-step: Update responsibilities and compute overall log-likelihood ---
    ll_total = 0
    for i in range(N):
        L = np.zeros(K)
        for c in range(K):
            xb = np.dot(X.iloc[i], class_params[c])
            # Compute likelihood for observation i in class c based on the ordinal outcome
            # We assume 4 ordered outcomes (0, 1, 2, 3):
            if y.iloc[i] == 0:
                L[c] = norm.cdf(class_thresholds[c][0] - xb)
            elif y.iloc[i] == 1:
                L[c] = norm.cdf(class_thresholds[c][1] - xb) - norm.cdf(class_thresholds[c][0] - xb)
            elif y.iloc[i] == 2:
                L[c] = norm.cdf(class_thresholds[c][2] - xb) - norm.cdf(class_thresholds[c][1] - xb)
            elif y.iloc[i] == 3:
                L[c] = 1 - norm.cdf(class_thresholds[c][-1] - xb)
            # Avoid zero likelihoods:
            L[c] = max(L[c], 1e-8)
        # Total likelihood for observation i across classes:
        total_i = np.sum([pi[c] * L[c] for c in range(K)])
        ll_total += np.log(total_i)
        # Update responsibilities:
        for c in range(K):
            r[i, c] = (pi[c] * L[c]) / total_i

    # Convergence checks on log-likelihood and parameter change
    ll_change = np.abs(ll_total - prev_ll)
    param_change = 0
    if iteration > 0:
        for c in range(K):
            diff = np.abs(class_params[c] - prev_class_params[c])
            param_change = max(param_change, np.max(diff))
    
    print(f"Iteration {iteration}: Log-likelihood = {ll_total:.4f}, ll_change = {ll_change:.6f}, param_change = {param_change:.6f}")
    
    if ll_change < tol_ll and param_change < tol_params:
        print(f"Converged at iteration {iteration}")
        break
    
    prev_ll = ll_total
    prev_class_params = [cp.copy() for cp in class_params]

print("\nFinal Mixing Proportions:", pi)
for c in range(K):
    print(f"\nClass {c+1} Coefficients:")
    print(class_params[c])
    print(f"Class {c+1} Thresholds:")
    print(class_thresholds[c])
    
# Optionally, assign each observation to the class with the highest posterior probability:
df_model['predicted_class'] = np.argmax(r, axis=1) + 1  # Classes as 1-indexed
print("\nDistribution of Observations by Predicted Class:")
print(df_model['predicted_class'].value_counts())


Optimization terminated successfully.
         Current function value: 0.684820
         Iterations: 40
         Function evaluations: 44
         Gradient evaluations: 44
Single-Class Ordered Probit Results:
                             OrderedModel Results                             
Dep. Variable:               severity   Log-Likelihood:                -672.49
Model:                   OrderedModel   AIC:                             1363.
Method:            Maximum Likelihood   BIC:                             1407.
Date:                Sun, 16 Mar 2025                                         
Time:                        20:52:35                                         
No. Observations:                 982                                         
Df Residuals:                     973                                         
Df Model:                           6                                         
                      coef    std err          z      P>|z|      [0.025      0.9



Iteration 0: Log-likelihood = -1830.6116, ll_change = inf, param_change = 0.000000
Iteration 1: Log-likelihood = -1830.6116, ll_change = 0.000000, param_change = 0.000000
Converged at iteration 1

Final Mixing Proportions: [0.49804541 0.50195459]

Class 1 Coefficients:
adverse_weather   -0.273485
dark              -0.047624
driver_age        -0.002171
male              -0.065744
ejected            2.884690
no_restraint       0.774311
dtype: float64
Class 1 Thresholds:
0/1    0.295718
1/2    0.320259
2/3   -0.380990
dtype: float64

Class 2 Coefficients:
adverse_weather   -0.273485
dark              -0.047624
driver_age        -0.002171
male              -0.065744
ejected            2.884690
no_restraint       0.774311
dtype: float64
Class 2 Thresholds:
0/1    0.295718
1/2    0.320259
2/3   -0.380990
dtype: float64

Distribution of Observations by Predicted Class:
predicted_class
2    982
Name: count, dtype: int64
