In [3]:
import pandas as pd
import pandas as pd
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np

df = pd.read_csv(r'CLPM_data.csv')
# Aggregate scores by test (mean score for each test: BDI, LSAS, CFS)
df['Test'] = df['Question'].str.extract(r'([A-Za-z]+)')
df_mean_scores = df.groupby(['YearMonth', 'Participant_Number', 'Test'], as_index=False)['Score'].mean()

# Pivot the dataset to wide format for CLPM
df_wide = df_mean_scores.pivot_table(
    index='Participant_Number',
    columns=['YearMonth', 'Test'],
    values='Score'
)

# Ensure the columns are sorted chronologically by converting YearMonth to Period objects
sorted_columns = sorted(df_wide.columns, key=lambda x: (pd.Period(x[0], freq='M').year, pd.Period(x[0], freq='M').month))
df_wide = df_wide[sorted_columns]

# Display a preview of the wide-format dataset
df_wide.head()

# CLPM Analysis Setup
from statsmodels.tsa.vector_ar.var_model import VAR

# Drop rows with missing values (if any) to ensure VAR can run
clpm_data = df_wide.dropna()

# Fit a Vector Autoregression (VAR) model as a basic CLPM
model = VAR(clpm_data)
results = model.fit(maxlags=2)

# Display results summary
print(results.summary())

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Mon, 24, Feb, 2025
Time:                     16:52:41
--------------------------------------------------------------------
No. of Equations:         12.0000    BIC:                   -29.4056
Nobs:                     64.0000    HQIC:                  -35.5387
Log likelihood:           475.066    FPE:                1.15545e-17
AIC:                     -39.5253    Det(Omega_mle):     2.20919e-19
--------------------------------------------------------------------
Results for equation 2020-07_BDI
                     coefficient       std. error           t-stat            prob
----------------------------------------------------------------------------------
const                   0.267052         0.662013            0.403           0.687
L1.2020-07_BDI         -0.045390         0.185613           -0.245           0.807
L1.2020-07_CFS          0.224448         0.121

  self._init_dates(dates, freq)


In [4]:
# Plotting significant cross-lagged effects (p-value < 0.05)
significant_effects = []
for i, equation in enumerate(results.params.columns):
    for j, param in enumerate(results.params.index):
        if results.pvalues.iloc[j, i] < 0.05:
            significant_effects.append((param, equation, results.params.iloc[j, i]))

print("\nSignificant Cross-Lagged Effects:")
for effect in significant_effects:
    print(f"{effect[0]} -> {effect[1]}: Coefficient = {effect[2]:.4f}")



Significant Cross-Lagged Effects:
L2.2020-10_BDI -> 2020-07_BDI: Coefficient = 0.5980
const -> 2020-07_CFS: Coefficient = -2.7265
L1.2020-09_LSAS -> 2020-07_CFS: Coefficient = 0.2492
L1.2020-09_LSAS -> 2020-08_BDI: Coefficient = -0.2415
L1.2020-09_LSAS -> 2020-08_CFS: Coefficient = -0.3639
L2.2020-09_BDI -> 2020-08_CFS: Coefficient = 0.5610
L2.2020-10_CFS -> 2020-08_CFS: Coefficient = 0.5885
L2.2020-10_LSAS -> 2020-08_CFS: Coefficient = -0.3499
L1.2020-07_LSAS -> 2020-08_LSAS: Coefficient = 0.6412
L1.2020-09_LSAS -> 2020-08_LSAS: Coefficient = -0.5896
L1.2020-08_CFS -> 2020-09_BDI: Coefficient = -0.2956
L2.2020-07_LSAS -> 2020-09_BDI: Coefficient = 0.2990
L2.2020-10_BDI -> 2020-09_BDI: Coefficient = -0.6605
L2.2020-09_BDI -> 2020-09_CFS: Coefficient = 0.8563
L2.2020-09_LSAS -> 2020-09_CFS: Coefficient = -0.2649
L2.2020-07_LSAS -> 2020-09_LSAS: Coefficient = 0.7787


In [5]:
# Aggregate by 'YearMonth', 'Participant_Number', and 'Question', taking the mean score
df_aggregated = df.groupby(['YearMonth', 'Participant_Number', 'Question'], as_index=False)['Score'].mean()

# Aggregate scores by test (mean score for each test: BDI, LSAS, CFS)
df_aggregated['Test'] = df_aggregated['Question'].str.extract(r'([A-Za-z]+)')
df_mean_scores = df_aggregated.groupby(['YearMonth', 'Participant_Number', 'Test'], as_index=False)['Score'].mean()

# Pivot the dataset to wide format for CLPM
df_wide = df_mean_scores.pivot_table(
    index='Participant_Number',
    columns=['YearMonth', 'Test'],
    values='Score'
)

# Ensure the columns are sorted chronologically by converting YearMonth to Period objects
sorted_columns = sorted(df_wide.columns, key=lambda x: (pd.Period(x[0], freq='M').year, pd.Period(x[0], freq='M').month))
df_wide = df_wide[sorted_columns]

# Display a preview of the wide-format dataset
df_wide.head()

# CLPM Analysis Setup

# Drop rows with missing values (if any) to ensure VAR can run
clpm_data = df_wide.dropna()

# Compute Variance Inflation Factor (VIF) to check for multicollinearity
vif_data = pd.DataFrame()
vif_data['Variable'] = clpm_data.columns.to_list()
vif_data['VIF'] = [variance_inflation_factor(clpm_data.values, i) for i in range(clpm_data.shape[1])]

print("Variance Inflation Factor (VIF) for each variable:")
print(vif_data)

# Fit a Vector Autoregression (VAR) model as a basic CLPM
model = VAR(clpm_data)
results = model.fit(maxlags=2)

# Display results summary
print(results.summary())

# Plotting significant cross-lagged effects (p-value < 0.05)
significant_effects = []
for i, equation in enumerate(results.params.columns):
    for j, param in enumerate(results.params.index):
        if results.pvalues.iloc[j, i] < 0.05:
            significant_effects.append((param, equation, results.params.iloc[j, i]))

print("\nSignificant Cross-Lagged Effects:")
for effect in significant_effects:
    print(f"{effect[0]} -> {effect[1]}: Coefficient = {effect[2]:.4f}")


Variance Inflation Factor (VIF) for each variable:
           Variable         VIF
0    (2020-07, BDI)    3.305463
1    (2020-07, CFS)  118.842295
2   (2020-07, LSAS)   11.664251
3    (2020-08, BDI)    3.181852
4    (2020-08, CFS)   86.142934
5   (2020-08, LSAS)    5.635171
6    (2020-09, BDI)    2.784495
7    (2020-09, CFS)   80.162656
8   (2020-09, LSAS)    7.145792
9    (2020-10, BDI)    4.775923
10   (2020-10, CFS)  184.599035
11  (2020-10, LSAS)   11.804024
  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Mon, 24, Feb, 2025
Time:                     16:52:42
--------------------------------------------------------------------
No. of Equations:         12.0000    BIC:                   -29.4056
Nobs:                     64.0000    HQIC:                  -35.5387
Log likelihood:           475.066    FPE:                1.15545e-17
AIC:                     -39.5253    Det(Omega_mle):     2.20919e-19
-------------

  self._init_dates(dates, freq)
