In [2]:
%pip install dowhy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# imports
import networkx as nx 
import numpy as np
import pandas as pd
from dowhy import gcm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# loading the data 
depression = pd.read_csv("data/Student Depression Dataset.csv")
depression = depression.dropna()
depression = depression.replace({'Yes': 1, 'No': 0})

data_encoded = pd.get_dummies(depression, drop_first=True)
data_encoded = data_encoded[['Academic Pressure', 'Have you ever had suicidal thoughts ?',
 'Financial Stress', 'City_Ahmedabad', 'City_Bhopal', 'City_Faridabad',
 'City_Hyderabad', 'City_Meerut', 'City_Patna', 'Dietary Habits_Moderate',
 'Dietary Habits_Others', 'Dietary Habits_Unhealthy', 'Depression']]

depression_LASSO_features = data_encoded[['Academic Pressure', 'Have you ever had suicidal thoughts ?',
  'Financial Stress', 'City_Ahmedabad', 'City_Bhopal', 'City_Faridabad',
  'City_Hyderabad', 'City_Meerut', 'City_Patna', 'Dietary Habits_Moderate',
  'Dietary Habits_Others', 'Dietary Habits_Unhealthy', 'Depression']]

depression_LASSO_features = depression_LASSO_features.astype(int)

depression_LASSO_features.head()

  depression = depression.replace({'Yes': 1, 'No': 0})


Unnamed: 0,Academic Pressure,Have you ever had suicidal thoughts ?,Financial Stress,City_Ahmedabad,City_Bhopal,City_Faridabad,City_Hyderabad,City_Meerut,City_Patna,Dietary Habits_Moderate,Dietary Habits_Others,Dietary Habits_Unhealthy,Depression
0,5,1,1,0,0,0,0,0,0,0,0,0,1
1,2,0,2,0,0,0,0,0,0,1,0,0,0
2,3,0,1,0,0,0,0,0,0,0,0,0,0
3,3,1,5,0,0,0,0,0,0,1,0,0,1
4,4,1,1,0,0,0,0,0,0,1,0,0,0


In [5]:
depression_LASSO_features['Dietary Habits_Unhealthy'].sum()

10316

In [6]:
# X -> treatment (e.g., sleep duration, dietary habits)
# Z -> observation (e.g., depression)

# create "scoring" for dietary habits -> 0 for unhealthy, 
# 1 for moderate, 2 for other 

# add the ones = 1 from unhealthy,
# = 1 from moderate, and = 1 for others. before this, 
# encode them as above.
depression_LASSO_features['Dietary_Score'] = (
    depression_LASSO_features['Dietary Habits_Moderate'] * 1 +
    depression_LASSO_features['Dietary Habits_Others'] * 2 +
    depression_LASSO_features['Dietary Habits_Unhealthy'] * 0
)
X = depression_LASSO_features['Dietary_Score']

city_columns = ['City_Ahmedabad', 'City_Bhopal', 'City_Faridabad',
                'City_Hyderabad', 'City_Meerut', 'City_Patna']

depression_LASSO_features['City_Index'] = depression_LASSO_features[city_columns].idxmax(axis=1)
depression_LASSO_features['City_Index'] = depression_LASSO_features['City_Index'].apply(lambda x: 
                                                                                        city_columns.index(x) + 1)

Y = depression_LASSO_features['City_Index']
Z = depression_LASSO_features['Depression']

### Possible counterfactual questions:
1. What would happen to Depression (Z), if i intervened on someone's Financial Sress (X)?
2. What would happen to Depression (Z), if i intervened on someone's Academic Pressure (X)? e.g., if I reduce someone's academic pressure by 30%, would the likelihood of depression decrease? 
3. What would happen to Depression (Z), if i intervened on someone's Academix Pressure (X)? 


### X -> Academic Pressure, Z -> Depression

- intervene on Academic Pressure, decrease it by 30%. What do we observe about Depression?

In [14]:
depression_LASSO_features.head()

Unnamed: 0,Academic Pressure,Have you ever had suicidal thoughts ?,Financial Stress,City_Ahmedabad,City_Bhopal,City_Faridabad,City_Hyderabad,City_Meerut,City_Patna,Dietary Habits_Moderate,Dietary Habits_Others,Dietary Habits_Unhealthy,Depression,Dietary_Score,City_Index
0,5,1,1,0,0,0,0,0,0,0,0,0,1,0,1
1,2,0,2,0,0,0,0,0,0,1,0,0,0,1,1
2,3,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,3,1,5,0,0,0,0,0,0,1,0,0,1,1,1
4,4,1,1,0,0,0,0,0,0,1,0,0,0,1,1


In [35]:
# Set the treatment variable (X) to be Academic Pressure;
# this is what we are intervening on.
X = depression_LASSO_features['Academic Pressure']
Z = depression_LASSO_features['Depression']

In [37]:
# Construct the Graph, set up the model
causal_model = gcm.InvertibleStructuralCausalModel(nx.DiGraph([('X', 'Z')])) # X -> Z
causal_model.set_causal_mechanism('X', 
                                  gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Z', 
                                  gcm.AdditiveNoiseModel(
                                      gcm.ml.create_linear_regressor()))

# Training data for the model,
# X -> Academic Pressure, Z -> Depression
training_data = pd.DataFrame(data=dict(X=X, Z=Z))

# fit the model to the training data
gcm.fit(causal_model, training_data)

gcm.counterfactual_samples( # generate counterfactual samples
    causal_model,
    {'X': lambda x: 5}, # intervene on Academic Pressure
    observed_data=pd.DataFrame(data=dict(X=[1], Z=[2])))

Fitting causal mechanism of node Z: 100%|██████████| 2/2 [00:00<00:00, 41.99it/s]


Unnamed: 0,X,Z
0,5,2.677259


In [4]:
depression_LASSO_features.describe()

Unnamed: 0,Academic Pressure,Have you ever had suicidal thoughts ?,Financial Stress,City_Ahmedabad,City_Bhopal,City_Faridabad,City_Hyderabad,City_Meerut,City_Patna,Dietary Habits_Moderate,Dietary Habits_Others,Dietary Habits_Unhealthy,Depression
count,27898.0,27898.0,27898.0,27898.0,27898.0,27898.0,27898.0,27898.0,27898.0,27898.0,27898.0,27898.0,27898.0
mean,3.141336,0.632877,3.139867,0.034088,0.033479,0.016524,0.047996,0.029572,0.036096,0.355617,0.00043,0.369776,0.585526
std,1.381462,0.482029,1.437347,0.18146,0.179887,0.127483,0.213762,0.169406,0.186532,0.478708,0.020736,0.482752,0.49264
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,4.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
max,5.0,1.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
# Construct the Graph, set up the model
causal_model = gcm.InvertibleStructuralCausalModel(nx.DiGraph([
                                    ('Financial_Stress', 'Depression'),
                                    ('Suicidal_Thoughts', 'Depression'),
                                    ('Academic_Pressure', 'Depression')
                                    ])
                                    )

causal_model.set_causal_mechanism('Financial_Stress', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Suicidal_Thoughts', 
                                  gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Academic_Pressure', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Depression', 
                                  gcm.AdditiveNoiseModel(
                                      gcm.ml.create_linear_regressor()))

# Fix Financial Stress, Suicidal Thoughts
training_data = pd.DataFrame(data=dict(
    **{'Financial_Stress': depression_LASSO_features['Financial Stress']},
    **{'Suicidal_Thoughts': depression_LASSO_features['Have you ever had suicidal thoughts ?']},
    **{'Academic_Pressure': depression_LASSO_features['Academic Pressure']},
    **{'Depression': depression_LASSO_features['Depression']}
    ))

# print(training_data.head())
# print("Columns in training_data:", training_data.columns)

# fit the model to the training data
gcm.fit(causal_model, training_data)

# generate counterfactual samples 
counterfactual_result = gcm.counterfactual_samples(
    causal_model,
    {'Academic_Pressure': lambda x: 4}, # intervene on Academic Pressure, set to 4
    observed_data=pd.DataFrame(data=dict(
        Financial_Stress=[1], # fix Financial Stress
        Suicidal_Thoughts=[1], # fix Suicidal Thoughts
        Academic_Pressure=[5],
        Depression=[1]))) 

counterfactual_result

Fitting causal mechanism of node Academic_Pressure: 100%|██████████| 4/4 [00:00<00:00, 26.04it/s]


Unnamed: 0,Financial_Stress,Suicidal_Thoughts,Academic_Pressure,Depression
0,1,1,4,0.88136


In [35]:
# Construct the Graph, set up the model
causal_model = gcm.InvertibleStructuralCausalModel(nx.DiGraph([
                                    ('Financial_Stress', 'Depression'),
                                    ('Suicidal_Thoughts', 'Depression'),
                                    ('Academic_Pressure', 'Depression')
                                    ])
                                    )

causal_model.set_causal_mechanism('Financial_Stress', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Suicidal_Thoughts', 
                                  gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Academic_Pressure', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Depression', 
                                  gcm.AdditiveNoiseModel(
                                      gcm.ml.create_linear_regressor()))

# Fix Financial Stress, Suicidal Thoughts
training_data = pd.DataFrame(data=dict(
    **{'Financial_Stress': depression_LASSO_features['Financial Stress']},
    **{'Suicidal_Thoughts': depression_LASSO_features['Have you ever had suicidal thoughts ?']},
    **{'Academic_Pressure': depression_LASSO_features['Academic Pressure']},
    **{'Depression': depression_LASSO_features['Depression']}
    ))

# print(training_data.head())
# print("Columns in training_data:", training_data.columns)

# fit the model to the training data
gcm.fit(causal_model, training_data)

# generate counterfactual samples 
counterfactual_result = gcm.counterfactual_samples(
    causal_model,
    {'Academic_Pressure': lambda x: 1}, # intervene on Academic Pressure, set to 1
    observed_data=pd.DataFrame(data=dict(
        Financial_Stress=[1], # fix Financial Stress
        Suicidal_Thoughts=[1], # fix Suicidal Thoughts
        Academic_Pressure=[5],
        Depression=[1]))) 

counterfactual_result

Fitting causal mechanism of node Academic_Pressure: 100%|██████████| 4/4 [00:00<00:00, 41.92it/s]


Unnamed: 0,Financial_Stress,Suicidal_Thoughts,Academic_Pressure,Depression
0,1,1,1,0.525441


- what we observed: Depression was 1.0 when Academic Pressure was 5, Suicidal Thoughts was 1 and Financial Stress was 1.
- When intervening on Academc Pressure while keeping the other values fixed:
    - AP = 4, Depression = 0.88136
    - AP = 1, Depression = 0.525441

**X -> Financial Stress, Z -> Depression**

In [36]:
# Construct the Graph, set up the model
causal_model = gcm.InvertibleStructuralCausalModel(nx.DiGraph([
                                    ('Financial_Stress', 'Depression'),
                                    ('Suicidal_Thoughts', 'Depression'),
                                    ('Academic_Pressure', 'Depression')
                                    ])
                                    )

causal_model.set_causal_mechanism('Financial_Stress', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Suicidal_Thoughts', 
                                  gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Academic_Pressure', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Depression', 
                                  gcm.AdditiveNoiseModel(
                                      gcm.ml.create_linear_regressor()))

# Fix Academic Pressure, Suicidal Thoughts
training_data = pd.DataFrame(data=dict(
    **{'Financial_Stress': depression_LASSO_features['Financial Stress']},
    **{'Suicidal_Thoughts': depression_LASSO_features['Have you ever had suicidal thoughts ?']},
    **{'Academic_Pressure': depression_LASSO_features['Academic Pressure']},
    **{'Depression': depression_LASSO_features['Depression']}
    ))

# print(training_data.head())
# print("Columns in training_data:", training_data.columns)

# fit the model to the training data
gcm.fit(causal_model, training_data)

# generate counterfactual samples 
counterfactual_result = gcm.counterfactual_samples(
    causal_model,
    {'Financial_Stress': lambda x: 4}, # intervene on Financial Stress, set to 4
    observed_data=pd.DataFrame(data=dict(
        Financial_Stress=[1],
        Suicidal_Thoughts=[1], # fix Suicidal Thoughts
        Academic_Pressure=[5], # fix Academic Pressure
        Depression=[1]))) 

counterfactual_result

Fitting causal mechanism of node Academic_Pressure: 100%|██████████| 4/4 [00:00<00:00, 75.34it/s]


Unnamed: 0,Financial_Stress,Suicidal_Thoughts,Academic_Pressure,Depression
0,4,1,5,1.233391


In [38]:
# Construct the Graph, set up the model
causal_model = gcm.InvertibleStructuralCausalModel(nx.DiGraph([
                                    ('Financial_Stress', 'Depression'),
                                    ('Suicidal_Thoughts', 'Depression'),
                                    ('Academic_Pressure', 'Depression')
                                    ])
                                    )

causal_model.set_causal_mechanism('Financial_Stress', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Suicidal_Thoughts', 
                                  gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Academic_Pressure', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Depression', 
                                  gcm.AdditiveNoiseModel(
                                      gcm.ml.create_linear_regressor()))

# Fix Academic Pressure, Suicidal Thoughts
training_data = pd.DataFrame(data=dict(
    **{'Financial_Stress': depression_LASSO_features['Financial Stress']},
    **{'Suicidal_Thoughts': depression_LASSO_features['Have you ever had suicidal thoughts ?']},
    **{'Academic_Pressure': depression_LASSO_features['Academic Pressure']},
    **{'Depression': depression_LASSO_features['Depression']}
    ))

# print(training_data.head())
# print("Columns in training_data:", training_data.columns)

# fit the model to the training data
gcm.fit(causal_model, training_data)

# generate counterfactual samples 
counterfactual_result = gcm.counterfactual_samples(
    causal_model,
    {'Financial_Stress': lambda x: 2}, # intervene on Financial Stress, set to 2
    observed_data=pd.DataFrame(data=dict(
        Financial_Stress=[1],
        Suicidal_Thoughts=[1], # fix Suicidal Thoughts
        Academic_Pressure=[5], # fix Academic Pressure
        Depression=[1]))) 

counterfactual_result

Fitting causal mechanism of node Academic_Pressure: 100%|██████████| 4/4 [00:00<00:00, 53.74it/s]


Unnamed: 0,Financial_Stress,Suicidal_Thoughts,Academic_Pressure,Depression
0,2,1,5,1.077797


In [39]:
# Construct the Graph, set up the model
causal_model = gcm.InvertibleStructuralCausalModel(nx.DiGraph([
                                    ('Financial_Stress', 'Depression'),
                                    ('Suicidal_Thoughts', 'Depression'),
                                    ('Academic_Pressure', 'Depression')
                                    ])
                                    )

causal_model.set_causal_mechanism('Financial_Stress', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Suicidal_Thoughts', 
                                  gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Academic_Pressure', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Depression', 
                                  gcm.AdditiveNoiseModel(
                                      gcm.ml.create_linear_regressor()))

# Fix Academic Pressure, Suicidal Thoughts
training_data = pd.DataFrame(data=dict(
    **{'Financial_Stress': depression_LASSO_features['Financial Stress']},
    **{'Suicidal_Thoughts': depression_LASSO_features['Have you ever had suicidal thoughts ?']},
    **{'Academic_Pressure': depression_LASSO_features['Academic Pressure']},
    **{'Depression': depression_LASSO_features['Depression']}
    ))

# print(training_data.head())
# print("Columns in training_data:", training_data.columns)

# fit the model to the training data
gcm.fit(causal_model, training_data)

# generate counterfactual samples 
counterfactual_result = gcm.counterfactual_samples(
    causal_model,
    {'Financial_Stress': lambda x: 5}, # intervene on Financial Stress, set to 5
    observed_data=pd.DataFrame(data=dict(
        Financial_Stress=[1],
        Suicidal_Thoughts=[1], # fix Suicidal Thoughts
        Academic_Pressure=[5], # fix Academic Pressure
        Depression=[1]))) 

counterfactual_result

Fitting causal mechanism of node Academic_Pressure: 100%|██████████| 4/4 [00:00<00:00, 21.55it/s]


Unnamed: 0,Financial_Stress,Suicidal_Thoughts,Academic_Pressure,Depression
0,5,1,5,1.311188


- fixed Academic Pressure at 5, Suicidal Thoughts at 1.
- When intervening on Financial Stress while keeping the other values fixed:
    - FS = 5, Depression = 1.311188
    - FS = 4, Depression = 1.233391
    - FS = 2, Depression = 1.077797

In [43]:
# Construct the Graph, set up the model
causal_model = gcm.InvertibleStructuralCausalModel(nx.DiGraph([
                                    ('Financial_Stress', 'Depression'),
                                    ('Suicidal_Thoughts', 'Depression'),
                                    ('Academic_Pressure', 'Depression')
                                    ])
                                    )

causal_model.set_causal_mechanism('Financial_Stress', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Suicidal_Thoughts', 
                                  gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Academic_Pressure', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Depression', 
                                  gcm.AdditiveNoiseModel(
                                      gcm.ml.create_linear_regressor()))

# Fix Academic Pressure, Suicidal Thoughts
training_data = pd.DataFrame(data=dict(
    **{'Financial_Stress': depression_LASSO_features['Financial Stress']},
    **{'Suicidal_Thoughts': depression_LASSO_features['Have you ever had suicidal thoughts ?']},
    **{'Academic_Pressure': depression_LASSO_features['Academic Pressure']},
    **{'Depression': depression_LASSO_features['Depression']}
    ))

# print(training_data.head())
# print("Columns in training_data:", training_data.columns)

# fit the model to the training data
gcm.fit(causal_model, training_data)

# generate counterfactual samples 
counterfactual_result = gcm.counterfactual_samples(
    causal_model,
    {'Financial_Stress': lambda x: 3}, # intervene on Financial Stress, set to 5
    observed_data=pd.DataFrame(data=dict(
        Financial_Stress=[1],
        Suicidal_Thoughts=[1], # fix Suicidal Thoughts
        Academic_Pressure=[2], # fix Academic Pressure (changed to 2)
        Depression=[1]))) 

counterfactual_result

Fitting causal mechanism of node Academic_Pressure: 100%|██████████| 4/4 [00:00<00:00, 46.06it/s]


Unnamed: 0,Financial_Stress,Suicidal_Thoughts,Academic_Pressure,Depression
0,3,1,2,1.155594


In [44]:
# Construct the Graph, set up the model
causal_model = gcm.InvertibleStructuralCausalModel(nx.DiGraph([
                                    ('Financial_Stress', 'Depression'),
                                    ('Suicidal_Thoughts', 'Depression'),
                                    ('Academic_Pressure', 'Depression')
                                    ])
                                    )

causal_model.set_causal_mechanism('Financial_Stress', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Suicidal_Thoughts', 
                                  gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Academic_Pressure', gcm.EmpiricalDistribution())
causal_model.set_causal_mechanism('Depression', 
                                  gcm.AdditiveNoiseModel(
                                      gcm.ml.create_linear_regressor()))

# Fix Academic Pressure, Suicidal Thoughts
training_data = pd.DataFrame(data=dict(
    **{'Financial_Stress': depression_LASSO_features['Financial Stress']},
    **{'Suicidal_Thoughts': depression_LASSO_features['Have you ever had suicidal thoughts ?']},
    **{'Academic_Pressure': depression_LASSO_features['Academic Pressure']},
    **{'Depression': depression_LASSO_features['Depression']}
    ))

# print(training_data.head())
# print("Columns in training_data:", training_data.columns)

# fit the model to the training data
gcm.fit(causal_model, training_data)

# generate counterfactual samples 
counterfactual_result = gcm.counterfactual_samples(
    causal_model,
    {'Suicidal_Thoughts': lambda x: 0}, # intervene on Suicidal Thoughts, set to 0
    observed_data=pd.DataFrame(data=dict(
        Financial_Stress=[1], # fix Financial Stress
        Suicidal_Thoughts=[1], 
        Academic_Pressure=[5], # fix Academic Pressure 
        Depression=[1]))) 

counterfactual_result

Fitting causal mechanism of node Academic_Pressure: 100%|██████████| 4/4 [00:00<00:00, 30.45it/s]


Unnamed: 0,Financial_Stress,Suicidal_Thoughts,Academic_Pressure,Depression
0,1,0,5,0.579094


- Suicidal Thoughts intervention -> set to 0, 
- fix rest: academic pressure = 5, financial stress = 1
- result: depression decreased to 0.579094