In [None]:
import grapl.algorithms as algs
import grapl.dsl as dsl
import pandas as pd
import numpy as np
from scipy.sparse._data import _data_matrix

causal_graph = '"Alcohol Consumption Causal Graph"; \
                D; F; S; A; P; M; N; G; H; \
                F -> D; \
                S -> D; \
                A -> D; \
                P -> D; \
                M -> D; \
                N -> D; \
                P -> F; \
                M -> F; \
                N -> F; \
                A -> H; \
                H -> D; \
                H -> G; \
                G -> D; '

grapl_obj = dsl.GraplDSL()
G = grapl_obj.readgrapl(causal_graph)
# Dalc(D); Famrel(F); Sex(S); Age(A); Pstatus(P); Medu(M); Fedu(N); Mjob(J); Fjob(K); Goout(G); Health(H)

In [None]:
id_str, id_eqn, isident = algs.idfixing(G, {'A'}, {'D'})

In [4]:
from sklearn.preprocessing import OrdinalEncoder
import os
from sklearn.model_selection import train_test_split

# Load the dataset
total_data = pd.read_csv("./student-encoded.csv")
# total_data.drop(labels = ['goout', 'health'], axis=1)
# Specify the desired features and target column
desired_features = ['sex', 'age', 'Pstatus','Medu','Fedu','famrel' , 'goout', 'health', 'Dalc','Walc']
df = total_data[desired_features].copy()

# df = pd.get_dummies(df, columns=['Mjob', 'Fjob'])

# target_column = df['Walc']
target_column = df['Dalc']
df

Unnamed: 0,sex,age,Pstatus,Medu,Fedu,famrel,goout,health,Dalc,Walc
0,0.0,18,0.0,4,4,4,4,3,1,1
1,0.0,17,1.0,1,1,5,3,3,1,1
2,0.0,15,1.0,1,1,4,2,3,2,3
3,0.0,15,1.0,4,2,3,2,5,1,1
4,0.0,16,1.0,3,3,4,2,5,1,2
...,...,...,...,...,...,...,...,...,...,...
1039,0.0,19,1.0,2,3,5,2,5,1,2
1040,0.0,18,1.0,3,1,4,4,1,1,1
1041,0.0,18,1.0,1,1,1,1,5,1,1
1042,1.0,17,1.0,3,1,2,5,2,3,4


In [5]:
# Create a file to save the split data
output_dir = "./split_data/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Iterate over each feature and split the dataset
for column in df.columns:
    # Get the data for the current feature
    feature_data = df[[column]]
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(feature_data, target_column, test_size=0.2, random_state=42)
    
    # Save the split training and testing sets as CSV files
    train_filename = os.path.join(output_dir, f"{column}_train.csv")
    test_filename = os.path.join(output_dir, f"{column}_test.csv")
    X_train.to_csv(train_filename, index=False)
    X_test.to_csv(test_filename, index=False)
    
    # Print the sizes of training and testing sets for each feature
    print(f"Feature: {column}")
    print(f"Training set size: {len(X_train)}")
    print(f"Testing set size: {len(X_test)}")
    print("-----------")

Feature: sex
Training set size: 835
Testing set size: 209
-----------
Feature: age
Training set size: 835
Testing set size: 209
-----------
Feature: Pstatus
Training set size: 835
Testing set size: 209
-----------
Feature: Medu
Training set size: 835
Testing set size: 209
-----------
Feature: Fedu
Training set size: 835
Testing set size: 209
-----------
Feature: famrel
Training set size: 835
Testing set size: 209
-----------
Feature: goout
Training set size: 835
Testing set size: 209
-----------
Feature: health
Training set size: 835
Testing set size: 209
-----------
Feature: Dalc
Training set size: 835
Testing set size: 209
-----------
Feature: Walc
Training set size: 835
Testing set size: 209
-----------


In [7]:
testdata_dir = "./split_data/"
D_train = pd.read_csv(testdata_dir + "Dalc_train.csv")
# D_train = pd.read_csv(testdata_dir + "Walc_train.csv")
F_train = pd.read_csv(testdata_dir + "famrel_train.csv")
S_train = pd.read_csv(testdata_dir + "sex_train.csv")
A_train = pd.read_csv(testdata_dir + "age_train.csv")
P_train = pd.read_csv(testdata_dir + "Pstatus_train.csv")
M_train = pd.read_csv(testdata_dir + "Medu_train.csv")
N_train = pd.read_csv(testdata_dir + "Fedu_train.csv")
H_train = pd.read_csv(testdata_dir + "health_train.csv")
G_train = pd.read_csv(testdata_dir + "goout_train.csv")

# Jathome_train = pd.read_csv(testdata_dir + "Mjob_at_home_train.csv")
# Jhealth_train = pd.read_csv(testdata_dir + "Mjob_health_train.csv")
# Jother_train = pd.read_csv(testdata_dir + "Mjob_other_train.csv")
# Jservices_train = pd.read_csv(testdata_dir + "Mjob_services_train.csv")
# Jteacher_train = pd.read_csv(testdata_dir + "Mjob_teacher_train.csv")
# Kathome_train = pd.read_csv(testdata_dir + "Fjob_at_home_train.csv")
# Khealth_train = pd.read_csv(testdata_dir + "Fjob_health_train.csv")
# Kother_train = pd.read_csv(testdata_dir + "Fjob_other_train.csv")
# Kservices_train = pd.read_csv(testdata_dir + "Fjob_services_train.csv")
# Kteacher_train = pd.read_csv(testdata_dir + "Fjob_teacher_train.csv")

# J_train = pd.concat([Jathome_train, Jhealth_train, Jother_train, Jservices_train, Jteacher_train], axis=1)
# K_train = pd.concat([Kathome_train, Khealth_train, Kother_train, Kservices_train, Kteacher_train], axis=1)

# Reform the data to the acceptable format for the causalbootstrapping interfaces
D_train = np.array(D_train)
F_train = np.array(F_train)
S_train = np.array(S_train)
A_train = np.array(A_train)
P_train = np.array(P_train)
M_train = np.array(M_train)
N_train = np.array(N_train)
# J_train = np.array(J_train)
# K_train = np.array(K_train)
H_train = np.array(H_train)
G_train = np.array(G_train)

data = {"D'": D_train,
        "F": F_train,
        "S": S_train,
        "A": A_train,
        "P": P_train,
        "M": M_train,
        "N": N_train,
        "H": H_train,
        "G": G_train}

In [8]:
id_str

"p_{A}(D)=\\sum_{N,G,H,F,P',N',P,M,A',S}[p(G,H,A,D,S|N,P,M,F)p(G,A',H,S|P',N,M,F)p(G,S,H,A|F,N',P,M)p(G,H,A|N,P,M,F)p(N,G,P,M,S|H,A)p(N,F,P,A,S|M)p(G,N,P,M|H,A)p(N,P,M|H,A)p(N,M|P)p(P',N|M)p(N,M|P)p(N',M|P)p(H)p(H)p(A')p(H)p(H)p(H)/p(G,H)p(G,H)p(A',H)p(G,H)p(G,H)p(H,A)p(H,A)p(G,H)p(A)p(S)p(N)p(M)p(N)p(P)p(N)p(A)p(M)p(S)p(M)p(N)p(N)p(S)p(P)p(S)p(M)p(N)p(P)p(M)]"

In [9]:
prob = list(set(id_str.split('}')[2][1:-1].split('p')[1:]))
prob

['(G,H,A|N,P,M,F)',
 '(G,N,P,M|H,A)',
 '(H)',
 "(P',N|M)",
 "(G,S,H,A|F,N',P,M)",
 '(N,P,M|H,A)',
 "(A')",
 '(N,M|P)',
 '(N)',
 '(N,F,P,A,S|M)',
 '(N,G,P,M,S|H,A)',
 "(N',M|P)",
 '(H)/',
 '(A)',
 '(G,H)',
 '(P)',
 '(S)',
 "(G,A',H,S|P',N,M,F)",
 '(G,H,A,D,S|N,P,M,F)',
 "(A',H)",
 '(M)',
 '(H,A)']

In [10]:
prob_mod = prob.copy()
print(len(prob_mod), len(prob))
for i in range(len(prob)):
    if "'" in prob_mod[i]:
        prob_mod[i] = prob_mod[i].replace("'", '')
print(len(prob_mod), len(prob))
print(prob_mod)
print(prob)

22 22
22 22
['(G,H,A|N,P,M,F)', '(G,N,P,M|H,A)', '(H)', '(P,N|M)', '(G,S,H,A|F,N,P,M)', '(N,P,M|H,A)', '(A)', '(N,M|P)', '(N)', '(N,F,P,A,S|M)', '(N,G,P,M,S|H,A)', '(N,M|P)', '(H)/', '(A)', '(G,H)', '(P)', '(S)', '(G,A,H,S|P,N,M,F)', '(G,H,A,D,S|N,P,M,F)', '(A,H)', '(M)', '(H,A)']
['(G,H,A|N,P,M,F)', '(G,N,P,M|H,A)', '(H)', "(P',N|M)", "(G,S,H,A|F,N',P,M)", '(N,P,M|H,A)', "(A')", '(N,M|P)', '(N)', '(N,F,P,A,S|M)', '(N,G,P,M,S|H,A)', "(N',M|P)", '(H)/', '(A)', '(G,H)', '(P)', '(S)', "(G,A',H,S|P',N,M,F)", '(G,H,A,D,S|N,P,M,F)', "(A',H)", '(M)', '(H,A)']


In [11]:
print(len(prob_mod), prob_mod)
for i in range(len(prob_mod)):
    if '|' in prob_mod[i]:
        aux = prob_mod[i]
        prob_mod[i] = aux.split('|')[0] + ',' + aux.split('|')[1]
        denom = '(' + aux.split('|')[1]
        # elem_ind = prob_mod.index(prob_mod[i])
        # prob_mod.append(num)
        prob_mod.append(denom)
        # prob_mod.pop(elem_ind)
print(len(prob_mod), prob_mod)

22 ['(G,H,A|N,P,M,F)', '(G,N,P,M|H,A)', '(H)', '(P,N|M)', '(G,S,H,A|F,N,P,M)', '(N,P,M|H,A)', '(A)', '(N,M|P)', '(N)', '(N,F,P,A,S|M)', '(N,G,P,M,S|H,A)', '(N,M|P)', '(H)/', '(A)', '(G,H)', '(P)', '(S)', '(G,A,H,S|P,N,M,F)', '(G,H,A,D,S|N,P,M,F)', '(A,H)', '(M)', '(H,A)']
33 ['(G,H,A,N,P,M,F)', '(G,N,P,M,H,A)', '(H)', '(P,N,M)', '(G,S,H,A,F,N,P,M)', '(N,P,M,H,A)', '(A)', '(N,M,P)', '(N)', '(N,F,P,A,S,M)', '(N,G,P,M,S,H,A)', '(N,M,P)', '(H)/', '(A)', '(G,H)', '(P)', '(S)', '(G,A,H,S,P,N,M,F)', '(G,H,A,D,S,N,P,M,F)', '(A,H)', '(M)', '(H,A)', '(N,P,M,F)', '(H,A)', '(M)', '(F,N,P,M)', '(H,A)', '(P)', '(M)', '(H,A)', '(P)', '(P,N,M,F)', '(N,P,M,F)']


In [12]:
prob_mod = list(set(prob_mod))
prob_mod

['(N,F,P,A,S,M)',
 '(G,H,A,N,P,M,F)',
 '(H)',
 '(N,M,P)',
 '(A,H)',
 '(G,N,P,M,H,A)',
 '(N)',
 '(G,A,H,S,P,N,M,F)',
 '(P,N,M,F)',
 '(F,N,P,M)',
 '(N,P,M,H,A)',
 '(G,S,H,A,F,N,P,M)',
 '(H)/',
 '(A)',
 '(G,H)',
 '(P)',
 '(S)',
 '(P,N,M)',
 '(M)',
 '(N,G,P,M,S,H,A)',
 '(H,A)',
 '(G,H,A,D,S,N,P,M,F)',
 '(N,P,M,F)']

In [20]:
import causalBootstrapping as cb
from distfit import distfit
from distEst_lib import MultivarContiDistributionEstimator
dict_prob = {}
for i in range(len(prob_mod)):
    print(prob_mod[i])
    n_bins = []
    vars = ''
    trains = ''
    variables = prob_mod[i].split('(')[1].split(')')[0].split(',')
    # print(len(variables))
    for i in range(len(variables)):
        if variables[i] in ['J', 'K']:
            n_bins.extend([0,0,0,0,0])
        elif variables[i] in ['A']:
            n_bins.append(3)
        else:
            n_bins.append(0)
            
        if i+1 == len(variables):
            trains += variables[i] + '_train' 
        else:
            trains += variables[i] + '_train,' 
        vars += variables[i]
    
    if len(variables) > 1:
        data = np.concatenate(((eval(trains))), axis=1)
    else:
        data = eval(trains)
    # print(data)
    # print(n_bins)
    # dist = distfit()
    # dist.fit_transform(data)
    # print(dist.summary)
    dist = MultivarContiDistributionEstimator(data_fit=data, n_bins = n_bins)
    exec("pdf_" + vars + ", p" + vars + "= dist.fit_histogram()")
    dict_prob[vars] = dist.fit_histogram()
    # break

(N,F,P,A,S,M)
(G,H,A,N,P,M,F)
(H)
(N,M,P)
(A,H)
(G,N,P,M,H,A)
(N)
(G,A,H,S,P,N,M,F)
(P,N,M,F)
(F,N,P,M)
(N,P,M,H,A)
(G,S,H,A,F,N,P,M)
(H)/
(A)
(G,H)
(P)
(S)
(P,N,M)
(M)
(N,G,P,M,S,H,A)
(H,A)
(G,H,A,D,S,N,P,M,F)
(N,P,M,F)


In [14]:
id_str

"p_{A}(D)=\\sum_{N,G,H,F,P',N',P,M,A',S}[p(G,H,A,D,S|N,P,M,F)p(G,A',H,S|P',N,M,F)p(G,S,H,A|F,N',P,M)p(G,H,A|N,P,M,F)p(N,G,P,M,S|H,A)p(N,F,P,A,S|M)p(G,N,P,M|H,A)p(N,P,M|H,A)p(N,M|P)p(P',N|M)p(N,M|P)p(N',M|P)p(H)p(H)p(A')p(H)p(H)p(H)/p(G,H)p(G,H)p(A',H)p(G,H)p(G,H)p(H,A)p(H,A)p(G,H)p(A)p(S)p(N)p(M)p(N)p(P)p(N)p(A)p(M)p(S)p(M)p(N)p(N)p(S)p(P)p(S)p(M)p(N)p(P)p(M)]"

In [None]:
np.dot(pGHADSNPMF,pGAHSPNMF)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1131dc750>>
Traceback (most recent call last):
  File "/Users/monica/Desktop/DSGP8/venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1131dc750>>
Traceback (most recent call last):
  File "/Users/monica/Desktop/DSGP8/venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [16]:
dict_prob.keys()

dict_keys(['NFPASM', 'GHANPMF', 'H', 'NMP', 'AH', 'GNPMHA', 'N', 'GAHSPNMF', 'PNMF', 'FNPM', 'NPMHA', 'GSHAFNPM', 'A', 'GH', 'P', 'S', 'PNM', 'M', 'NGPMSHA', 'HA', 'GHADSNPMF', 'NPMF'])

In [19]:
prob_mod

['(N,F,P,A,S,M)',
 '(G,H,A,N,P,M,F)',
 '(H)',
 '(N,M,P)',
 '(A,H)',
 '(G,N,P,M,H,A)',
 '(N)',
 '(G,A,H,S,P,N,M,F)',
 '(P,N,M,F)',
 '(F,N,P,M)',
 '(N,P,M,H,A)',
 '(G,S,H,A,F,N,P,M)',
 '(H)/',
 '(A)',
 '(G,H)',
 '(P)',
 '(S)',
 '(P,N,M)',
 '(M)',
 '(N,G,P,M,S,H,A)',
 '(H,A)',
 '(G,H,A,D,S,N,P,M,F)',
 '(N,P,M,F)']

In [ ]:
# dist = MultivarContiDistributionEstimator(data_fit=data, n_bins = n_bins)
    # var_name = "data_" + vars
    # dist_name = "dist_estimator_" + vars
    # print(trains)
    # if len(variables) == 1:
    #     print(variables)
    #     code = "data_" + vars + " = " + trains + '\n'
    # else:
    #     code = "data_" + vars + " = np.concatenate((" + trains + '), axis = 1) \n'
    # 
    # code += "dist_estimator_" + vars + " = MultivarContiDistributionEstimator(data_fit=" + var_name + ",n_bins = " + str(n_bins) + ') \n'
    # 
    # code += "pdf_" + vars + ", p" + vars + "= " + dist_name + ".fit_histogram()"
    # exec(code)
    # print(code)