In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists.
file_path = r"C:\Users\manoj\OneDrive\Documents\Ind Stdy II\MIMIC\MIMIC SOLNS _stage1\newMimic\GitMimic1.csv"
data = pd.read_csv(file_path)

# Selecting features and target variable
features = ['AGE', 'ETHNICITY', 'INSURANCE', 'GENDER', 'FIRST_ADMIT_DAYS']
target = 'TOTAL_ADMITS'

# Splitting data into features and target variable
X = data[features]
y = data[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: one-hot encode the categorical columns. Every other column in
# `features` is passed through unchanged and placed after the encoded columns.
categorical_features = ['ETHNICITY', 'INSURANCE', 'GENDER']
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that occurs only in the test split is
        # encoded as all zeros instead of raising ValueError at predict time
        # (the encoder's default is handle_unknown='error').
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough')

# Pipeline for preprocessing and logistic regression.
# max_iter is raised from the default of 100 because the passthrough columns are
# not scaled, which slows the solver down; a fit that already converges in fewer
# than 100 iterations produces the same result.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(max_iter=1000))])

# Fitting the model
pipeline.fit(X_train, y_train)

# Predictions
# NOTE(review): LogisticRegression is a classifier, so y_pred holds class labels
# drawn from the TOTAL_ADMITS values seen in training. The two metrics below treat
# those labels as plain numbers -- confirm that this is the intended evaluation.
y_pred = pipeline.predict(X_test)

# Calculating mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calculating R squared score
r_squared = r2_score(y_test, y_pred)
print("R Squared Score:", r_squared)



import numpy as np
# Share of test rows whose prediction lies within 10% of the true target value
tolerance = 0.1
num_samples = len(y_test)
abs_error = np.abs(y_test - y_pred)      # per-row absolute prediction error
allowed_error = tolerance * y_test       # per-row error budget, relative to the true value
within_tolerance = abs_error <= allowed_error
num_correct = np.sum(within_tolerance)
accuracy_tol = num_correct / num_samples
print("Accuracy with 10% tolerance:", accuracy_tol)


In [None]:
# Getting feature names after one-hot encoding
encoded_feature_names = pipeline.named_steps['preprocessor'] \
                                  .named_transformers_['cat'] \
                                  .get_feature_names_out(input_features=categorical_features)
# remainder='passthrough' places the untouched columns after the encoded ones, in
# their input order, so derive them from `features` instead of hard-coding the names.
passthrough_features = [col for col in features if col not in categorical_features]
feature_names = list(encoded_feature_names) + passthrough_features

# Getting logistic regression coefficients.
# coef_ is 2-D: a single row for a binary target, one row per class otherwise.
coef_matrix = pipeline.named_steps['classifier'].coef_
# First row only, kept so the 'Coefficient' column is unchanged from before.
log_reg_coef = coef_matrix[0]
# Rank on the mean |coefficient| over ALL rows so that a multiclass model is not
# judged by its first class alone; with a single row this equals |log_reg_coef|.
mean_abs_coef = np.abs(coef_matrix).mean(axis=0)

# Creating a DataFrame to display feature names and their coefficients
feature_importance_df = pd.DataFrame({'Feature': feature_names,
                                      'Coefficient': log_reg_coef,
                                      'Absolute Coefficient': mean_abs_coef})
feature_importance_df = feature_importance_df.sort_values(by='Absolute Coefficient', ascending=False)

print("Feature Importance:")
print(feature_importance_df)




In [None]:

import pandas as pd
import warnings
from dowhy import CausalModel
import matplotlib.pyplot as plt

# Suppress all warnings
# NOTE(review): this silences every warning for the rest of the session, including
# any diagnostics raised during identification and estimation below.
warnings.filterwarnings("ignore")

# Load the merged data from the CSV file
data = pd.read_csv("NewDataset02.csv")

# Define the causal model from an explicit graph: AGE, INSURANCE, GENDER and
# ETHNICITY each point at both the treatment and the outcome (confounders).
# The graph contains no instrumental variable and no mediator.
#
# The former `common_causes=[...]` and `instruments=['IV_VAR']` arguments were
# removed: DoWhy only uses them to build a graph when `graph` is not supplied, so
# next to an explicit graph they had no effect and misdescribed the model.
# NOTE(review): TOTAL_ADMIT_DAYS was listed in the removed `common_causes` but is not
# a node of the graph, so it has never been adjusted for. If it should be, add
# its edges to the graph below.
model = CausalModel(
    data=data,
    treatment='FIRST_ADMIT_DAYS',  # Treatment variable
    outcome='TOTAL_ADMITS',  # Outcome variable
    graph="digraph { \
            FIRST_ADMIT_DAYS -> TOTAL_ADMITS; \
            AGE -> TOTAL_ADMITS; AGE -> FIRST_ADMIT_DAYS; \
            INSURANCE -> TOTAL_ADMITS; INSURANCE -> FIRST_ADMIT_DAYS; \
            GENDER -> TOTAL_ADMITS; GENDER -> FIRST_ADMIT_DAYS; \
            ETHNICITY -> TOTAL_ADMITS; ETHNICITY -> FIRST_ADMIT_DAYS;}"
)

# Visualize the model (Graphviz must be installed)
model.view_model()
plt.show()

# Identify causal effect using the model
identified_estimand = model.identify_effect()

# Print the identified estimand equations
print("Identified Estimand:", identified_estimand)

# Estimate the causal effect with linear regression on the backdoor adjustment set
causal_estimate = model.estimate_effect(identified_estimand,
                                        method_name="backdoor.linear_regression")
print("Causal Estimate is:", causal_estimate.value)

# Refute the obtained estimate using the "random_common_cause" refuter
refutation = model.refute_estimate(
    identified_estimand,
    causal_estimate,
    method_name="random_common_cause"
)
print(refutation)