In [None]:
%matplotlib inline
import pandas as pd
import os
from sklearn import datasets
import matplotlib.pyplot as plt
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

from utils import * 
from causalDataframe import *
from causal_discovery.ncda import * 
from baselines.correlations import * 
from data_generation.randomDataframe import *
from data_generation.relatedDataframe import * 
from data_generation.evaluation.kde import *
from data_generation.evaluation.lof import *
from sdv.tabular import CTGAN
from sdv.tabular import TVAE

In [None]:
# Output directory for every artefact written by this notebook (DAG json, csv files, models).
# Defaults to the current user's Desktop instead of a hard-coded home directory, and can be
# overridden with the GENCDA_OUTPUT_DIR environment variable.
# Joining with '' keeps the trailing separator the original value had.
path = os.path.join(
    os.environ.get('GENCDA_OUTPUT_DIR', os.path.join(os.path.expanduser('~'), 'Desktop')),
    '',
)

# DATASETS

**1# Option**: From sklearn

In [None]:
"""
A possible Ground Truth 

['bmi' --> 'si', 'bmi' --> 'bp', 'si' --> 'bp']

"""

diabetes = datasets.load_diabetes()
dataframe = diabetes['data'][:, [0, 1, 2, 3, 4]]
data = pd.DataFrame(dataframe, columns=['age', 'sex', 'bmi', 'bp', 'si'])

**2# Option**: From project folder

In [None]:
"""
Avaiable datasets 

Abalone            Ground Truth: ['Rings' --> 'Length']
Old Faithful       Ground Truth: ['Time Interval' --> 'Duration']
Climate            Ground Truth: ['Altitude' --> 'Temperature']
Undata             Ground Truth: ['Female Age' <-- 'Latitude']
Synthetic          Ground Truth: ['w' --> 'x', 'w' --> 'y', 'x' --> 'z', 'y' --> z]

"""
directory = os.path.abspath('')
file_path = os.path.join(os.path.dirname(directory), 'GENCDA', 'datasets', 'synthetic.csv')
data = pd.read_csv(file_path)

**3# Option**: Create a new dataframe from a random DAG

In [None]:
# Number of rows to generate
n_samples = 1500

# Create a new random graph. According to the original note, randomDag also produces a json
# file containing all the information about the dag.
# NOTE(review): 4 and 2 are presumably the node and edge counts (the previous comment said
# "5 nodes") — confirm against randomDag.
graph = randomDag(path, 4, 2)
print(f'Nodes: {graph.nodes} \nEdges: {graph.edges}')

# Main class, built from the number of samples and the dag
d = CausalDataFrame(n_samples, graph)

In [None]:
# Generate the dataset from the graph structure
d.generate_data()

# Expose it as a dataframe and persist it under `path`
data = d.dataframe
dataset_csv = os.path.join(path, 'dataset.csv')
data.to_csv(dataset_csv, index=False)

# Draw the generated random dag
d.show_graph()

data.head()


**4# Option**: Create a new dataframe from a known graph

In [None]:
# Known causal structure: directed edges plus nodes that take part in no edge
n_samples = 1500
isolated_nodes = ['3', '5', '6']
edges = [
    ('0', '2'),
    ('1', '2'),
    ('2', '7'),
    ('1', '4'),
    ('4', '8'),
]

# Main class, built from the number of samples, the edge list and the isolated nodes
d = CausalDataFrame(n_samples, edges, isolated_nodes)

# Generate the dataset from the graph structure
d.generate_data()

In [None]:
# Expose the generated data as a dataframe and persist it under `path`
data = d.dataframe
dataset_csv = os.path.join(path, 'dataset.csv')
data.to_csv(dataset_csv, index=False)

# Draw the graph held by `d` (here the known graph defined above, not a random one)
d.show_graph()

data.head()

## DAG DESCRIBING THE CAUSAL STRUCTURE OF THE DATA

**1# Option**: Based on a known ground truth

In [None]:
"""
Example: Synthetic Dataset

"""
graph = nx.DiGraph()
graph.add_nodes_from(['w', 'y', 'x', 'z'])
graph.add_edges_from([('w', 'x'), ('w', 'y'), ('x', 'z'), ('y', 'z')])
nx.draw_networkx(graph, node_size=1500, font_color='w', font_size=16)
plt.show()

**2# Option**: From a random graph

In [None]:
# Build a random DAG with randomDag and draw it. This rebinds `graph`.
# NOTE(review): 6 and 4 are presumably the node and edge counts — confirm against randomDag.
graph = randomDag(path, 6, 4)
nx.draw_networkx(graph, node_size=1500, font_color='w', font_size=16)
plt.show()

# NCDA - Nonlinear Causal Discovery with Apriori

In [None]:
# Instantiate the main class from the dataset (`data`) and the reference DAG (`graph`)
ncda = NCDApriori(data, graph)

In [None]:
# Fit Apriori

"""
Since our method works on continuous datasets, we discretize the dataframe before applying the pattern mining algorithm.

"""

# Settings forwarded unchanged to ncda.fitApriori
apriori_settings = {
    'target': 'm',
    'zmax': 3,
    'nbins': 4,
    'strategy': 'quantile',
    'support': 5,
}
itemsets, performance = ncda.fitApriori(**apriori_settings)

In [None]:
# Show the `itemsets` value returned by ncda.fitApriori
print(f'Maximal itemsets found by Apriori: \n\n{itemsets}')

In [None]:
"""
Evaluation of the number of relations found comparing frequent items found by apriori and the dag. 
In this case, we verify relations on an undirect acyclic graph 
since we want to know if apriori detects any relationship. Then, edges directions are not relevant.

"""

performance

In [None]:
# Fit the causal discovery algorithm implemented by Hoyer et al. on the itemsets found by Apriori

# Settings forwarded unchanged to ncda.fitNCD
ncd_settings = dict(
    alpha=0.001,
    sorting=np.mean,
    train_size=0.7,
    standardization=True,
)
causal_relations = ncda.fitNCD(itemsets, **ncd_settings)
causal_relations

In [None]:
"""
Evaluation of the number of relations found comparing causal relationship found by NCD and the ground truth. 
In this case, we verify relations on an direct acyclic graph.
""" 

# We have to transform dag edges as strings
edges = [(str(source), str(destination)) for source, destination in graph.edges]

precision, recall, accuracy, f1 = evaluate(causal_relations, edges, graph)
print(f'Precision: {precision}\nRecall: {recall}\nAccuracy: {accuracy}\nF1: {f1}')

# CAUSAL DISCOVERY BASELINES: CORRELATIONS 

Possible correlation coefficients to check: Pearson, Spearman, Hoeffding

In [None]:
# Main class of the correlation baseline; the cells below call its pearson, pairwise and evaluate methods
corr = Correlation()

Compute test statistic **between two variables**

In [None]:
# Compare the first two columns of the dataset
first_column = data.iloc[:, 0]
second_column = data.iloc[:, 1]

r, p_value = corr.pearson(first_column, second_column)
(r, p_value)

Compute **pairwise correlation of columns**

In [None]:
# Apply `pearsonr` to the dataset through corr.pairwise, then compare the outcome with `graph`.
# NOTE(review): `pearsonr` is not imported explicitly in this notebook — presumably it comes
# from one of the star imports (e.g. baselines.correlations); confirm.
new_df = corr.pairwise(data, pearsonr)
columns, confusion_matrix = corr.evaluate(new_df, graph)
print(f'List of indices of column pairs that are correlated: {columns}')

In [None]:
"""
Evaluation of the number of relations found comparing relations found by correlation metric and the ground truth. 
In this case, we verify relations on an undirect acyclic graph.

"""
precision, recall, accuracy, f1 = evaluation_measures(confusion_matrix)
print(f'Precision: {precision}\nRecall: {recall}\nAccuracy: {accuracy}\nF1: {f1}')

# SYNTHETIC DATA GENERATOR

A synthetic dataset generator for tabular data that is able to discover the nonlinear causalities 
among the variables and use them at generation time. 

In [None]:
# Build a directed graph from the causal relationships found by NCDA
dag_ncda = nx.to_networkx_graph(causal_relations, create_using=nx.DiGraph)

# Every dataset column becomes a node, so columns with no relation appear as isolated nodes
dag_nodes = [*data.columns]
dag_ncda.add_nodes_from(dag_nodes)

draw_options = {'node_size': 1500, 'font_color': 'w', 'font_size': 16}
nx.draw_networkx(dag_ncda, **draw_options)
plt.show()

In [None]:
# Instantiate the main class from the original data and the DAG built from the NCDA results
generator = RelatedDataframe(data, dag_ncda)

# Generate new data based on `dag_ncda` (the DAG discovered by NCDA, not the ground-truth `graph`)
gencda_data = generator.generate_data()

# Show the first rows of the new dataframe
gencda_data.head()

# SYNTHETIC DATA GENERATION BASELINES

1# Baseline: **RANDOM GENERATION**

In [None]:
# Random-generation baseline built from the original data.
# NOTE(review): how randomDataframe samples the values is not visible here — see data_generation.randomDataframe.
random_data = randomDataframe(data)

2# Baseline: **CTGAN**

In [None]:
"""
From Synthetic Data Vault Library (SDV)
https://sdv.dev/SDV/api_reference/tabular/api/sdv.tabular.ctgan.CTGAN.html

"""
# Instantiate main class
model = CTGAN()

# Fit CTGAN
model.fit(data)

# Save model
model.save(os.path.join(path, 'ctgan_model.pkl'))

# Generate new data
ctgan_data = model.sample(len(data))

# Save new dataframe 
ctgan_data.to_csv(os.path.join(path, 'ctgan_dataset.csv'), index=False)

3# Baseline: **TVAE**

In [None]:
"""
From SDV Library 
https://sdv.dev/SDV/api_reference/tabular/api/sdv.tabular.ctgan.TVAE.html

"""
# To apply tvae, the svd library requires columns names as strings 
if data.columns.dtype != 'str':
    data.columns = data.columns.astype(str)

# Instantiate main class
model = TVAE()

# Fit TVAE
model.fit(data)

# Save model
model.save(os.path.join(path, 'tvae_model.pkl'))

# Generate new data
tvae_data = model.sample(len(data))

# Save new dataframe 
tvae_data.to_csv(os.path.join(path, 'tvae_dataset.csv'), index=False)

# EVALUATION MEASURE

1) Compute SSE, RMSE using KERNEL DENSITY ESTIMATION

In [None]:
def _print_kde_report(label, synthetic):
    """Print `label` followed by get_statistics(data, synthetic)."""
    print(f'{label} {get_statistics(data, synthetic)}')


# One report per generator, separated by blank lines
_print_kde_report('GENDATA', gencda_data)

print('\n')
_print_kde_report('RANDOM', random_data)

print('\n')
_print_kde_report('CTGAN', ctgan_data)

print('\n')
_print_kde_report('TVAE', tvae_data)


2) Local Outlier Factor

In [None]:
def _print_lof_report(label, synthetic):
    """Print `label` followed by lof(data, synthetic, n_neighbors=50)."""
    print(f'{label} {lof(data, synthetic, n_neighbors=50)}')


# One report per generator, separated by blank lines
_print_lof_report('GENDATA', gencda_data)

print('\n')
_print_lof_report('RANDOM', random_data)

print('\n')
_print_lof_report('CTGAN', ctgan_data)

print('\n')
_print_lof_report('TVAE', tvae_data)


# PLOT 

In [None]:
# Set index of column to plot
index = 0

# Named `synthetic_datasets` rather than `datasets`: the latter is the sklearn module imported
# at the top of the notebook, and rebinding it breaks the sklearn loading cell on re-run.
synthetic_datasets = [gencda_data, random_data, ctgan_data, tvae_data]
names = ['GENCDA', 'RANDOM', 'CTGAN', 'TVAE']

# Compare the chosen column of the original data against each synthetic dataset in turn.
for dataset, name in zip(synthetic_datasets, names):
    plotKDE(data.iloc[:, index], dataset.iloc[:, index], label1='Original Data', label2=name)
    # Show each comparison as its own figure, as the LOF plotting cell does.
    plt.show()

In [None]:
# Set indices of the two columns to plot
index = [0, 1]

# Named `synthetic_datasets` rather than `datasets`: the latter is the sklearn module imported
# at the top of the notebook, and rebinding it breaks the sklearn loading cell on re-run.
synthetic_datasets = [gencda_data, random_data, ctgan_data, tvae_data]

# One LOF figure per synthetic dataset, each compared against the original data.
for dataset in synthetic_datasets:
    plotLOF(data.iloc[:, index], dataset.iloc[:, index])
    plt.show()
