In [None]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import pgmpy
import xgboost

In [None]:
from synthsonic.models.kde_utils import kde_smooth_peaks_1dim, kde_smooth_peaks
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf
from sklearn.model_selection import train_test_split

from pgmpy.models import BayesianModel
from pgmpy.estimators import TreeSearch, HillClimbSearch, BicScore, ExhaustiveSearch, BayesianEstimator
from pgmpy.sampling import BayesianModelSampling

In [None]:
# Read the train and test CSVs with 'id' as the index, then stack them into one frame.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ("train.csv", "test.csv")
)
df = pd.concat([train, test])

# Data cleaning and preparation

In [None]:
# Report the shape of each split and the class counts of the training target.
overview = f"""
General info:
shape of data:
training set: {train.shape} (includes target column)
testing set: {test.shape}
total: {df.shape}

Checking imbalance in target:
{train.target.value_counts()}

"""
print(overview)

In [None]:
# Columns whose share of missing values exceeds 15% are collected for removal.
drop_missing = []

for col in df.columns:
    # A value of -1 is what this scan counts as "missing".
    missing = df.loc[df[col] == -1, col].count()
    if missing <= 0:
        continue

    mperc = missing / df.shape[0]
    if mperc > 0.15:
        drop_missing.append(col)

    print(f"variable {col} has {missing} missing values. ({mperc*100}%)")

In [None]:
# Remove the columns collected in drop_missing (more than 15% missing values).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution no longer raises KeyError for columns
# that have already been dropped.
df = df.drop(columns=drop_missing, errors='ignore')
df.shape

In [None]:
# data separation
# Label every remaining column as 'target', 'categorical' or 'numeric' and
# collect the labels, together with the dtype, in a table indexed by column name.

meta = []

for col in df.columns:

    if col == 'target':
        var_type = 'target'

    # is_integer_dtype matches every integer width. Comparing the dtype with the
    # string 'int' only matches the platform's default integer, which is not
    # int64 everywhere, so integer columns could be mislabelled as numeric.
    elif 'cat' in col or 'bin' in col or pd.api.types.is_integer_dtype(df[col]):
        var_type = 'categorical'
    else:
        var_type = 'numeric'

    coldict = {
        'colname': col,
        'var_type': var_type,
        'dtype': df[col].dtype
    }

    meta.append(coldict)

# Build the table in one step; set_index returns a new frame, no inplace needed.
meta = pd.DataFrame(meta).set_index('colname')
meta.groupby('var_type').size()

In [None]:
# Display the per-column metadata table (var_type and dtype, indexed by column name).
meta 

# Split into numerical vs. categorical training and test sets

In [None]:
# numerical
# Column names labelled 'numeric' in meta, then the matching slices of both splits.
numeric_vars = meta[meta['var_type'] == 'numeric'].index
numeric_train, numeric_test = train[numeric_vars], test[numeric_vars]

In [None]:
# Show which columns were labelled as numeric.
print(numeric_vars)

In [None]:
# categorical
# Column names labelled 'categorical' in meta, then the matching slices of both splits.
cat_vars = meta[meta['var_type'] == 'categorical'].index
cat_train, cat_test = train[cat_vars], test[cat_vars]

In [None]:
# Keep the label column of the training split as its own Series.
target = train.loc[:, 'target']

# Numerical features

In [None]:
# sample for quicker testing
# The 10% sample is always drawn from the full training frame, so re-running
# this cell yields the same subset instead of shrinking an already-sampled
# frame by another factor of ten. (The bare `numeric_train` expression that
# used to open the cell was never displayed and has been removed.)
numeric_train = train[numeric_vars].sample(frac=0.1, random_state=1)
numeric_train

In [None]:
# only keep true continuous numerical features
# Selection is positional: the columns at positions 2, 3, 4 and 5 are kept.
numeric_train = numeric_train[numeric_train.columns[2:6]]
numeric_train

In [None]:
# Configure the KDECopulaNNPdf model with use_KDE disabled and a default
# XGBoost classifier as its clf.
kde = KDECopulaNNPdf(
    use_KDE=False,
    clf=xgboost.XGBClassifier(),
)

In [None]:
# Fit the model on the sampled numeric training columns; the result of fit() is kept as `kde`.
kde = kde.fit(numeric_train)

In [None]:
# Apply only the first step of the fitted pipeline to the training data and display the result.
X_g = kde.pipe_[0].transform(numeric_train)
X_g

In [None]:
# Distribution of column 1 after the first pipeline step.
plt.hist(X_g[:, 1], bins=50)

In [None]:
# Run the data through the first two pipeline steps and plot column 0 of the output.
X_p = kde.pipe_[:2].transform(numeric_train)
plt.hist(X_p[:, 0], bins=50)

In [None]:
# Run the data through the complete fitted pipeline and plot column 1 of the output.
X_u = kde.pipe_.transform(numeric_train)
plt.hist(X_u[:, 1], bins=50)

In [None]:
# Wrap the two transformed arrays in DataFrames for the correlation checks.
pca_data, uniform_data = pd.DataFrame(X_p), pd.DataFrame(X_u)

In [None]:
# Display the output of the first two pipeline steps as a DataFrame.
pca_data

In [None]:
# NOTE(review): importing phik presumably is what makes DataFrame.phik_matrix()
# available - confirm; the import would normally sit with the others at the top.
import phik

# Correlation matrix of the columns produced by the first two pipeline steps.
pca_data.phik_matrix()

In [None]:
# Same correlation matrix for the output of the complete pipeline.
uniform_data.phik_matrix()

In [None]:
# sampling test
# Request 50000 samples from the fitted model; sample() returns the generated
# rows together with one weight per row.

X_gen, sample_weight = kde.sample(50000)

In [None]:
# Display the generated samples.
X_gen

In [None]:
# Print the total sample weight, then plot the weight distribution with log-scaled counts.
print(sum(sample_weight))
plt.hist(sample_weight, bins=50, log=True)

In [None]:
# Display the training columns the model was fitted on, for comparison with the samples.
numeric_train

In [None]:
# Overlay the weighted histogram of generated column i with the histogram of
# the same column in the training data, both normalised to densities on [0, 1].
i = 2
plt.figure(figsize=(12, 7))
plt.hist(
    X_gen[:, i],
    bins=100,
    range=(0, 1),
    density=True,
    alpha=0.5,
    weights=sample_weight,
)
plt.hist(
    numeric_train.iloc[:, i],
    bins=50,
    range=(0, 1),
    density=True,
    alpha=0.5,
)

In [None]:
# Scatter generated column i against column j; marker area scales with the sample weight.
i, j = 0, 1
plt.figure(figsize=(12, 7))
plt.scatter(X_gen[:, i], X_gen[:, j], s=0.2 * sample_weight)
plt.title("Generated samples")

In [None]:
# Same pair of columns (i, j) taken from the training data, with a fixed marker size.
plt.figure(figsize=(12, 7))
plt.scatter(
    numeric_train.iloc[:, i],
    numeric_train.iloc[:, j],
    s=0.2,
)
plt.title("original data set values")

In [None]:
# Histogram of the ps_car_12 training column, restricted to [0, 1].
plt.hist(numeric_train['ps_car_12'], bins=100, range=[0, 1])

# Categorical features

In [None]:
# cardinality
# Number of distinct values per categorical column, counted on the full training split.

for col in cat_train.columns:
    unique = len(pd.unique(train[col]))
    print(f"var {col} has {unique} unique values")

In [None]:
# The 10% sample is always drawn from the full training frame, so re-running
# this cell yields the same subset instead of shrinking an already-sampled
# frame by another factor of ten.
cat_train = train[cat_vars].sample(frac=0.1, random_state=1)
cat_train

In [None]:
# Structure search over the categorical sample, rooted at its first column.
est = TreeSearch(
    cat_train,
    root_node=cat_train.columns[0],
)

In [None]:
# Estimate the structure with estimator_type 'tan', using ps_ind_02_cat as the class node.
model = est.estimate(estimator_type = 'tan', class_node= 'ps_ind_02_cat')

In [None]:
# Draw the estimated graph with node labels.
nx.draw(
    model,
    with_labels=True,
    arrowsize=30,
    node_size=800,
    alpha=0.3,
    font_weight='bold',
)
plt.show()

In [None]:
# Build a Bayesian model on the estimated edges and fit its parameters on the
# categorical sample with a Dirichlet prior (pseudo count 0.1).
edges = model.edges()
bayesian_model = BayesianModel(edges)
fit = bayesian_model.fit(
    cat_train,
    estimator=BayesianEstimator,
    prior_type='dirichlet',
    pseudo_counts=0.1,
)

In [None]:
# Print the conditional probability distribution of the node ps_ind_02_cat.
print(bayesian_model.get_cpds('ps_ind_02_cat'))

In [None]:
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.svm import SVC

In [None]:
# Classifier used below to tell sampled rows from real rows.
# Alternative kept for reference:
#   MLPClassifier(random_state=0, max_iter=1000, early_stopping=True)
clf = xgb.XGBClassifier(n_estimators=250, reg_lambda=1, gamma=0, max_depth=9)

In [None]:
# Use as many sampled rows (label 0) as there are real rows (label 1).
n_one = n_zero = len(cat_train)

In [None]:
from pgmpy.sampling import BayesianModelSampling

# sample data from BN
# Draw n_zero rows from the fitted Bayesian model as a DataFrame, with a fixed seed.
inference = BayesianModelSampling(bayesian_model)
df_data = inference.forward_sample(size=n_zero, return_type='dataframe', seed=0)


In [None]:
# Put the sampled columns in exactly the same order as cat_train. The rows of X
# are stacked on top of cat_train to train the classifier, which is then also
# applied to cat_test, so column k must be the same feature in every array.
# Sorting the names alphabetically does not guarantee that, because cat_train
# and cat_test keep the column order of the source data.
X = df_data[list(cat_train.columns)].values

In [None]:
# Shape of the sampled feature matrix.
X.shape

In [None]:
# Labels: 0 for rows sampled from the Bayesian model, 1 for real rows.
zeros, ones = np.zeros(n_zero), np.ones(n_one)

# Stack labels and features in the same order: sampled rows first, real rows second.
yy = np.concatenate((zeros, ones), axis=0)
XX = np.concatenate((X, cat_train), axis=0)

In [None]:
# Train the classifier to separate sampled rows (label 0) from real rows (label 1).
clf = clf.fit(XX,yy)

In [None]:
# Second predict_proba column for the sampled rows (p0) and for the categorical
# test columns (p2) - presumably the probability of label 1, i.e. "real"; confirm
# against the classifier's classes_ ordering.
p0 = clf.predict_proba(X)[:, 1]
p2 = clf.predict_proba(cat_test)[:, 1]

In [None]:
# Overlay the classifier scores of the sampled rows (p0) and of the test rows
# (p2) as densities with log-scaled counts. nbins was previously defined but
# never used; both histograms now take their bin count from it.
nbins = 100
plt.figure(figsize=(12, 7))
plt.hist(p0, bins=nbins, range=(0, 1), alpha=0.5, log=True, density=True)
plt.hist(p2, bins=nbins, range=(0, 1), alpha=0.5, log=True, density=True)