In [1]:
import joblib
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

from IPython.display import display

from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV

import seaborn as sns



import sympy as sym
from sympy import Symbol, sympify, lambdify, abc, SympifyError

from gplearn.genetic import SymbolicClassifier
from gplearn.genetic import SymbolicRegressor
from sympy import *

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
import types
import random

import graphviz

In [2]:
# Show the installed joblib version (joblib is what loads the pickled model below).
joblib.__version__

'1.0.1'

In [3]:
# Seed both global random number generators (NumPy's and the stdlib's) with
# the same value so the stochastic steps of the notebook are repeatable.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

## Load Data

In [4]:
# Read the replica pump dataset from disk and report its (rows, columns) shape.
path = "./data/replica_pump_data.csv"
pump_data_replica = pd.read_csv(path)
n_rows_and_columns = pump_data_replica.shape
print(n_rows_and_columns)

(26381042, 5)


In [5]:
# Preview the first rows of the loaded frame (feature columns plus the `state` label).
pump_data_replica.head()

Unnamed: 0,energy_norm_log,temperature_diff,rms_norm_log,details_ratedhead,state
0,-4.642337,-0.585072,-2.831278,47.369469,1
1,-4.38441,-2.051363,-2.900545,120.240341,1
2,-5.047895,2.10473,-2.74272,92.577971,1
3,-4.962318,0.375291,-2.975236,75.714544,1
4,-5.08234,-1.878716,-2.900094,19.732252,1


## Load Random Forest Model

In [6]:
# Load the previously trained random forest from its pickle file.
# NOTE: joblib.load unpickles arbitrary Python objects -- only load files
# that come from a trusted source.
MODEL_PATH = "./data/randForestBest_20201002.pkl"
with open(MODEL_PATH, 'rb') as model_file:
    random_forest_model = joblib.load(model_file)



## Logistic Regression

### Sampled Data

#### basic

In [7]:

# Work on a fixed-size random subsample of the full dataset.
# An explicit random_state makes this cell idempotent: re-running it on its
# own draws the same rows instead of advancing the global NumPy RNG.
SAMPLE_SIZE = 100_000
data = pump_data_replica.sample(n=SAMPLE_SIZE, random_state=42)

# Features: every column except the label; labels: the `state` column as a
# 1-D vector (the shape scikit-learn estimators and metrics expect).
X_data = data.drop(['state'], axis=1).values
y_data = data['state'].values

# Hold out 25% of the sampled rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=42)

In [8]:
# Inspect the training labels produced by the split above.
y_train

array([0, 0, 0, ..., 0, 1, 1])

In [9]:
# Random-forest predictions for both splits (train first, then test). These
# are the targets / references used when measuring surrogate fidelity.
y_train_rf_pred, y_test_rf_pred = (
    random_forest_model.predict(split) for split in (X_train, X_test)
)

In [10]:
# Surrogate model: logistic regression on the raw features, trained on the
# random forest's predictions rather than on the ground-truth labels.
surrogate = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    tol=0.0001,
    C=1.0,
    random_state=0,
    n_jobs=-1,
)
clf = surrogate.fit(X_train, y_train_rf_pred)
preds = clf.predict(X_test)

# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
acc_performance = accuracy_score(y_test, preds)

print('Fidelity (F1 Score) Logistic Regression - Random Forest Model:', f1_fidelity)
print('Fidelity (Accuracy) Logistic Regression - Random Forest Model:', acc_fidelity)
print('Performance (F1 Score) Logistic Regression:', f1_performance)
print('Performance (Accuracy) Logistic Regression:', acc_performance)

# Spell the fitted model out as a sigmoid over a weighted feature sum, with
# weights and intercept rounded to three decimals for readability.
weighted_terms = [
    f'{np.round(weight, 3)!s}*x_{position}'
    for position, weight in enumerate(clf.coef_[0])
]
coef = ' + '.join(weighted_terms)
intercept = str(np.round(clf.intercept_[0], 3))
coef_intercept = f'{coef} + {intercept}'

logistic_regression_function = f'1/(1+exp(-({coef_intercept})))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regression - Random Forest Model: 0.9265588386580558
Fidelity (Accuracy) Logistic Regression - Random Forest Model: 0.9486
Performance (F1 Score) Logistic Regression: 0.8877902455282746
Performance (Accuracy) Logistic Regression: 0.9192


1/(1.4549914146182*exp(-1.657*x_0 - 0.096*x_1 - 0.319*x_2 + 0.003*x_3) + 1)

In [11]:
# Reference model: the same logistic regression, but trained directly on the
# ground-truth labels instead of the random forest's predictions.
reference_model = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    tol=0.0001,
    C=1.0,
    random_state=0,
    n_jobs=-1,
)
clf = reference_model.fit(X_train, y_train)
preds = clf.predict(X_test)

# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
acc_performance = accuracy_score(y_test, preds)

print('Fidelity (F1 Score) Logistic Regression - Random Forest Model:', f1_fidelity)
print('Fidelity (Accuracy) Logistic Regression - Random Forest Model:', acc_fidelity)
print('Performance (F1 Score) Logistic Regression:', f1_performance)
print('Performance (Accuracy) Logistic Regression:', acc_performance)

# Spell the fitted model out as a sigmoid over a weighted feature sum, with
# weights and intercept rounded to three decimals for readability.
weighted_terms = [
    f'{np.round(weight, 3)!s}*x_{position}'
    for position, weight in enumerate(clf.coef_[0])
]
coef = ' + '.join(weighted_terms)
intercept = str(np.round(clf.intercept_[0], 3))
coef_intercept = f'{coef} + {intercept}'

logistic_regression_function = f'1/(1+exp(-({coef_intercept})))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regression - Random Forest Model: 0.9267666908360755
Fidelity (Accuracy) Logistic Regression - Random Forest Model: 0.94748
Performance (F1 Score) Logistic Regression: 0.8922642942389063
Performance (Accuracy) Logistic Regression: 0.92056


1/(0.94082323977601*exp(-0.943*x_0 - 0.06*x_1 - 0.785*x_2 + 0.002*x_3) + 1)

#### polynomial features

In [12]:
# Degree-2 polynomial expansion of the raw features (squares and pairwise
# products, no constant column), fitted on the training split only.
poly = PolynomialFeatures(degree = 2, interaction_only=False, include_bias=False)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

# Surrogate: logistic regression on the expanded features, trained on the
# random forest's predictions.
# NOTE(review): the recorded run of this cell hit lbfgs's iteration limit
# (ConvergenceWarning), so the coefficients may not be fully converged;
# scaling the features or raising max_iter are the documented remedies.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2', 
                         tol=0.0001, 
                         C=1.0, 
                         random_state=0,
                         n_jobs=-1).fit(X_train_poly, y_train_rf_pred)

preds = clf.predict(X_test_poly)
# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out` when available and fall back on older versions.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Monomial names use a space for products ('x0 x1'); turn it into '*'.
variable_identifier_list = [str(feature_name).replace(' ', '*') for feature_name in feature_name_getter()]

# Write coefficients with their exact float repr. The previous
# np.round(value, 300) was meant as "no rounding" but scales by 1e300
# internally and overflows to inf for large magnitudes.
coef = ' + '.join([repr(float(coefficient)) + '*' + variable_identifier for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0])])
intercept = repr(float(clf.intercept_[0]))
coef_intercept = coef + ' + ' + intercept

# Sigmoid over the weighted polynomial sum, parsed into a sympy expression.
logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

logistic_regression_function_sympy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model: 0.943707839032333
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.96128
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.87362295915485
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.91052


1/(0.9152464017738899*exp(0.026922765081768933*x0**2 - 0.2130094423483082*x0*x1 + 0.024876353235428837*x0*x2 - 0.0036654543302487596*x0*x3 - 0.7396430183121271*x0 + 0.0031687769147276785*x1**2 + 0.193984304485567*x1*x2 + 2.3089260003405987e-5*x1*x3 - 0.29329118059497566*x1 + 0.017025754142234215*x2**2 - 0.011020832020520174*x2*x3 - 0.29940115591448*x2 - 8.079021373281422e-6*x3**2 + 0.020310332824683364*x3) + 1)

In [13]:
# Sanity check: evaluate the symbolic formula on the first test rows and
# compare the values with the fitted model's own class-1 probabilities.
N_CHECK_ROWS = 5
X = X_test_poly[:N_CHECK_ROWS]
raw_feature_rows = X_test[:N_CHECK_ROWS]


def _feature_index(symbol):
    """Return the column index encoded in a feature symbol name ('x3' or 'x_3')."""
    return int(symbol.name.lstrip('x_'))


# Only the raw features occur as symbols in the expression; monomials such as
# 'x0*x1' are products of those symbols, so they need no substitution of
# their own. Take the symbols straight from the expression and read each
# value from the matching raw-feature column.
function_vars = list(logistic_regression_function_sympy.free_symbols)
function_values = []
for data_point in raw_feature_rows:
    substitutions = {var: data_point[_feature_index(var)] for var in function_vars}
    function_value = logistic_regression_function_sympy.evalf(subs=substitutions)
    try:
        function_value = float(function_value)
    except TypeError:
        # The result could not be converted to a real float (e.g. it is not
        # a plain real number); keep the original sentinel value.
        function_value = np.inf
    function_values.append(function_value)
Y_est = function_values
print(Y_est)
print(clf.predict_proba(X)[:,1:])

[0.0002932160455848126, 0.9995043799254669, 0.9996959248743833, 0.963067511052141, 0.01301469864938835]
[[2.93216046e-04]
 [9.99504380e-01]
 [9.99695925e-01]
 [9.63067511e-01]
 [1.30146986e-02]]


In [14]:
# Degree-2 polynomial expansion of the raw features (squares and pairwise
# products, no constant column), fitted on the training split only.
poly = PolynomialFeatures(degree = 2, interaction_only=False, include_bias=False)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

# Reference: logistic regression on the expanded features, trained on the
# ground-truth labels.
# NOTE(review): the recorded run of this cell hit lbfgs's iteration limit
# (ConvergenceWarning), so the coefficients may not be fully converged;
# scaling the features or raising max_iter are the documented remedies.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2', 
                         tol=0.0001, 
                         C=1.0, 
                         random_state=0,
                         n_jobs=-1).fit(X_train_poly, y_train)


preds = clf.predict(X_test_poly)
# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out` when available and fall back on older versions.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Monomial names use a space for products ('x0 x1'); turn it into '*'.
variable_identifier_list = [str(feature_name).replace(' ', '*') for feature_name in feature_name_getter()]

# Write coefficients with their exact float repr. The previous
# np.round(value, 300) was meant as "no rounding" but scales by 1e300
# internally and overflows to inf for large magnitudes.
coef = ' + '.join([repr(float(coefficient)) + '*' + variable_identifier for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0])])
intercept = repr(float(clf.intercept_[0]))
coef_intercept = coef + ' + ' + intercept

# Sigmoid over the weighted polynomial sum, parsed into a sympy expression.
logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

logistic_regression_function_sympy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9279344153326317
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.94796
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.8892121995904732
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.91776


1/(1.000068759320953*exp(0.0419301935483896*x0**2 - 0.02684067101921403*x0*x1 + 0.06604972870756479*x0*x2 - 0.0032924400027167064*x0*x3 - 0.3094754918818906*x0 - 0.0006565481410156127*x1**2 - 0.0018161321770334706*x1*x2 + 2.8469081546370982e-5*x1*x3 - 0.07560405537628562*x1 + 0.037784249444640414*x2**2 - 0.005633470955109121*x2*x3 - 0.1462251071588629*x2 + 6.4768525473217545e-6*x3**2 + 0.0002009339712090635*x3) + 1)

#### function

In [15]:
# Hand-written single-feature baseline: scale the first feature, round it and
# clip the result to [0, 1] to obtain a class prediction.
# The printed labels previously said '2*X0' although the rule evaluated here
# multiplies by 10; the label is now derived from the factor actually used.
X0_SCALE = 10
rule_label = str(X0_SCALE) + '*X0'
preds = np.clip(np.round(X0_SCALE*X_test[:,0]), 0, 1)

# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) ' + rule_label + ' - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) ' + rule_label + ' - Random Forest Model:', acc_fidelity)

# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) ' + rule_label + ':', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) ' + rule_label + ':', acc_performance)

Fidelity (F1 Score) 2*X0 - Random Forest Model: 0.9130008648025367
Fidelity (Accuracy) 2*X0 - Random Forest Model: 0.93964
Performance (F1 Score) 2*X0: 0.8815686274509804
Performance (Accuracy) 2*X0: 0.91544


#### random forest

In [16]:
# Accuracy of the random forest itself against the true test labels, as the
# reference point for the surrogate models above.
y_test_random_forest = random_forest_model.predict(X_test)
accuracy_random_forest = accuracy_score(y_test, y_test_random_forest)
print(f'Accuracy Random Forest: {accuracy_random_forest!s}')

Accuracy Random Forest: 0.90676


### Complete Data

In [17]:
# Repeat the experiment on the complete dataset instead of a subsample.
data = pump_data_replica

# Features: every column except the label.
X_data = data.drop(['state'], axis=1).values
# Labels as a 1-D vector. Without .ravel() this is an (n, 1) column vector,
# which scikit-learn estimators warn about (DataConversionWarning) and which
# is inconsistent with how the sampled-data split builds its labels.
y_data = data[['state']].values.ravel()

# Hold out 25% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=42)

In [18]:
# Random-forest predictions for both splits of the complete data (train
# first, then test); used as surrogate targets and fidelity references.
y_train_rf_pred, y_test_rf_pred = (
    random_forest_model.predict(split) for split in (X_train, X_test)
)

#### basic

In [19]:
# Surrogate model: logistic regression on the raw features, trained on the
# random forest's predictions rather than on the ground-truth labels.
surrogate = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    tol=0.0001,
    C=1.0,
    random_state=0,
    n_jobs=-1,
)
clf = surrogate.fit(X_train, y_train_rf_pred)
preds = clf.predict(X_test)

# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
acc_performance = accuracy_score(y_test, preds)

print('Fidelity (F1 Score) Logistic Regression - Random Forest Model:', f1_fidelity)
print('Fidelity (Accuracy) Logistic Regression - Random Forest Model:', acc_fidelity)
print('Performance (F1 Score) Logistic Regression:', f1_performance)
print('Performance (Accuracy) Logistic Regression:', acc_performance)

# Spell the fitted model out as a sigmoid over a weighted feature sum, with
# weights and intercept rounded to three decimals for readability.
weighted_terms = [
    f'{np.round(weight, 3)!s}*x_{position}'
    for position, weight in enumerate(clf.coef_[0])
]
coef = ' + '.join(weighted_terms)
intercept = str(np.round(clf.intercept_[0], 3))
coef_intercept = f'{coef} + {intercept}'

logistic_regression_function = f'1/(1+exp(-({coef_intercept})))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regression - Random Forest Model: 0.9257077296584201
Fidelity (Accuracy) Logistic Regression - Random Forest Model: 0.9488276506418776
Performance (F1 Score) Logistic Regression: 0.887081469837777
Performance (Accuracy) Logistic Regression: 0.9199218954337061


1/(1.43619894196035*exp(-1.684*x_0 - 0.093*x_1 - 0.31*x_2 + 0.003*x_3) + 1)

In [20]:
# Reference model: the same logistic regression, but trained directly on the
# ground-truth labels instead of the random forest's predictions.
reference_model = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    tol=0.0001,
    C=1.0,
    random_state=0,
    n_jobs=-1,
)
clf = reference_model.fit(X_train, y_train)
preds = clf.predict(X_test)

# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
acc_performance = accuracy_score(y_test, preds)

print('Fidelity (F1 Score) Logistic Regression - Random Forest Model:', f1_fidelity)
print('Fidelity (Accuracy) Logistic Regression - Random Forest Model:', acc_fidelity)
print('Performance (F1 Score) Logistic Regression:', f1_performance)
print('Performance (Accuracy) Logistic Regression:', acc_performance)

# Spell the fitted model out as a sigmoid over a weighted feature sum, with
# weights and intercept rounded to three decimals for readability.
weighted_terms = [
    f'{np.round(weight, 3)!s}*x_{position}'
    for position, weight in enumerate(clf.coef_[0])
]
coef = ' + '.join(weighted_terms)
intercept = str(np.round(clf.intercept_[0], 3))
coef_intercept = f'{coef} + {intercept}'

logistic_regression_function = f'1/(1+exp(-({coef_intercept})))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

logistic_regression_function_sympy

  return f(*args, **kwargs)


Fidelity (F1 Score) Logistic Regression - Random Forest Model: 0.9271903410829246
Fidelity (Accuracy) Logistic Regression - Random Forest Model: 0.9485808067338047
Performance (F1 Score) Logistic Regression: 0.8923695003720781
Performance (Accuracy) Logistic Regression: 0.9217974845877972


1/(0.946485147953484*exp(-0.961*x_0 - 0.059*x_1 - 0.744*x_2 + 0.002*x_3) + 1)

#### polynomial features

In [21]:
# Degree-2 polynomial expansion of the raw features (squares and pairwise
# products, no constant column), fitted on the training split only.
poly = PolynomialFeatures(degree = 2, interaction_only=False, include_bias=False)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

# Surrogate: logistic regression on the expanded features, trained on the
# random forest's predictions.
# NOTE(review): the recorded run of this cell hit lbfgs's iteration limit
# (ConvergenceWarning), so the coefficients may not be fully converged;
# scaling the features or raising max_iter are the documented remedies.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2', 
                         tol=0.0001, 
                         C=1.0, 
                         random_state=0,
                         n_jobs=-1).fit(X_train_poly, y_train_rf_pred)

preds = clf.predict(X_test_poly)
# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regression Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out` when available and fall back on older versions.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Monomial names use a space for products ('x0 x1'); turn it into '*'.
variable_identifier_list = [str(feature_name).replace(' ', '*') for feature_name in feature_name_getter()]

# Write coefficients with their exact float repr. The previous
# np.round(value, 300) was meant as "no rounding" but scales by 1e300
# internally and overflows to inf for large magnitudes.
coef = ' + '.join([repr(float(coefficient)) + '*' + variable_identifier for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0])])
intercept = repr(float(clf.intercept_[0]))
coef_intercept = coef + ' + ' + intercept

# Sigmoid over the weighted polynomial sum, parsed into a sympy expression.
logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

logistic_regression_function_sympy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fidelity (F1 Score) Logistic Regression Polynomial Features - Random Forest Model: 0.937180946521965
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9573915270373682
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.8652301383175963
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.9058440598484275


1/(0.9676585585857316*exp(0.04920466398198076*x0**2 - 0.14650829689050376*x0*x1 + 0.03075354993556312*x0*x2 - 0.012250895657378622*x0*x3 - 0.2798706230230561*x0 + 0.004129523845031069*x1**2 + 0.03391475955993701*x1*x2 - 0.00010103508249413183*x1*x3 - 0.3440832982965765*x1 + 0.014146010468240476*x2**2 - 0.00932836059388275*x2*x3 - 0.11713812652494542*x2 + 4.1143954734075075e-6*x3**2 + 0.02053969476926292*x3) + 1)

In [22]:
# Degree-2 polynomial expansion of the raw features (squares and pairwise
# products, no constant column), fitted on the training split only.
poly = PolynomialFeatures(degree = 2, interaction_only=False, include_bias=False)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

# Reference: logistic regression on the expanded features, trained on the
# ground-truth labels.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2', 
                         tol=0.0001, 
                         C=1.0, 
                         random_state=0,
                         n_jobs=-1).fit(X_train_poly, y_train)

preds = clf.predict(X_test_poly)
# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out` when available and fall back on older versions.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Monomial names use a space for products ('x0 x1'); turn it into '*'.
variable_identifier_list = [str(feature_name).replace(' ', '*') for feature_name in feature_name_getter()]

# Write coefficients with their exact float repr. The previous
# np.round(value, 300) was meant as "no rounding" but scales by 1e300
# internally and overflows to inf for large magnitudes.
coef = ' + '.join([repr(float(coefficient)) + '*' + variable_identifier for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0])])
intercept = repr(float(clf.intercept_[0]))
coef_intercept = coef + ' + ' + intercept

# Sigmoid over the weighted polynomial sum, parsed into a sympy expression.
logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

logistic_regression_function_sympy

  return f(*args, **kwargs)


Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9308662331580162
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9514299737341706
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.8765836012912316
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.9107797250177059


1/(0.9978786654516176*exp(0.031126070961043265*x0**2 - 0.042134708405514636*x0*x1 + 0.05754156690915437*x0*x2 - 0.004286244036417964*x0*x3 - 0.2580978683904601*x0 + 0.001354823866066021*x1**2 - 0.00973931318884627*x1*x2 + 2.9129363259691755e-5*x1*x3 - 0.169384462874112*x1 + 0.03340024202343735*x2**2 - 0.005025505749609331*x2*x3 - 0.12015782467664578*x2 + 4.103052597698609e-6*x3**2 + 0.004444205741733472*x3) + 1)

In [23]:
# Degree-3 polynomial expansion of the raw features (no constant column),
# fitted on the training split only.
poly = PolynomialFeatures(degree = 3, interaction_only=False, include_bias=False)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

# Surrogate: logistic regression on the expanded features, trained on the
# random forest's predictions.
# NOTE(review): the recorded scores of this degree-3 run are lower than the
# degree-2 run on the same data; possibly the unscaled cubic features hurt
# the optimisation -- confirm before drawing conclusions from it.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2', 
                         tol=0.0001, 
                         C=1.0, 
                         random_state=0,
                         n_jobs=-1).fit(X_train_poly, y_train_rf_pred)

preds = clf.predict(X_test_poly)
# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regression Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out` when available and fall back on older versions.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Monomial names use a space for products ('x0 x1'); turn it into '*'.
variable_identifier_list = [str(feature_name).replace(' ', '*') for feature_name in feature_name_getter()]

# Write coefficients with their exact float repr. The previous
# np.round(value, 300) was meant as "no rounding" but scales by 1e300
# internally and overflows to inf for large magnitudes.
coef = ' + '.join([repr(float(coefficient)) + '*' + variable_identifier for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0])])
intercept = repr(float(clf.intercept_[0]))
coef_intercept = coef + ' + ' + intercept

# Sigmoid over the weighted polynomial sum, parsed into a sympy expression.
logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regression Polynomial Features - Random Forest Model: 0.8709441310526813
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9138614832680617
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.8084033068825933
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.8682158295175885


1/(1.0000004897659*exp(-9.205540033175983e-5*x0**3 + 1.420859910694102e-6*x0**2*x1 - 4.137046067683137e-5*x0**2*x2 + 0.00031076902749998647*x0**2*x3 + 8.12800944527168e-6*x0**2 - 0.0007543997074441295*x0*x1**2 + 8.413160680350168e-6*x0*x1*x2 - 0.001768136631892626*x0*x1*x3 - 4.445456941335716e-5*x0*x1 - 2.0689112186820324e-5*x0*x2**2 + 0.00016718870452897094*x0*x2*x3 + 4.672875786443072e-6*x0*x2 - 6.525368821809038e-5*x0*x3**2 - 0.00023271613086312638*x0*x3 - 7.408224784279621e-6*x0 - 9.64884806887795e-5*x1**3 - 0.0003155982421590985*x1**2*x2 + 5.84274507411005e-5*x1**2*x3 - 7.388339143758112e-5*x1**2 + 4.560371895704769e-6*x1*x2**2 - 0.0007578482496253008*x1*x2*x3 - 1.913642693477143e-5*x1*x2 - 5.427017633237873e-6*x1*x3**2 - 0.0004180076560564835*x1*x3 - 5.629077887521126e-6*x1 - 1.0964997115305976e-5*x2**3 + 9.360818542735503e-5*x2**2*x3 + 2.5563398685612205e-6*x2**2 - 4.8125019787920865e-5*x2*x3**2 - 0.00010318951513777226*x2*x3 - 3.3577264564221742e-6*x2 + 6.555932521392413e-8*x3*

In [24]:
# Degree-3 polynomial expansion of the raw features (no constant column),
# fitted on the training split only.
poly = PolynomialFeatures(degree = 3, interaction_only=False, include_bias=False)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

# Reference: logistic regression on the expanded features, trained on the
# ground-truth labels.
# NOTE(review): the recorded scores of this degree-3 run are lower than the
# degree-2 run on the same data; possibly the unscaled cubic features hurt
# the optimisation -- confirm before drawing conclusions from it.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2', 
                         tol=0.0001, 
                         C=1.0, 
                         random_state=0,
                         n_jobs=-1).fit(X_train_poly, y_train)

preds = clf.predict(X_test_poly)
# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out` when available and fall back on older versions.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Monomial names use a space for products ('x0 x1'); turn it into '*'.
variable_identifier_list = [str(feature_name).replace(' ', '*') for feature_name in feature_name_getter()]

# Write coefficients with their exact float repr. The previous
# np.round(value, 300) was meant as "no rounding" but scales by 1e300
# internally and overflows to inf for large magnitudes.
coef = ' + '.join([repr(float(coefficient)) + '*' + variable_identifier for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0])])
intercept = repr(float(clf.intercept_[0]))
coef_intercept = coef + ' + ' + intercept

# Sigmoid over the weighted polynomial sum, parsed into a sympy expression.
logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

logistic_regression_function_sympy

  return f(*args, **kwargs)


Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model: 0.874729618099276
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9194773338007397
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.8053099024513487
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.8708892642762736


1/(1.0000016106431354*exp(-0.0002274839421384301*x0**3 + 1.017661883581187e-5*x0**2*x1 - 0.00010432914171845161*x0**2*x2 + 0.0009355966093813967*x0**2*x3 + 2.0825992943575626e-5*x0**2 - 0.0008699754312646064*x0*x1**2 + 1.8744244188375736e-5*x0*x1*x2 - 0.0005229717180125711*x0*x1*x3 - 5.1416383711300855e-5*x0*x1 - 5.2400464705139407e-5*x0*x2**2 + 0.0005250368543701037*x0*x2*x3 + 1.2080469821845088e-5*x0*x2 - 1.641199567315536e-5*x0*x3**2 - 0.0007047405800217966*x0*x3 - 1.7463009358849987e-5*x0 - 2.4292854561875446e-5*x1**3 - 0.00041963359923172965*x1**2*x2 - 2.2358829334282765e-5*x1**2*x3 - 6.604770082801392e-5*x1**2 + 1.1633686989751384e-5*x1*x2**2 - 0.00027678053043936503*x1*x2*x3 - 2.398127764985474e-5*x1*x2 + 9.0172298571421e-7*x1*x3**2 - 0.0003436282390803636*x1*x3 - 5.417503235470609e-6*x1 - 2.8066131040881334e-5*x2**3 + 0.00027558765781202495*x2**2*x3 + 6.490263843822449e-6*x2**2 - 1.3972110406408409e-5*x2*x3**2 - 0.00033603349921119307*x2*x3 - 8.352676775437502e-6*x2 + 1.8779616

#### function

In [25]:
# Hand-written single-feature baseline: scale the first feature, round it and
# clip the result to [0, 1] to obtain a class prediction.
# The printed labels previously said '2*X0' although the rule evaluated here
# multiplies by 10; the label is now derived from the factor actually used.
X0_SCALE = 10
rule_label = str(X0_SCALE) + '*X0'
preds = np.clip(np.round(X0_SCALE*X_test[:,0]), 0, 1)

# Fidelity: agreement with the random forest's test predictions.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) ' + rule_label + ' - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) ' + rule_label + ' - Random Forest Model:', acc_fidelity)

# Performance: agreement with the true test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) ' + rule_label + ':', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) ' + rule_label + ':', acc_performance)

Fidelity (F1 Score) 2*X0 - Random Forest Model: 0.9126073764149031
Fidelity (Accuracy) 2*X0 - Random Forest Model: 0.9402887922100429
Performance (F1 Score) 2*X0: 0.8796552490988172
Performance (Accuracy) 2*X0: 0.9153228355936179


#### random forest

In [26]:
# Accuracy of the random forest itself against the true test labels of the
# complete-data split, as the reference point for the surrogate models.
y_test_random_forest = random_forest_model.predict(X_test)
accuracy_random_forest = accuracy_score(y_test, y_test_random_forest)
print(f'Accuracy Random Forest: {accuracy_random_forest!s}')

Accuracy Random Forest: 0.9078285150504278
