In [1]:
import joblib
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

from IPython.display import display

from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV

import seaborn as sns



import sympy as sym
from sympy import Symbol, sympify, lambdify, abc, SympifyError

from gplearn.genetic import SymbolicClassifier
from gplearn.genetic import SymbolicRegressor
from sympy import *

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
import types
import random

import graphviz

In [2]:
# Show the installed joblib version; joblib is the library used further down
# to load the pickled random forest model.
joblib.__version__

'1.0.1'

In [3]:
# Seed Python's and NumPy's global random number generators with the same
# value so that stochastic steps in this notebook are repeatable.
random.seed(42)
np.random.seed(42)

## Load Data

In [4]:
# Read the replica pump dataset from disk and print its (rows, columns) shape.
path = "./data/replica_pump_data.csv"
pump_data_replica = pd.read_csv(filepath_or_buffer=path)
print(pump_data_replica.shape)

(26381042, 5)


In [5]:
# Preview the first rows to check the column layout of the loaded frame.
pump_data_replica.head()

Unnamed: 0,energy_norm_log,temperature_diff,rms_norm_log,details_ratedhead,state
0,-4.642337,-0.585072,-2.831278,47.369469,1
1,-4.38441,-2.051363,-2.900545,120.240341,1
2,-5.047895,2.10473,-2.74272,92.577971,1
3,-4.962318,0.375291,-2.975236,75.714544,1
4,-5.08234,-1.878716,-2.900094,19.732252,1


## Load Random Forest Model

In [6]:
# Restore the previously trained random forest from its pickle file.
# NOTE(review): joblib/pickle files can run arbitrary code while loading;
# only load model files from a trusted source.
with open("./data/randForestBest_20201002.pkl", mode='rb') as f:
    random_forest_model = joblib.load(f)



## Logistic Regression

### Sampled Data

#### basic

In [7]:

# Work on a fixed-size random subsample of the full dataset.
# The sample is drawn with an explicit random_state so that re-running this
# cell (or running cells out of order) always yields the same rows instead of
# depending on the current state of NumPy's global random generator.
data = pump_data_replica.sample(n=100_000, random_state=42)

# Features are every column except 'state'; the target is the 'state' column
# as a flat (1-D) array.
X_data = data.drop(columns=['state']).values
y_data = data['state'].values

# Hold out 25% of the sample for testing, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=42)

In [8]:
# Display the training labels of the current split.
y_train

array([0, 0, 0, ..., 0, 1, 1])

In [9]:
# Random forest predictions for both splits (train first, then test); these
# serve as the targets / reference when measuring surrogate fidelity.
y_train_rf_pred, y_test_rf_pred = (
    random_forest_model.predict(features) for features in (X_train, X_test)
)

In [10]:
# Surrogate model: logistic regression fitted to the random forest's
# predictions for the training split (not to the 'state' labels).
clf = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    tol=0.0001,
    C=1.0,
    random_state=0,
    n_jobs=-1,
)
clf.fit(X_train, y_train_rf_pred)
preds = clf.predict(X_test)

# Fidelity: agreement with the random forest's predictions on the test split.
# Performance: agreement with the test labels.
f1_fidelity = f1_score(y_test_rf_pred, preds)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
f1_performance = f1_score(y_test, preds)
acc_performance = accuracy_score(y_test, preds)

print('Fidelity (F1 Score) Logistic Regression - Random Forest Model:', f1_fidelity)
print('Fidelity (Accuracy) Logistic Regression - Random Forest Model:', acc_fidelity)
print('Performance (F1 Score) Logistic Regression:', f1_performance)
print('Performance (Accuracy) Logistic Regression:', acc_performance)

# Spell the fitted model out as a sigmoid over x_0, x_1, ... with the
# coefficients and intercept rounded to three decimals, then parse it with sympy.
coef = ' + '.join(
    str(np.round(coefficient, 3)) + '*x_' + str(i)
    for i, coefficient in enumerate(clf.coef_[0])
)
intercept = str(np.round(clf.intercept_[0], 3))
coef_intercept = ' + '.join([coef, intercept])

logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

# Last expression of the cell: rendered as the cell output.
logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regression - Random Forest Model: 0.9265588386580558
Fidelity (Accuracy) Logistic Regression - Random Forest Model: 0.9486
Performance (F1 Score) Logistic Regression: 0.8877902455282746
Performance (Accuracy) Logistic Regression: 0.9192


1/(1.4549914146182*exp(-1.657*x_0 - 0.096*x_1 - 0.319*x_2 + 0.003*x_3) + 1)

In [11]:
# Same logistic regression setup, but fitted directly to the 'state' labels
# of the training split instead of the random forest's predictions.
clf = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    tol=0.0001,
    C=1.0,
    random_state=0,
    n_jobs=-1,
)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# Fidelity: agreement with the random forest's predictions on the test split.
# Performance: agreement with the test labels.
f1_fidelity = f1_score(y_test_rf_pred, preds)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
f1_performance = f1_score(y_test, preds)
acc_performance = accuracy_score(y_test, preds)

print('Fidelity (F1 Score) Logistic Regression - Random Forest Model:', f1_fidelity)
print('Fidelity (Accuracy) Logistic Regression - Random Forest Model:', acc_fidelity)
print('Performance (F1 Score) Logistic Regression:', f1_performance)
print('Performance (Accuracy) Logistic Regression:', acc_performance)

# Spell the fitted model out as a sigmoid over x_0, x_1, ... with the
# coefficients and intercept rounded to three decimals, then parse it with sympy.
coef = ' + '.join(
    str(np.round(coefficient, 3)) + '*x_' + str(i)
    for i, coefficient in enumerate(clf.coef_[0])
)
intercept = str(np.round(clf.intercept_[0], 3))
coef_intercept = ' + '.join([coef, intercept])

logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

# Last expression of the cell: rendered as the cell output.
logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regression - Random Forest Model: 0.9267666908360755
Fidelity (Accuracy) Logistic Regression - Random Forest Model: 0.94748
Performance (F1 Score) Logistic Regression: 0.8922642942389063
Performance (Accuracy) Logistic Regression: 0.92056


1/(0.94082323977601*exp(-0.943*x_0 - 0.06*x_1 - 0.785*x_2 + 0.002*x_3) + 1)

#### polynomial features

In [12]:
# Expand the features with all degree-2 polynomial terms (no bias column).
# The expansion is fitted on the training split and applied to both splits.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Surrogate model: logistic regression on the expanded features, fitted to the
# random forest's predictions for the training split.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2',
                         tol=0.0001,
                         C=1.0,
                         random_state=0,
                         n_jobs=-1)
clf.fit(X_train_poly, y_train_rf_pred)

preds = clf.predict(X_test_poly)

# Fidelity: agreement with the random forest's predictions on the test split.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use `get_feature_names_out` when the installed version provides it and fall
# back to the old accessor otherwise, so the cell runs on both.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Product terms are separated by a space in the generated names; turn that
# into an explicit '*' so the term can be parsed as an expression.
variable_identifier_list = [str(name).replace(' ', '*') for name in feature_name_getter()]

# Coefficients are written out with str(), without extra rounding: the former
# np.round(..., 300) had no practical rounding effect on float64 values.
coef = ' + '.join(str(coefficient) + '*' + variable_identifier
                  for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0]))
intercept = str(clf.intercept_[0])
coef_intercept = coef + ' + ' + intercept

logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

# Last expression of the cell: rendered as the cell output.
logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model: 0.939836448598131
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9588
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.8650212765957447
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.90484


1/(0.9659024892452506*exp(-0.012386654575196278*x0**2 - 0.15407804247971868*x0*x1 + 0.0035209866203692675*x0*x2 - 0.011509519787982037*x0*x3 - 0.24148277446681493*x0 + 0.003513467481004637*x1**2 + 0.03969601071605454*x1*x2 - 0.00017725754260900685*x1*x3 - 0.30377841117305615*x1 + 0.0015257266999313462*x2**2 - 0.00996244339743366*x2*x3 - 0.09771645122257876*x2 - 1.8927771694089615e-6*x3**2 + 0.02352325108234961*x3) + 1)

In [13]:
# Sanity check: evaluate the symbolic formula on the first five test rows and
# compare the values with the probabilities reported by the fitted classifier.
X = X_test_poly[:5]

# Substitute only the symbols that actually occur in the expression and look
# each one up by name among the polynomial feature columns. Creating symbols
# from product-term names (e.g. 'x0*x1') would yield symbols that never appear
# in the expression, and would make the check depend on the column order.
function_vars = sorted(logistic_regression_function_sympy.free_symbols, key=lambda symbol: symbol.name)
column_indices = [variable_identifier_list.index(var.name) for var in function_vars]

function_values = []
for data_point in X:
    substitutions = {var: data_point[column] for var, column in zip(function_vars, column_indices)}
    function_value = logistic_regression_function_sympy.evalf(subs=substitutions)
    try:
        function_value = float(function_value)
    except TypeError:
        # The evaluated expression could not be converted to a float; keep the
        # existing convention of marking such rows with inf.
        function_value = np.inf
    function_values.append(function_value)
Y_est = function_values
print(Y_est)
# Probability of the second class (column 1), kept as a column.
print(clf.predict_proba(X)[:,1:])

[0.0002263698898328147, 0.9992708041271765, 0.9998493483075224, 0.9944529589541462, 0.8465446861688402]
[[2.26369890e-04]
 [9.99270804e-01]
 [9.99849348e-01]
 [9.94452959e-01]
 [8.46544686e-01]]


In [14]:
# Expand the features with all degree-2 polynomial terms (no bias column).
# The expansion is fitted on the training split and applied to both splits.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Logistic regression on the expanded features, fitted to the 'state' labels
# of the training split.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2',
                         tol=0.0001,
                         C=1.0,
                         random_state=0,
                         n_jobs=-1)
clf.fit(X_train_poly, y_train)

preds = clf.predict(X_test_poly)

# Fidelity: agreement with the random forest's predictions on the test split.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use `get_feature_names_out` when the installed version provides it and fall
# back to the old accessor otherwise, so the cell runs on both.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Product terms are separated by a space in the generated names; turn that
# into an explicit '*' so the term can be parsed as an expression.
variable_identifier_list = [str(name).replace(' ', '*') for name in feature_name_getter()]

# Coefficients are written out with str(), without extra rounding: the former
# np.round(..., 300) had no practical rounding effect on float64 values.
coef = ' + '.join(str(coefficient) + '*' + variable_identifier
                  for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0]))
intercept = str(clf.intercept_[0])
coef_intercept = coef + ' + ' + intercept

logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

# Last expression of the cell: rendered as the cell output.
logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9329530839344629
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.95204
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.8778551229062432
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.91016


1/(0.99756791936098441*exp(0.026657839314309335*x0**2 - 0.03854969575786857*x0*x1 + 0.055926191845977774*x0*x2 - 0.004375097256243473*x0*x3 - 0.26155658418757943*x0 + 0.0014470980602936023*x1**2 - 0.009920175805347292*x1*x2 + 3.5558591566098e-5*x1*x3 - 0.16893588642723195*x1 + 0.03268949251072916*x2**2 - 0.005557611808121668*x2*x3 - 0.12288380769309638*x2 + 4.999007259803651e-6*x3**2 + 0.004436618616761277*x3) + 1)

#### function

In [15]:
# Hand-written baseline using only the first feature: scale it, round, and
# clip to [0, 1]. The prediction is 1 when the rounded product is at least 1
# and 0 otherwise.
X0_SCALE = 10
preds = np.clip(np.round(X0_SCALE * X_test[:, 0]), 0, 1)

# Build the printed name from the factor that is actually applied, so the
# label cannot drift away from the rule (the prints used to say '2*X0' while
# the code multiplied by 10).
rule_label = str(X0_SCALE) + '*X0'

# Fidelity: agreement with the random forest's predictions on the test split.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) ' + rule_label + ' - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) ' + rule_label + ' - Random Forest Model:', acc_fidelity)

# Performance: agreement with the test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) ' + rule_label + ':', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) ' + rule_label + ':', acc_performance)

Fidelity (F1 Score) 2*X0 - Random Forest Model: 0.9130008648025367
Fidelity (Accuracy) 2*X0 - Random Forest Model: 0.93964
Performance (F1 Score) 2*X0: 0.8815686274509804
Performance (Accuracy) 2*X0: 0.91544


#### random forest

In [16]:
# Reference point: accuracy of the random forest itself against the test labels.
y_test_random_forest = random_forest_model.predict(X_test)
accuracy_random_forest = accuracy_score(y_true=y_test, y_pred=y_test_random_forest)
print('Accuracy Random Forest: ', accuracy_random_forest, sep='')

Accuracy Random Forest: 0.90676


### Complete Data

In [17]:
# Use the complete dataset (no subsampling) for this section.
data = pump_data_replica

# Features are every column except 'state'. The target is taken as a flat
# (1-D) array, matching the sampled-data section; selecting it with
# data[['state']] produced a column vector, which scikit-learn estimators
# warn about when it is passed as y.
X_data = data.drop(columns=['state']).values
y_data = data['state'].values

# Hold out 25% of the rows for testing, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=42)

In [18]:
# Random forest predictions for both splits (train first, then test); these
# serve as the targets / reference when measuring surrogate fidelity.
y_train_rf_pred, y_test_rf_pred = (
    random_forest_model.predict(features) for features in (X_train, X_test)
)

#### basic

In [19]:
# Surrogate model: logistic regression fitted to the random forest's
# predictions for the training split (not to the 'state' labels).
clf = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    tol=0.0001,
    C=1.0,
    random_state=0,
    n_jobs=-1,
)
clf.fit(X_train, y_train_rf_pred)
preds = clf.predict(X_test)

# Fidelity: agreement with the random forest's predictions on the test split.
# Performance: agreement with the test labels.
f1_fidelity = f1_score(y_test_rf_pred, preds)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
f1_performance = f1_score(y_test, preds)
acc_performance = accuracy_score(y_test, preds)

print('Fidelity (F1 Score) Logistic Regression - Random Forest Model:', f1_fidelity)
print('Fidelity (Accuracy) Logistic Regression - Random Forest Model:', acc_fidelity)
print('Performance (F1 Score) Logistic Regression:', f1_performance)
print('Performance (Accuracy) Logistic Regression:', acc_performance)

# Spell the fitted model out as a sigmoid over x_0, x_1, ... with the
# coefficients and intercept rounded to three decimals, then parse it with sympy.
coef = ' + '.join(
    str(np.round(coefficient, 3)) + '*x_' + str(i)
    for i, coefficient in enumerate(clf.coef_[0])
)
intercept = str(np.round(clf.intercept_[0], 3))
coef_intercept = ' + '.join([coef, intercept])

logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

# Last expression of the cell: rendered as the cell output.
logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regression - Random Forest Model: 0.9257077296584201
Fidelity (Accuracy) Logistic Regression - Random Forest Model: 0.9488276506418776
Performance (F1 Score) Logistic Regression: 0.887081469837777
Performance (Accuracy) Logistic Regression: 0.9199218954337061


1/(1.43619894196035*exp(-1.684*x_0 - 0.093*x_1 - 0.31*x_2 + 0.003*x_3) + 1)

In [20]:
# Same logistic regression setup, but fitted directly to the 'state' labels
# of the training split instead of the random forest's predictions.
# NOTE(review): the stored output of this cell shows the optimizer stopping at
# the iteration limit; the warning suggests raising max_iter or scaling the
# data, so the coefficients below may not be fully converged.
clf = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    tol=0.0001,
    C=1.0,
    random_state=0,
    n_jobs=-1,
)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# Fidelity: agreement with the random forest's predictions on the test split.
# Performance: agreement with the test labels.
f1_fidelity = f1_score(y_test_rf_pred, preds)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
f1_performance = f1_score(y_test, preds)
acc_performance = accuracy_score(y_test, preds)

print('Fidelity (F1 Score) Logistic Regression - Random Forest Model:', f1_fidelity)
print('Fidelity (Accuracy) Logistic Regression - Random Forest Model:', acc_fidelity)
print('Performance (F1 Score) Logistic Regression:', f1_performance)
print('Performance (Accuracy) Logistic Regression:', acc_performance)

# Spell the fitted model out as a sigmoid over x_0, x_1, ... with the
# coefficients and intercept rounded to three decimals, then parse it with sympy.
coef = ' + '.join(
    str(np.round(coefficient, 3)) + '*x_' + str(i)
    for i, coefficient in enumerate(clf.coef_[0])
)
intercept = str(np.round(clf.intercept_[0], 3))
coef_intercept = ' + '.join([coef, intercept])

logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

# Last expression of the cell: rendered as the cell output.
logistic_regression_function_sympy

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fidelity (F1 Score) Logistic Regression - Random Forest Model: 0.9271903410829246
Fidelity (Accuracy) Logistic Regression - Random Forest Model: 0.9485808067338047
Performance (F1 Score) Logistic Regression: 0.8923695003720781
Performance (Accuracy) Logistic Regression: 0.9217974845877972


1/(0.946485147953484*exp(-0.961*x_0 - 0.059*x_1 - 0.744*x_2 + 0.002*x_3) + 1)

#### polynomial features

In [21]:
# Expand the features with all degree-2 polynomial terms (no bias column).
# The expansion is fitted on the training split and applied to both splits.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Surrogate model: logistic regression on the expanded features, fitted to the
# random forest's predictions for the training split.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2',
                         tol=0.0001,
                         C=1.0,
                         random_state=0,
                         n_jobs=-1)
clf.fit(X_train_poly, y_train_rf_pred)

preds = clf.predict(X_test_poly)

# Fidelity: agreement with the random forest's predictions on the test split.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regression Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use `get_feature_names_out` when the installed version provides it and fall
# back to the old accessor otherwise, so the cell runs on both.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Product terms are separated by a space in the generated names; turn that
# into an explicit '*' so the term can be parsed as an expression.
variable_identifier_list = [str(name).replace(' ', '*') for name in feature_name_getter()]

# Coefficients are written out with str(), without extra rounding: the former
# np.round(..., 300) had no practical rounding effect on float64 values.
coef = ' + '.join(str(coefficient) + '*' + variable_identifier
                  for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0]))
intercept = str(clf.intercept_[0])
coef_intercept = coef + ' + ' + intercept

logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

# Last expression of the cell: rendered as the cell output.
logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regression Polynomial Features - Random Forest Model: 0.9390044399747365
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9588381718327751
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.8700849414247459
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.9096827252173947


1/(0.9302691313554123*exp(0.05725573523869259*x0**2 - 0.24762088466722346*x0*x1 + 0.03473811553535881*x0*x2 - 0.005397006942481776*x0*x3 - 0.7124400215706337*x0 + 0.0014154508585449606*x1**2 + 0.0976229795617102*x1*x2 + 0.0001033927449932151*x1*x3 - 0.29327694836096946*x1 + 0.021300181038441127*x2**2 - 0.011031777120725459*x2*x3 - 0.29874458127519593*x2 - 3.7756524358565736e-6*x3**2 + 0.019400874359348078*x3) + 1)

In [22]:
# Expand the features with all degree-2 polynomial terms (no bias column).
# The expansion is fitted on the training split and applied to both splits.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Logistic regression on the expanded features, fitted to the 'state' labels
# of the training split.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2',
                         tol=0.0001,
                         C=1.0,
                         random_state=0,
                         n_jobs=-1)
clf.fit(X_train_poly, y_train)

preds = clf.predict(X_test_poly)

# Fidelity: agreement with the random forest's predictions on the test split.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use `get_feature_names_out` when the installed version provides it and fall
# back to the old accessor otherwise, so the cell runs on both.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Product terms are separated by a space in the generated names; turn that
# into an explicit '*' so the term can be parsed as an expression.
variable_identifier_list = [str(name).replace(' ', '*') for name in feature_name_getter()]

# Coefficients are written out with str(), without extra rounding: the former
# np.round(..., 300) had no practical rounding effect on float64 values.
coef = ' + '.join(str(coefficient) + '*' + variable_identifier
                  for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0]))
intercept = str(clf.intercept_[0])
coef_intercept = coef + ' + ' + intercept

logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

# Last expression of the cell: rendered as the cell output.
logistic_regression_function_sympy

  return f(*args, **kwargs)


Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9309141365696707
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9514580241782699
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.8766664443692268
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.9108296093209958


1/(0.99776471015717396*exp(0.0313915099696629*x0**2 - 0.04371246113089017*x0*x1 + 0.05865400782941809*x0*x2 - 0.004098624641049362*x0*x3 - 0.26540609224316764*x0 + 0.0013634931461093595*x1**2 - 0.009314100528232337*x1*x2 + 3.237221315525861e-5*x1*x3 - 0.1701177507397923*x1 + 0.03401984692395202*x2**2 - 0.005191755446501386*x2*x3 - 0.12348228993584681*x2 + 3.994845900842643e-6*x3**2 + 0.004342889365317119*x3) + 1)

In [23]:
# Expand the features with all polynomial terms up to degree 3 (no bias
# column). The expansion is fitted on the training split and applied to both
# splits.
poly = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Surrogate model: logistic regression on the expanded features, fitted to the
# random forest's predictions for the training split.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2',
                         tol=0.0001,
                         C=1.0,
                         random_state=0,
                         n_jobs=-1)
clf.fit(X_train_poly, y_train_rf_pred)

preds = clf.predict(X_test_poly)

# Fidelity: agreement with the random forest's predictions on the test split.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regression Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use `get_feature_names_out` when the installed version provides it and fall
# back to the old accessor otherwise, so the cell runs on both.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Product terms are separated by a space in the generated names; turn that
# into an explicit '*' so the term can be parsed as an expression.
variable_identifier_list = [str(name).replace(' ', '*') for name in feature_name_getter()]

# Coefficients are written out with str(), without extra rounding: the former
# np.round(..., 300) had no practical rounding effect on float64 values.
coef = ' + '.join(str(coefficient) + '*' + variable_identifier
                  for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0]))
intercept = str(clf.intercept_[0])
coef_intercept = coef + ' + ' + intercept

logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

# Last expression of the cell: rendered as the cell output.
logistic_regression_function_sympy

Fidelity (F1 Score) Logistic Regression Polynomial Features - Random Forest Model: 0.8714735808231988
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9142658645351563
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.8089378746906527
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.8686593297823998


1/(1.000000492225927*exp(-9.23633471934513e-5*x0**3 + 1.5807543945835542e-6*x0**2*x1 - 4.1509423347920933e-5*x0**2*x2 + 0.00031164575813618083*x0**2*x3 + 8.17100393905537e-6*x0**2 - 0.0007574033601676178*x0*x1**2 + 8.527169147573101e-6*x0*x1*x2 - 0.0017782365078487657*x0*x1*x3 - 4.480970919177078e-5*x0*x1 - 2.0759828869959416e-5*x0*x2**2 + 0.00016750043799227017*x0*x2*x3 + 4.696894371656872e-6*x0*x2 - 6.834432525005995e-5*x0*x3**2 - 0.00023275120647287887*x0*x3 - 7.437724024261284e-6*x0 - 0.00010074156231792606*x1**3 - 0.0003168964679618513*x1**2*x2 + 5.901933145298026e-5*x1**2*x3 - 7.458389836440907e-5*x1**2 + 4.613587027222967e-6*x1*x2**2 - 0.0007624054813891951*x1*x2*x3 - 1.929782612391575e-5*x1*x2 - 5.482782994495632e-6*x1*x3**2 - 0.0004194554851889597*x1*x3 - 5.649875320461432e-6*x1 - 1.1002938836571546e-5*x2**3 + 9.383210063035078e-5*x2**2*x3 + 2.5697129875376507e-6*x2**2 - 4.491557266570163e-5*x2*x3**2 - 0.00010315460640368964*x2*x3 - 3.3709719295767436e-6*x2 + 6.721416031182079

In [24]:
# Expand the features with all polynomial terms up to degree 3 (no bias
# column). The expansion is fitted on the training split and applied to both
# splits.
poly = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Logistic regression on the expanded features, fitted to the 'state' labels
# of the training split.
clf = LogisticRegression(max_iter=1000,
                         penalty='l2',
                         tol=0.0001,
                         C=1.0,
                         random_state=0,
                         n_jobs=-1)
clf.fit(X_train_poly, y_train)

preds = clf.predict(X_test_poly)

# Fidelity: agreement with the random forest's predictions on the test split.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model:', acc_fidelity)

# Performance: agreement with the test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) Logistic Regressionn Polynomial Features:', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) Logistic Regressionn Polynomial Features:', acc_performance)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use `get_feature_names_out` when the installed version provides it and fall
# back to the old accessor otherwise, so the cell runs on both.
feature_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# Product terms are separated by a space in the generated names; turn that
# into an explicit '*' so the term can be parsed as an expression.
variable_identifier_list = [str(name).replace(' ', '*') for name in feature_name_getter()]

# Coefficients are written out with str(), without extra rounding: the former
# np.round(..., 300) had no practical rounding effect on float64 values.
coef = ' + '.join(str(coefficient) + '*' + variable_identifier
                  for variable_identifier, coefficient in zip(variable_identifier_list, clf.coef_[0]))
intercept = str(clf.intercept_[0])
coef_intercept = coef + ' + ' + intercept

logistic_regression_function = '1/(1+exp(-(' + coef_intercept + ')))'
logistic_regression_function_sympy = sympify(logistic_regression_function)

# Last expression of the cell: rendered as the cell output.
logistic_regression_function_sympy

  return f(*args, **kwargs)


Fidelity (F1 Score) Logistic Regressionn Polynomial Features - Random Forest Model: 0.8583692683995165
Fidelity (Accuracy) Logistic Regressionn Polynomial Features - Random Forest Model: 0.9033190650074349
Performance (F1 Score) Logistic Regressionn Polynomial Features: 0.7998877587840507
Performance (Accuracy) Logistic Regressionn Polynomial Features: 0.8593218676258605


1/(1.000000239899468*exp(-3.987240290103183e-5*x0**3 + 9.038463986333095e-7*x0**2*x1 - 1.8171759889124487e-5*x0**2*x2 + 0.00016588439333354777*x0**2*x3 + 3.4479068946158208e-6*x0**2 - 0.0002999692246696982*x0*x1**2 + 4.182136565111579e-6*x0*x1*x2 - 0.0006591348357414004*x0*x1*x3 - 1.7371131296959287e-5*x0*x1 - 9.08175037821819e-6*x0*x2**2 + 9.441691202991266e-5*x0*x2*x3 + 2.068431533372318e-6*x0*x2 - 2.0795985919481787e-5*x0*x3**2 - 0.00012005588600760279*x0*x3 - 3.0674408451190046e-6*x0 - 1.2823992603799582e-5*x1**3 - 0.0001363484155795731*x1**2*x2 - 2.8992809356304144e-5*x1**2*x3 - 1.9106064788095217e-5*x1**2 + 2.7257227168321305e-6*x1*x2**2 - 0.0002944331200464126*x1*x2*x3 - 7.890795379721237e-6*x1*x2 + 3.5603366204827266e-7*x1*x3**2 - 5.0188410445643023e-5*x1*x3 - 9.432858392023209e-7*x1 - 4.837304319938414e-6*x2**3 + 4.976549666060826e-5*x2**2*x3 + 1.125358348925249e-6*x2**2 - 1.4858812265037941e-5*x2*x3**2 - 5.654044164350087e-5*x2*x3 - 1.455192818426714e-6*x2 + 1.330620528920673

#### function

In [25]:
# Hand-written baseline using only the first feature: scale it, round, and
# clip to [0, 1]. The prediction is 1 when the rounded product is at least 1
# and 0 otherwise.
X0_SCALE = 10
preds = np.clip(np.round(X0_SCALE * X_test[:, 0]), 0, 1)

# Build the printed name from the factor that is actually applied, so the
# label cannot drift away from the rule (the prints used to say '2*X0' while
# the code multiplied by 10).
rule_label = str(X0_SCALE) + '*X0'

# Fidelity: agreement with the random forest's predictions on the test split.
f1_fidelity = f1_score(y_test_rf_pred, preds)
print('Fidelity (F1 Score) ' + rule_label + ' - Random Forest Model:', f1_fidelity)
acc_fidelity = accuracy_score(y_test_rf_pred, preds)
print('Fidelity (Accuracy) ' + rule_label + ' - Random Forest Model:', acc_fidelity)

# Performance: agreement with the test labels.
f1_performance = f1_score(y_test, preds)
print('Performance (F1 Score) ' + rule_label + ':', f1_performance)
acc_performance = accuracy_score(y_test, preds)
print('Performance (Accuracy) ' + rule_label + ':', acc_performance)

Fidelity (F1 Score) 2*X0 - Random Forest Model: 0.9126073764149031
Fidelity (Accuracy) 2*X0 - Random Forest Model: 0.9402887922100429
Performance (F1 Score) 2*X0: 0.8796552490988172
Performance (Accuracy) 2*X0: 0.9153228355936179


#### random forest

In [26]:
# Reference point: accuracy of the random forest itself against the test labels.
y_test_random_forest = random_forest_model.predict(X_test)
accuracy_random_forest = accuracy_score(y_true=y_test, y_pred=y_test_random_forest)
print('Accuracy Random Forest: ', accuracy_random_forest, sep='')

Accuracy Random Forest: 0.9078285150504278
