In [9]:
# Cell 1: Imports, hyperparams, load + synth data, normals, poly‐feature cache
import random, re, math
import numpy as np, pandas as pd
import sympy as sp, mpmath as mp
import tensorflow as tf
from functools import lru_cache
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import (
    Input, TextVectorization, Embedding, LSTM,
    Dense, Concatenate, Lambda
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TerminateOnNaN

# symbolic variable
x = sp.symbols('x')
# hyperparams
SEQ_LEN, BATCH, LR, EPOCHS, INT_TOL = 64, 64, 1e-4, 20, 1e-6

# — real CSV
df_real = pd.read_csv("functions2.csv", header=None,
    names=["function","lower","upper","true_raw"])
for c in ("lower","upper","true_raw"):
    df_real[c] = pd.to_numeric(df_real[c], errors="coerce")
df_real.dropna(subset=["lower","upper","true_raw"], inplace=True)
df_real.reset_index(drop=True, inplace=True)

# — synthetic polynomials
@lru_cache(None)
def make_random_poly(deg):
    coeffs = [random.uniform(-5,5) for _ in range(deg+1)]
    return sum(c*x**i for i,c in enumerate(coeffs))

@lru_cache(None)
def integrate_sympy(s,a,b):
    # support '^'→'**' and 'ln('→'log('
    expr = s.replace('^','**').replace('ln(','log(')
    e = sp.sympify(expr)
    return float(sp.integrate(e, (x,a,b)))

rows = []
for _ in range(5000):
    d = random.randint(1,5)
    poly = make_random_poly(d)
    a, b = random.uniform(-3,0), random.uniform(0,3)
    s = integrate_sympy(str(poly).replace('**','^'), a, b)
    rows.append({
        "function": str(poly).replace('**','^'),
        "lower":    a,
        "upper":    b,
        "true_raw": s
    })
df_synth = pd.DataFrame(rows)

# — combine + shuffle + normalize
df = pd.concat([df_real, df_synth], axis=0).sample(frac=1, random_state=42)
y_mean, y_std = df["true_raw"].mean(), df["true_raw"].std()
l_mean, l_std = df["lower"].mean(),    df["lower"].std()
u_mean, u_std = df["upper"].mean(),    df["upper"].std()

df["y_norm"]   = (df["true_raw"] - y_mean)/y_std
df["lower_n"]  = (df["lower"]    - l_mean)/l_std
df["upper_n"]  = (df["upper"]    - u_mean)/u_std

@lru_cache(None)
def poly_feats_cached(s):
    expr = s.replace('^','**').replace('ln(','log(')
    try:
        e = sp.sympify(expr)
        p = sp.Poly(e, x)
        return tuple(float(p.coeff_monomial(x**i)) for i in range(6))
    except:
        return (0.0,)*6

df["poly_feats"] = df["function"].apply(poly_feats_cached)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
print("Total samples:", len(df))


2025-06-12 17:14:57.111659: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
  sqr = _ensure_numeric((avg - values) ** 2)


Total samples: 207252


In [10]:
# Cell 2: Build character‐level vectorizer + tf.data datasets
vectorizer = TextVectorization(
    output_mode="int",
    output_sequence_length=SEQ_LEN,
    standardize=lambda s:
      tf.strings.regex_replace(
        tf.strings.regex_replace(tf.strings.lower(s),
                                 r"\^","**"),
                                 r"ln\(","log("),
    split="character"
)
vectorizer.adapt(train_df["function"].values)

def make_ds(ddf, shuffle=True):
    X = {
        "func_input":  ddf["function"].values,
        "lower_input": ddf["lower_n"].values.astype(np.float32).reshape(-1,1),
        "upper_input": ddf["upper_n"].values.astype(np.float32).reshape(-1,1),
        "poly_feats":  np.stack(ddf["poly_feats"].values).astype(np.float32),
    }
    y = ddf["y_norm"].values.astype(np.float32)
    ds = tf.data.Dataset.from_tensor_slices((X,y))
    if shuffle: ds = ds.shuffle(len(ddf))
    return ds.batch(BATCH).cache().prefetch(tf.data.AUTOTUNE)

train_ds = make_ds(train_df, shuffle=True)
val_ds   = make_ds(val_df,   shuffle=False)


  "poly_feats":  np.stack(ddf["poly_feats"].values).astype(np.float32),


In [11]:
# Cell 3: Define 3‐branch model (chars→LSTM, bounds, poly‐coeffs)
f_in = Input(shape=(), dtype=tf.string, name="func_input")
toks = vectorizer(f_in)
toks = Lambda(lambda t: tf.cast(t,tf.int32))(toks)
c = Embedding(vectorizer.vocabulary_size(),32,mask_zero=True)(toks)
c = LSTM(32)(c)

l_in = Input((1,),dtype=tf.float32,name="lower_input")
u_in = Input((1,),dtype=tf.float32,name="upper_input")
b   = Concatenate()([l_in,u_in])
b   = Dense(16,activation="relu")(b)
b   = Dense(8, activation="relu")(b)

p_in = Input((6,),dtype=tf.float32,name="poly_feats")
p   = Dense(32,activation="relu")(p_in)
p   = Dense(16,activation="relu")(p)

m = Concatenate()([c,b,p])
m = Dense(64,activation="relu")(m)
m = Dense(32,activation="relu")(m)
out = Dense(1,activation="linear",name="pred")(m)

model = Model([f_in,l_in,u_in,p_in], out)
model.compile(Adam(LR,clipnorm=1.0), "huber", metrics=["mae"])
model.summary()


In [12]:
# Cell 4: Train & evaluate
model.fit(train_ds, validation_data=val_ds,
          epochs=EPOCHS, callbacks=[TerminateOnNaN()])
loss, mae = model.evaluate(val_ds)
print(f"Val Huber={loss:.4f}, MAE(norm)={mae:.4f}")


Epoch 1/20
Batch 0: Invalid loss, terminating training
[1m2591/2591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
[1m648/648[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - loss: nan - mae: nan
Val Huber=nan, MAE(norm)=nan


In [13]:
# Cell 5: Safe inference → NN (dict), then Sympy/mpmath, then trapz
def safe_integral(func_str, a, b, tol=INT_TOL):
    expr_in = func_str.replace('^','**').replace('ln(','log(')
    ln_n = (a - l_mean)/l_std
    un_n = (b - u_mean)/u_std
    pf   = np.array(poly_feats_cached(func_str),dtype=np.float32)[None]

    # NN prediction
    inp = {
        "func_input":  np.array([func_str],dtype=object),
        "lower_input": np.array([[ln_n]],dtype=np.float32),
        "upper_input": np.array([[un_n]],dtype=np.float32),
        "poly_feats":  pf
    }
    r = model.predict(inp, verbose=0)[0,0]
    p = r*y_std + y_mean
    if not np.isnan(p) and abs(p - round(p)) < tol:
        return round(p)

    # exact Sympy
    try:
        e = sp.sympify(expr_in)
        return float(sp.integrate(e, (x,a,b)))
    except:
        pass

    # numeric mpmath
    try:
        e = sp.sympify(expr_in, convert_xor=True)
        fmp = sp.lambdify(x, e, 'mpmath')
        return float(mp.quad(fmp, [a,b]))
    except:
        pass

    # fallback trapezoid
    safe = re.sub(r'(?<=\d)(?=[A-Za-z\(])','*', expr_in)
    f = lambda v: eval(safe, {"x":v, **math.__dict__})
    xs = np.linspace(a,b,2000)
    ys = [f(v) for v in xs]
    return float(np.trapz(ys, xs))


In [16]:
# Cell 6: Interactive prompt (fractions in func, ln(), etc. all work)
def predict_interactive():
    f = input("Function (e.g. x^2/(x^3-1)+ln(x)): ")
    a = float(input("Lower bound: "))
    b = float(input("Upper bound: "))
    print("Result ≃", safe_integral(f,a,b))

predict_interactive()


Result ≃ 1.2958368660043291
