## Using scikit-learn to prune features
Generate some data, then prune features against both a continuous target and a discrete binary target.



In [41]:
import numpy as np
from sklearn.feature_selection import f_regression, f_classif, mutual_info_regression
from sklearn.datasets import make_regression

In [54]:
from sklearn.feature_selection import SelectKBest

In [4]:
import polars as pl

### This is the function I wrote a while back, before I realized that `f_regression` actually accepts a NumPy array with multiple feature columns
I kept it to sanity-check that the per-feature and all-at-once approaches are equivalent.

In [33]:
def evaluate_feature(df, feature, target):
    """Score a single feature column against the target with ``f_regression``.

    Single-column counterpart of calling ``f_regression`` on the whole feature
    matrix; kept so the two approaches can be compared.

    Parameters
    ----------
    df
        Table supporting ``df[name].to_numpy()`` column access.
    feature
        Name of the column to score.
    target
        Name of the column used as the regression target.

    Returns
    -------
    numpy.ndarray
        The F-statistic array returned by ``f_regression`` for the one
        feature; the accompanying p-value is discarded.
    """
    # f_regression wants a 2-D (n_samples, n_features) matrix, so the single
    # column is turned into an (n_samples, 1) array.
    X = df[feature].to_numpy().reshape(-1, 1)
    y = df[target].to_numpy()
    f_value, _ = f_regression(X, y)
    return f_value


In [47]:

# Seeded generator so a fresh "Restart & Run All" rebuilds the same data.
# NOTE: outputs recorded in this notebook came from an earlier unseeded run,
# so the exact numbers will shift slightly once the cells are re-executed.
rng = np.random.default_rng(42)

features = ["c1", "c2", "c3", "c4", "c5"]
X = rng.random(size=(10000, len(features)))
df = pl.from_numpy(X, schema=features)
# Continuous target: only c1, c2 and c3 contribute; c4 and c5 are pure noise.
df = df.with_columns(
    (pl.col("c1") + pl.col("c2") + pl.col("c3")).alias("y"),
)
# Binary target: 1 when the continuous target reaches 2, else 0.
df = df.with_columns(
    ((pl.col("y") >= 2).cast(pl.Int8)).alias("target")
)
# Class balance of the binary target.
df.group_by("target").len()

target,len
i8,u32
0,8337
1,1663


In [48]:
# Score "y" against itself first, then every feature, one column at a time.
[evaluate_feature(df, column, "y") for column in ["y", *features]]

[array([-3.75224909e+18]),
 array([4935.67950579]),
 array([4823.40828645]),
 array([4946.46681529]),
 array([0.88096144]),
 array([1.8142133])]

In [49]:
# Same scoring in a single call over every feature column (continuous target).
f_regression(X=df.select(features).to_numpy(), y=df["y"].to_numpy())

(array([4.93567951e+03, 4.82340829e+03, 4.94646682e+03, 8.80961436e-01,
        1.81421330e+00]),
 array([0.        , 0.        , 0.        , 0.34796116, 0.17803416]))

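#### Showing that `f_classif` with a continuous target indeed does not work
ANOVA compares the variance between classes, and a regression problem has no classes: every distinct `y` value is treated as its own one-sample class, leaving zero within-class degrees of freedom, so all F-statistics and p-values come back as NaN.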

In [52]:
# Deliberately feeds the continuous "y" column in as if it were class labels.
f_classif(X=df.select(features).to_numpy(), y=df["y"].to_numpy())

  msw = sswn / float(dfwn)


(array([nan, nan, nan, nan, nan]), array([nan, nan, nan, nan, nan]))

In [50]:
# ANOVA F-test of every feature column against the binary "target" labels.
f_classif(X=df.select(features).to_numpy(), y=df["target"].to_numpy())

(array([1.73154223e+03, 1.78842297e+03, 1.73917650e+03, 3.51022636e-01,
        2.75823180e-01]),
 array([0.        , 0.        , 0.        , 0.55354821, 0.59946366]))

#### However, `f_regression` with a binary target is fine (it gives the same F-statistics and p-values as `f_classif`)

In [53]:
# Regression F-test of every feature column against the 0/1 "target" column.
f_regression(X=df.select(features).to_numpy(), y=df["target"].to_numpy())

(array([1.73154223e+03, 1.78842297e+03, 1.73917650e+03, 3.51022636e-01,
        2.75823180e-01]),
 array([0.        , 0.        , 0.        , 0.55354821, 0.59946366]))

## Redo using `sklearn.datasets.make_regression`

In [58]:
# Synthetic regression data: 5 features requested, 2 of them informative.
X, y = make_regression(
    n_samples=10_000,
    n_features=5,
    n_informative=2,
    noise=1e-4,
    random_state=42,
)

In [63]:
# Keep both results; only the F-statistics are displayed by this cell.
f_statistic, p_values = f_regression(X=X, y=y)
f_statistic

array([4.11657999e-02, 1.84012917e+08, 1.33793053e+00, 2.41241615e-01,
       2.75484116e+00])

In [64]:
# Plain Python floats so the scores print at full precision.
[float(score) for score in f_statistic]

[0.041165799878519746,
 184012916.6745747,
 1.3379305304577502,
 0.24124161482034742,
 2.754841161408661]

In [65]:
# View the generated feature matrix as a polars frame (default column names).
pl.from_numpy(data=X)

column_0,column_1,column_2,column_3,column_4
f64,f64,f64,f64,f64
1.491614,0.50909,-0.678055,0.30336,-0.39035
0.291727,0.282634,-0.444699,-0.232362,-0.501386
-0.086795,-0.433946,0.90519,0.000887,-0.112914
0.110207,-0.053098,1.156007,0.028398,-0.867092
0.678377,1.035277,2.052395,-1.234422,-0.020884
…,…,…,…,…
2.710492,1.631534,-0.288728,0.772791,-1.604054
0.308692,-0.647586,0.424503,0.113557,-0.449453
-0.247086,1.247497,1.15459,-1.003477,1.132423
-0.959902,-0.03244,1.317566,-0.833291,0.152389
