# VarianceThreshold

* We can use this as part of a dimensionality reduction strategy to remove low-variance columns

In [4]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [29]:
# Toy frame: three columns whose spread differs by orders of magnitude
df = pd.DataFrame(
    {
        "low_variance": [0.99, 1, 1.01],
        "medium_variance": [3, 4, 5],
        "high_variance": [0, 10, 20],
    }
)

display(df)

# Show the variance with ddof=0, because VarianceThreshold measures variance
# with 0 degrees of freedom rather than pandas' default of 1
population_variance = df.var(ddof=0)
display(population_variance.to_frame("variance"))

# The threshold is the minimum variance a column needs in order to be kept;
# columns with a lower variance are filtered out of the selection below.
#
# Going higher than 66.66 would raise an error, since none of the columns
# has a variance that exceeds that
thresholds = (0, 0.5, 5, 50)
for threshold in thresholds:
    # fit() returns the fitted selector, so construction and fitting are chained
    sel = VarianceThreshold(threshold=threshold).fit(df)

    # Boolean mask with one entry per column: True where the column is kept
    mask = sel.get_support()
    reduced_df = df.loc[:, mask]

    print("\nThreshold:", threshold)
    display(reduced_df)

Unnamed: 0,low_variance,medium_variance,high_variance
0,0.99,3,0
1,1.0,4,10
2,1.01,5,20


Unnamed: 0,variance
low_variance,6.7e-05
medium_variance,0.666667
high_variance,66.666667



Threshold: 0


Unnamed: 0,low_variance,medium_variance,high_variance
0,0.99,3,0
1,1.0,4,10
2,1.01,5,20



Threshold: 0.5


Unnamed: 0,medium_variance,high_variance
0,3,0
1,4,10
2,5,20



Threshold: 5


Unnamed: 0,high_variance
0,0
1,10
2,20



Threshold: 50


Unnamed: 0,high_variance
0,0
1,10
2,20
