# Identifying "sparse columns" --- columns without predictive information

We use the following data set
https://www.kaggle.com/datasets/ashrafkhan94/oil-spill

In [67]:
import pandas as pd
df = pd.read_csv('wildfires_clean.csv')

In [68]:
df.head(2)

Unnamed: 0,fire_size,fire_name,fire_size_class,stat_cause_descr,latitude,longitude,state,disc_clean_date,cont_clean_date,discovery_month,...,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness,cause_code,state_code,fire_size_class_code
0,10.0,,C,Missing/Undefined,18.105072,-66.753044,PR,2/11/2007,,2,...,76.381579,78.72437,0.0,0.0,0.0,0.0,0.017923,8,38,1
1,3.0,,B,Arson,35.03833,-87.61,TN,12/11/2006,,12,...,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355,0,42,0


In [69]:
df.columns

Index(['fire_size', 'fire_name', 'fire_size_class', 'stat_cause_descr',
       'latitude', 'longitude', 'state', 'disc_clean_date', 'cont_clean_date',
       'discovery_month', 'disc_date_final', 'cont_date_final', 'putout_time',
       'disc_date_pre', 'disc_pre_year', 'disc_pre_month', 'wstation_usaf',
       'dstation_m', 'wstation_wban', 'wstation_byear', 'wstation_eyear',
       'Vegetation', 'fire_mag', 'weather_file', 'Temp_pre_30', 'Temp_pre_15',
       'Temp_pre_7', 'Temp_cont', 'Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7',
       'Wind_cont', 'Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7', 'Hum_cont',
       'Prec_pre_30', 'Prec_pre_15', 'Prec_pre_7', 'Prec_cont', 'remoteness',
       'cause_code', 'state_code', 'fire_size_class_code'],
      dtype='object')

In [70]:
df.shape

(55367, 44)

### Count unique values in each column

In [71]:
df.nunique(axis=0)#.head(10)

fire_size                6092
fire_name               21793
fire_size_class             6
stat_cause_descr           13
latitude                46329
longitude               49044
state                      51
disc_clean_date          8114
cont_clean_date          7078
discovery_month            12
disc_date_final         27835
cont_date_final         24011
putout_time               297
disc_date_pre            8114
disc_pre_year              25
disc_pre_month             12
wstation_usaf            2220
dstation_m              54009
wstation_wban            1570
wstation_byear             82
wstation_eyear             28
Vegetation                  7
fire_mag                    6
weather_file            11314
Temp_pre_30             35523
Temp_pre_15             34038
Temp_pre_7              32670
Temp_cont               22414
Wind_pre_30             35028
Wind_pre_15             33381
Wind_pre_7              31247
Wind_cont               21918
Hum_pre_30              34352
Hum_pre_15

In [72]:
df.columns[df.nunique() == 1]

Index([], dtype='object')

## Calculate the percentage of unique values in each column

In [73]:
df.nunique()/len(df)*100

fire_size               11.002944
fire_name               39.360991
fire_size_class          0.010837
stat_cause_descr         0.023480
latitude                83.676197
longitude               88.579840
state                    0.092113
disc_clean_date         14.654939
cont_clean_date         12.783788
discovery_month          0.021674
disc_date_final         50.273629
cont_date_final         43.366988
putout_time              0.536421
disc_date_pre           14.654939
disc_pre_year            0.045153
disc_pre_month           0.021674
wstation_usaf            4.009609
dstation_m              97.547275
wstation_wban            2.835624
wstation_byear           0.148103
wstation_eyear           0.050572
Vegetation               0.012643
fire_mag                 0.010837
weather_file            20.434555
Temp_pre_30             64.159156
Temp_pre_15             61.477053
Temp_pre_7              59.006267
Temp_cont               40.482598
Wind_pre_30             63.265122
Wind_pre_15   

## Columns with less than 1% unique values

In [74]:
list(df.columns[df.nunique()/len(df) < 0.01])

['fire_size_class',
 'stat_cause_descr',
 'state',
 'discovery_month',
 'putout_time',
 'disc_pre_year',
 'disc_pre_month',
 'wstation_byear',
 'wstation_eyear',
 'Vegetation',
 'fire_mag',
 'cause_code',
 'state_code',
 'fire_size_class_code']

## Using Variance Threshold

$s^2 = \frac{\sum_1^N(x_i - \mu)}{N-1}$

In [75]:
df.head()

Unnamed: 0,fire_size,fire_name,fire_size_class,stat_cause_descr,latitude,longitude,state,disc_clean_date,cont_clean_date,discovery_month,...,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness,cause_code,state_code,fire_size_class_code
0,10.0,,C,Missing/Undefined,18.105072,-66.753044,PR,2/11/2007,,2,...,76.381579,78.72437,0.0,0.0,0.0,0.0,0.017923,8,38,1
1,3.0,,B,Arson,35.03833,-87.61,TN,12/11/2006,,12,...,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355,0,42,0
2,60.0,,C,Arson,34.9478,-88.7225,MS,2/29/2004,,2,...,76.812834,65.0638,168.8,42.2,18.1,124.5,0.194544,0,24,1
3,1.0,WNA 1,B,Debris Burning,39.6414,-119.3083,NV,6/6/2005,6/6/2005,6,...,35.353846,0.0,10.4,7.2,0.0,0.0,0.487447,3,32,0
4,2.0,,B,Miscellaneous,30.7006,-90.5914,LA,9/22/1999,,9,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.214633,7,17,0


In [76]:
df2 = df[['cause_code', 'state_code', 'fire_size_class_code', 'Vegetation',
       'fire_mag', 'Temp_pre_30', 'Temp_pre_15', 'Temp_pre_7',
       'Temp_cont', 'Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7', 'Wind_cont',
       'Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7', 'Hum_cont', 'Prec_pre_30',
       'Prec_pre_15', 'Prec_pre_7', 'Prec_cont', 'remoteness']]

In [46]:
# # y = df.iloc[:,1: ]
# y = df.iloc[:, 1:]
# y

In [77]:
from sklearn.feature_selection import VarianceThreshold

data = df2.values

X = data[:, 1:] #everything except the first column
y = data[:, 0] #first column

print(X.shape, y.shape)

transform = VarianceThreshold()

X_sel = transform.fit_transform(X)
print(X_sel.shape)

(55367, 21) (55367,)
(55367, 21)


In [78]:
df2.columns[:-1][~transform.get_support()]

Index([], dtype='object')

In [79]:
transform = VarianceThreshold(threshold=0.05)
transform.fit(X)

VarianceThreshold(threshold=0.05)

In [80]:
[i for i, b in enumerate(transform.get_support()) if not b]

[20]

In [81]:
df.columns[:-1][~transform.get_support()]

IndexError: boolean index did not match indexed array along dimension 0; dimension is 43 but corresponding boolean dimension is 21

In [82]:
from numpy import arange
import altair as alt

# df = pd.read_csv('oil-spill.csv', header=None)

# data = df.values
# X = data[:, :-1]
# y = data[:, -1]

# print(X.shape, y.shape)

thresholds = arange(0.0, 0.55, 0.05)

results = []
for t in thresholds:
    
    transform = VarianceThreshold(threshold=t)
    
    X_sel = transform.fit_transform(X)
    rows, cols = X_sel.shape
    n_features = cols
    print('Threshold=%.2f, Features=%d' % (t, n_features))
    
    results.append(n_features)
    
d2 = pd.DataFrame({'threshold': thresholds, 'n_features': results})
alt.Chart(d2).mark_line().encode(
    x='threshold',
    y='n_features')

Threshold=0.00, Features=21
Threshold=0.05, Features=20
Threshold=0.10, Features=20
Threshold=0.15, Features=20
Threshold=0.20, Features=20
Threshold=0.25, Features=20
Threshold=0.30, Features=20
Threshold=0.35, Features=20
Threshold=0.40, Features=20
Threshold=0.45, Features=20
Threshold=0.50, Features=20


  for col_name, dtype in df.dtypes.iteritems():
