In [1]:
import numpy as np
import pandas as pd


In [2]:
# Toy frame for the demo: columns A and B vary from row to row, while
# C and D hold a single repeated value (all ones / all zeros).
df = pd.DataFrame(
    data={
        "A": [1, 2, 2, 3, 4, 5],
        "B": [2, 4, 4, 6, 8, 10],
        "C": [1] * 6,
        "D": [0] * 6,
    },
    index=list("abcdef"),
)

In [3]:
df.head()

Unnamed: 0,A,B,C,D
a,1,2,1,0
b,2,4,1,0
c,2,4,1,0
d,3,6,1,0
e,4,8,1,0


In [4]:
df.head()

Unnamed: 0,A,B,C,D
a,1,2,1,0
b,2,4,1,0
c,2,4,1,0
d,3,6,1,0
e,4,8,1,0


## Variance Threshold


In [5]:
# VarianceThreshold is scikit-learn's selector that filters features by their variance.
from sklearn.feature_selection import VarianceThreshold
# threshold=0 rejects only the columns whose values never change (zero variance);
# the selector is unfitted at this point.
var_thres = VarianceThreshold(threshold=0)


In [6]:
# Fit the selector on the toy frame so it can report which columns pass the threshold.
var_thres.fit(df)

VarianceThreshold(threshold=0)

In [7]:
var_thres.get_support()

array([ True,  True, False, False])

In [8]:
df.columns[var_thres.get_support()]

Index(['A', 'B'], dtype='object')

In [9]:
# Names of the columns the fitted selector rejected (the zero-variance ones).
# get_support() returns a boolean mask aligned with df.columns, so inverting it
# selects the rejected columns in one step; the previous comprehension called
# get_support() and rebuilt the kept-column Index once per column name.
const_columns = df.columns[~var_thres.get_support()].tolist()

In [10]:
const_columns

['C', 'D']

In [11]:
# Remove the constant columns by rebinding `df` rather than mutating it in place,
# so the frame shown by earlier cells is not silently changed underneath them.
# errors="ignore" keeps this cell safe to re-run: on a second execution the
# columns are already gone and an in-place drop would raise KeyError.
df = df.drop(columns=const_columns, errors="ignore")
df.head()

Unnamed: 0,A,B
a,1,2
b,2,4
c,2,4
d,3,6
e,4,8


## Now dealing with big data

In [12]:
# Read only the first 10,000 rows of the Santander customer-satisfaction training file.
# Note: this rebinds `df`, replacing the toy frame used in the cells above.
df = pd.read_csv('./santander-customer-satisfaction/train.csv',nrows=10000)
# Print the row/column counts, dtype breakdown and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 371 entries, ID to TARGET
dtypes: float64(89), int64(282)
memory usage: 28.3 MB


In [13]:
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [14]:
df['TARGET'].value_counts()

0    9612
1     388
Name: TARGET, dtype: int64

### First, we want to split this into train and test data

In [15]:
# Separate the label column from the predictor columns.
# NOTE(review): X still carries the ID column — confirm whether it should be
# treated as a feature or dropped alongside TARGET.
Y = df['TARGET']
X = df.drop(columns=['TARGET'])

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; random_state=0 fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3,random_state=0)

In [17]:
print(X_train.shape,Y_train.shape)

(7000, 370) (7000,)


In [18]:
## Now apply the variance threshold

In [19]:
# New, unfitted selector for the Santander data. It is a different object from
# `var_thres` (no trailing "h"), which was fitted on the toy frame earlier.
var_thresh = VarianceThreshold(threshold=0)

In [20]:
# Fit on the training split only, so the selection is made without looking at X_test.
var_thresh.fit(X_train)

VarianceThreshold(threshold=0)

In [21]:
var_thresh.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [22]:
# Count the True entries in the support mask, i.e. how many features are kept.
var_thresh.get_support().sum()

284

In [23]:
# Names of the training columns the fitted selector rejected (zero variance).
# The support mask is aligned with X_train.columns, so inverting it picks the
# rejected columns directly; the previous comprehension called get_support()
# and rebuilt the kept-column Index once for each of the training columns.
const_columns = X_train.columns[~var_thresh.get_support()].tolist()

In [24]:
# Build a reduced copy of the training features without the constant columns;
# X_train itself is left untouched.
# NOTE(review): X_test is not reduced anywhere in the visible cells — confirm the
# same columns are dropped from it before it is used with a model.
new_train = X_train.drop(columns=const_columns)

In [25]:
new_train.shape

(7000, 284)

In [26]:
X_train.shape

(7000, 370)

In [27]:
var_thresh

VarianceThreshold(threshold=0)