In [1]:
#importing the libraries
import pandas as pd
from sklearn.preprocessing import normalize

In [2]:
# Load the dataset into a DataFrame. A forward slash is used as the path
# separator so the path resolves on Windows and POSIX alike; the previous
# backslash form only worked on Windows and relied on '\l' being left alone
# as an invalid escape sequence, which newer Python versions warn about.
data = pd.read_csv('datasets/low_variance_filter.csv')

In [3]:
# peek at the first five records to get a feel for the columns
data.head(5)

Unnamed: 0,ID,temp,atemp,humidity,windspeed,count
0,AB101,9.84,14.395,81,0.0,16
1,AB102,9.02,13.635,80,0.0,40
2,AB103,9.02,13.635,80,0.0,32
3,AB104,9.84,14.395,75,0.0,13
4,AB105,9.84,14.395,75,0.0,1


In [4]:
# dimensions of the loaded frame as (rows, columns)
data.shape

(12980, 6)

In [5]:
# share of missing entries per column, expressed as a percentage
# (the mean of the boolean null-mask is the fraction of nulls in that column)
data.isna().mean() * 100

ID           0.0
temp         0.0
atemp        0.0
humidity     0.0
windspeed    0.0
count        0.0
dtype: float64

In [6]:
# data type of each column
data.dtypes

ID            object
temp         float64
atemp        float64
humidity       int64
windspeed    float64
count          int64
dtype: object

In [7]:
# show only the integer (int64) columns;
# pass exclude=['int64'] instead to see every other column
data.select_dtypes(include=['int64'])


Unnamed: 0,humidity,count
0,81,16
1,80,40
2,80,32
3,75,13
4,75,1
...,...,...
12975,42,308
12976,53,236
12977,63,163
12978,63,48


In [8]:
# Drop the 'ID' column: it is an object-typed identifier, not a numeric
# feature, so it takes no part in the variance comparison below.
# (No dummy variables are created here.)
# errors='ignore' keeps the cell re-runnable: `data` is rebound in place, so
# a second run would otherwise raise KeyError on the already-removed column.
data = data.drop(columns='ID', errors='ignore')

In [9]:
# dimensions of the frame after the 'ID' column has been dropped
data.shape

(12980, 5)

In [10]:
# variance of each remaining column on its original scale
data.var()

temp            61.291712
atemp           73.137484
humidity       398.549141
windspeed       69.322053
count        25843.419864
dtype: float64

In [11]:
normalize = normalize(data)

In [12]:
data_scaled = pd.DataFrame(normalize)

In [13]:
# inspect the normalized values (columns carry integer labels, not the original names)
data_scaled

Unnamed: 0,0,1,2,3,4
0,0.116607,0.170585,0.959872,0.000000,0.189604
1,0.099203,0.149960,0.879850,0.000000,0.439925
2,0.102851,0.155473,0.912202,0.000000,0.364881
3,0.126009,0.184339,0.960431,0.000000,0.166475
4,0.127781,0.186932,0.973940,0.000000,0.012986
...,...,...,...,...,...
12975,0.119080,0.141100,0.132592,0.053662,0.972343
12976,0.142005,0.167801,0.213451,0.052348,0.950461
12977,0.183485,0.227392,0.343830,0.070938,0.889592
12978,0.348469,0.431855,0.652991,0.134723,0.497517


In [14]:
# variance of each column after normalization
data_scaled.var()

0    0.005877
1    0.007977
2    0.093491
3    0.008756
4    0.111977
dtype: float64

In [15]:
# keep the original column names alongside the per-column variance of the
# normalized data, so a position in one can be mapped to a name in the other
columns = data.columns
variance = data_scaled.var()

In [16]:
# Collect the names of the columns whose normalized variance reaches the
# threshold. Names and variances are paired by position with zip(), so the
# selection does not depend on `variance` being indexed by integer labels
# that happen to coincide with positions (variance[i] is a label lookup).
VARIANCE_THRESHOLD = 0.006  # columns with a variance below this are filtered out
variable = [
    name
    for name, col_variance in zip(columns, variance)
    if col_variance >= VARIANCE_THRESHOLD
]

In [17]:
# names of the columns that passed the variance filter
variable

['atemp', 'humidity', 'windspeed', 'count']

In [18]:
# build the reduced frame: every row, but only the retained columns,
# taken from the un-normalized data
new_data = data.loc[:, variable]

In [19]:
# preview the first five records of the reduced frame
new_data.head(5)

Unnamed: 0,atemp,humidity,windspeed,count
0,14.395,81,0.0,16
1,13.635,80,0.0,40
2,13.635,80,0.0,32
3,14.395,75,0.0,13
4,14.395,75,0.0,1


In [20]:
# variance of the retained columns; new_data is selected from the
# un-normalized `data`, so these are on the original scale
new_data.var()

atemp           73.137484
humidity       398.549141
windspeed       69.322053
count        25843.419864
dtype: float64

In [21]:
# compare dimensions: the filtered frame first, then the frame it was selected from
new_data.shape, data.shape

((12980, 4), (12980, 5))