### Feature selection using Filter and wrapper meathod based on correlation between the features and choosing top 1% for training and testing

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_selection import RFECV
import math

In [2]:
#Load the train data
filename = "blogData_train.csv"
train_data = pd.read_csv(filename,header=None)

In [3]:
#Using Pearson Correlation
cor = train_data.corr()
#Correlation with output variable
cor_target = abs(cor[280])

In [4]:
#Selecting highly correlated features
relevant_features = cor_target.sort_values(ascending=False)
n_features = 0.1*len(relevant_features)
features = relevant_features.index
features = features[1:]
print(features)

Int64Index([  9,  20,   5,   4,  10,  14,  19,   0,  51,  15,
            ...
            230, 181,  90, 141, 216, 237,  12,  32,  37, 277],
           dtype='int64', length=280)


In [5]:
#Remove features that are highly correlated with each other
selected = []
for i in features:
    temp = i
    for j in range(len(features)):
        temp = temp+1
        if temp > 279:
            continue
        next = features[temp]
        if cor.iloc[i,next]>0.3:
            continue
    selected.append(i)
selected = selected[:28]

In [6]:
#Load the output for the train data
train_output = train_data[len(train_data.columns)-1]
del train_data[len(train_data.columns)-1]
#Select only the selected features in train data
train_data = train_data.iloc[:, selected]
train_data = pd.DataFrame(data=train_data)

In [7]:
# Load the test data
filename = "blogData_test-2012.02.01.00_00.csv"
test_data = pd.read_csv(filename,header=None)
test_output = test_data[len(test_data.columns)-1]
test_data = test_data.iloc[:, selected]

In [8]:
# Initialize the models
reg = LinearRegression()
rf = RandomForestRegressor()
gradBoost = GradientBoostingRegressor()
ada = AdaBoostRegressor()
#n_estimators=300 add to the regressors to observe the change
regressors = [reg,rf,gradBoost,ada]
regressor_names = ["Linear Regression","Random Forests","Gradient Boosting","Adaboost"]

In [9]:
for regressor,regressor_name in zip(regressors,regressor_names):
    
    regressor.fit(train_data,train_output)
    predicted_values = regressor.predict(test_data)
    predicted = np.clip(predicted_values, 0,5000)

    counter = 0
    predicted = pd.DataFrame(data = predicted_values, index = None, columns         = None)
    top = pd.concat([test_output,predicted], axis=1, sort=False,                    ignore_index=True)
    top = top.sort_values(0, ascending=False)
    for i in range(10):
        if math.ceil(top.iloc[i,0])== math.ceil(top.iloc[i,1]):
            counter = counter+1

    print ("Mean Squared Error for ",regressor_name, " : ", metrics.mean_squared_error(test_output,predicted_values))
    print ("R2 score for ",regressor_name, " : ",metrics.r2_score(test_output,predicted_values))
    print ("HIT@10 for ",regressor_name, " : ",counter)
    print("\n")


Mean Squared Error for  Linear Regression  :  478.7749825030172
R2 score for  Linear Regression  :  0.7003635623528848
HIT@10 for  Linear Regression  :  0


Mean Squared Error for  Random Forests  :  327.3757579329755
R2 score for  Random Forests  :  0.7951152222569546
HIT@10 for  Random Forests  :  0


Mean Squared Error for  Gradient Boosting  :  529.5309609112388
R2 score for  Gradient Boosting  :  0.6685984511517422
HIT@10 for  Gradient Boosting  :  0


Mean Squared Error for  Adaboost  :  437.0213522776495
R2 score for  Adaboost  :  0.7264946457987189
HIT@10 for  Adaboost  :  0




In [None]:
# Select the features from output of the filter method by applying the recursive feature elimination for wrapper method
estimator = rf
selector = RFECV(estimator, step=1, cv=5)
selector = selector.fit(train_data, train_output)

In [None]:
print(len(selector.ranking_))
ranking = selector.ranking_

In [None]:
# Select features based on the ranking provided by the RFE function such that the rank is 1
selected1 = []
for i in range(len(selected)):
    if ranking[i] ==1:
        selected1.append(selected[i])
print(selected1)

In [None]:
#Load train and test data again (They were modified earlier in the filter method)
filename = "blogData_train.csv"
train_data = pd.read_csv(filename,header=None)
train_output = train_data[len(train_data.columns)-1]
del train_data[len(train_data.columns)-1]

train_data = train_data.iloc[:, selected1]

In [None]:
filename = "blogData_test-2012.02.01.00_00.csv"
test_data = pd.read_csv(filename,header=None)
test_output = test_data[len(test_data.columns)-1]

test_data = test_data.iloc[:, selected1]

In [None]:
for regressor,regressor_name in zip(regressors,regressor_names):
    
    regressor.fit(train_data,train_output)
    predicted_values = regressor.predict(test_data)
    predicted = np.clip(predicted_values, 0,5000)

    counter = 0
    predicted = pd.DataFrame(data = predicted_values, index = None, columns         = None)
    top = pd.concat([test_output,predicted], axis=1, sort=False,                    ignore_index=True)
    top = top.sort_values(0, ascending=False)
    for i in range(10):
        if math.ceil(top.iloc[i,0])== math.ceil(top.iloc[i,1]):
            counter = counter+1

    print ("Mean Squared Error for ",regressor_name, " : ", metrics.mean_squared_error(test_output,predicted_values))
    print ("R2 score for ",regressor_name, " : ",metrics.r2_score(test_output,predicted_values))
    print ("AUC for",regressor_name, " : ",metrics.roc_auc_score(test_output,predicted_values, average = 'samples'))
    print ("HIT@10 for ",regressor_name, " : ",counter)
    print("\n")
