### Feature selection using Filter and wrapper meathod based on correlation between the features and choosing top 1% for training and testing

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import BaggingRegressor
import math, os, random

In [2]:
#Load the train and test data
files = os.listdir("Data")
filename = files[-1]
train_data = pd.read_csv("./Data/{}".format(filename),header=None)
file_list = files[:-1] # test data list

In [3]:
#Using Pearson Correlation
cor = train_data.corr()
#Correlation with output variable
cor_target = abs(cor[280])

In [4]:
#Selecting highly correlated features
relevant_features = cor_target.sort_values(ascending=False)
n_features = 0.1*len(relevant_features)
features = relevant_features.index
features = features[1:]
print(features)

Int64Index([  9,  20,   5,   4,  10,  14,  19,   0,  51,  15,
            ...
            230, 181,  90, 141, 216, 237,  12,  32,  37, 277],
           dtype='int64', length=280)


In [5]:
#Remove features that are highly correlated with each other
selected = []
for i in features:
    temp = i
    for j in range(len(features)):
        temp = temp+1
        if temp > 279:
            continue
        next = features[temp]
        if cor.iloc[i,next]>0.3:
            continue
    selected.append(i)
selected = selected[:28]

In [6]:
#Load the output for the train data
train_output = train_data[len(train_data.columns)-1]
del train_data[len(train_data.columns)-1]
#Select only the selected features in train data
train_data = train_data.iloc[:, selected]
train_data = pd.DataFrame(data=train_data)

In [7]:
# Initialize the models
reg = LinearRegression()
rf = RandomForestRegressor()
# gradBoost = GradientBoostingRegressor()
# ada = AdaBoostRegressor()
#n_estimators=300 add to the regressors to observe the change
regressors = [reg,rf]
# regressor_names = ["Linear Regression","Random Forests","Gradient Boosting","Adaboost"]
regressor_names = ["Linear Regression","Random Forests"]

In [8]:
# Bagging of Linear Regression
regEnsemble = BaggingRegressor(base_estimator=reg, n_estimators=100, max_samples=1.0, random_state=random.randint(0,100))
regressors.append(regEnsemble)
regressor_names.append("Linear Regression Ensemble")

In [9]:
for regressor,regressor_name in zip(regressors,regressor_names):
    if regressor_name != "Linear Regression Ensemble":
        continue
    res = []
    col_names = ["fileName", "Mean Squared Error", "R2 score", "HIT@10"]
    regressor.fit(train_data,train_output)
    for filename in file_list:
        # Load the test data
        test_data = pd.read_csv("./Data/{}".format(filename),header=None)
        test_output = test_data[len(test_data.columns)-1]
        test_data = test_data.iloc[:, selected]
        rank_test = [index for index,value in sorted(list(enumerate(test_output)),key=lambda x:x[1], reverse=True)]
        
        predicted_values = regressor.predict(test_data)
        rank_predict = [index for index,value in sorted(list(enumerate(predicted_values)),key=lambda x:x[1], reverse=True)]
        counter = len([x for x in rank_predict[:10] if x in rank_test[:10]])

        result = [filename, metrics.mean_squared_error(test_output,predicted_values), metrics.r2_score(test_output,predicted_values), counter]
        res.append(result)
    df = pd.DataFrame(data=res, columns=col_names)
    col_mean = df[["Mean Squared Error", "R2 score", "HIT@10"]].mean()
    col_mean["fileName"] = "Average"
    df = df.append(col_mean, ignore_index=True)
    df.to_csv("./{}_2.csv".format(regressor_name), index=False)
    print ("Mean Squared Error for ",regressor_name, " : ", col_mean["Mean Squared Error"])
    print ("R2 score for ",regressor_name, " : ", col_mean["R2 score"])
    print ("HIT@10 for ",regressor_name, " : ", col_mean["HIT@10"])
    print("\n")

Mean Squared Error for  Linear Regression Ensemble  :  642.170459682454
R2 score for  Linear Regression Ensemble  :  -0.008607493060561461
HIT@10 for  Linear Regression Ensemble  :  5.033333333333333


