### No Feature Selection!

In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import BaggingRegressor
import math, os, random 
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

In [None]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
  
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt


In [2]:
#Load the train and test data
files = os.listdir("Data")
filename = files[-1]
train_data = pd.read_csv("./Data/{}".format(filename),header=None)
file_list = files[:-1] # test data list
train_output = train_data[len(train_data.columns)-1]
del train_data[len(train_data.columns)-1]

In [3]:
# Initialize the models
reg = LinearRegression()
rf = RandomForestRegressor()
regressors = [reg,rf]
regressor_names = ["Linear Regression","Random Forests"]

In [4]:
# Bagging of Linear Regression
regEnsemble = BaggingRegressor(base_estimator=reg, n_estimators=10, max_samples=0.7, random_state=random.randint(0,100))
regressors.append(regEnsemble)
regressor_names.append("Linear Regression Ensemble")

In [5]:
for regressor,regressor_name in zip(regressors,regressor_names):
    if regressor_name != "Linear Regression Ensemble":
        continue
    res = []
    col_names = ["fileName", "Mean Squared Error", "R2 score", "HIT@10"]
    for filename in file_list:
        # Load the test data
        test_data = pd.read_csv("./Data/{}".format(filename),header=None)
        test_output = test_data[len(test_data.columns)-1]
        del test_data[len(test_data.columns)-1]
        rank_test = [index for index,value in sorted(list(enumerate(test_output)),key=lambda x:x[1], reverse=True)]
        regressor.fit(train_data,train_output)
        predicted_values = regressor.predict(test_data)
        rank_predict = [index for index,value in sorted(list(enumerate(predicted_values)),key=lambda x:x[1], reverse=True)]
        counter = len([x for x in rank_predict[:10] if x in rank_test[:10]])

        result = [filename, metrics.mean_squared_error(test_output,predicted_values), metrics.r2_score(test_output,predicted_values), counter]
        res.append(result)
    df = pd.DataFrame(data=res, columns=col_names)
    col_mean = df[["Mean Squared Error", "R2 score", "HIT@10"]].mean()
    col_mean["fileName"] = "Average"
    df = df.append(col_mean, ignore_index=True)
    df.to_csv("./{}_1.csv".format(regressor_name), index=False)
    print ("Mean Squared Error for ",regressor_name, " : ", col_mean["Mean Squared Error"])
    print ("R2 score for ",regressor_name, " : ", col_mean["R2 score"])
    print ("HIT@10 for ",regressor_name, " : ", col_mean["HIT@10"])
    print("\n")

Mean Squared Error for  Linear Regression Ensemble  :  716.7128479717813
R2 score for  Linear Regression Ensemble  :  -0.26501466376829913
HIT@10 for  Linear Regression Ensemble  :  4.916666666666667




In [6]:
# for regressor,regressor_name in zip(regressors,regressor_names):
#     res = []
#     col_names = ["fileName", "Mean Squared Error", "R2 score", "HIT@10"]
#     for filename in file_list:
#         # Load the test data
#         test_data = pd.read_csv("./Data/{}".format(filename),header=None)
#         test_output = test_data[len(test_data.columns)-1]
#         del test_data[len(test_data.columns)-1]
#         rank_test = [index for index,value in sorted(list(enumerate(test_output)),key=lambda x:x[1], reverse=True)]
#         regressor.fit(train_data,train_output)
#         predicted_values = regressor.predict(test_data)
#         rank_predict = [index for index,value in sorted(list(enumerate(predicted_values)),key=lambda x:x[1], reverse=True)]
#         counter = len([x for x in rank_predict[:10] if x in rank_test[:10]])

#         result = [filename, metrics.mean_squared_error(test_output,predicted_values), metrics.r2_score(test_output,predicted_values), counter]
#         res.append(result)
#     df = pd.DataFrame(data=res, columns=col_names)
#     col_mean = df[["Mean Squared Error", "R2 score", "HIT@10"]].mean()
#     col_mean["fileName"] = "Average"
#     df = df.append(col_mean, ignore_index=True)
#     df.to_csv("./{}123.csv".format(regressor_name), index=False)
#     print ("Mean Squared Error for ",regressor_name, " : ", col_mean["Mean Squared Error"])
#     print ("R2 score for ",regressor_name, " : ", col_mean["R2 score"])
#     print ("HIT@10 for ",regressor_name, " : ", col_mean["HIT@10"])
#     print("\n")