# Bench Mark Setting

## Initial Bench mark: 0.15323

## 1. Loading the data

In [1]:
# Load the data
# data is first downloweded into DATA_PATH from 
# http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

DATA_PATH = 'data'
FILE_NAME = 'train.csv'

def load_data(data_path=DATA_PATH, file_name=FILE_NAME):
    # load everything into data
    file_path = os.path.join(data_path, file_name)
    data = pd.read_csv(file_path)
    
    return data

# select all numerical columns only
def num_col(data):
    
    cols = []
    for col in data.columns:
        if data[col].dtype in [int, float, np.int64, np.float64]:
            cols.append(col)
    
    return data[cols]

def fit_nan(data):
    from sklearn.impute import SimpleImputer
    
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    return imp.fit_transform(data)
    
        
# splits data in train_data, train_label, test_data, test_label
def split_data(data, ratio=0.9):
    X = data[:,:-1]
    y = data[:,-1]
    
    size = int(X.shape[0] * ratio)
    
    return X[:size], y[:size], X[size:], y[size:]

def pipe_num_data(data_path=DATA_PATH, file_name=FILE_NAME, ratio=0.9):
    return split_data(fit_nan(num_col(load_data(data_path=data_path, file_name=file_name))), ratio)

train_data, train_label, test_data, test_label = pipe_num_data()

## 2. Random Forest Bench Mark

In [2]:
# random forest bench mark
def bench_mark():    
    train_data, train_label, test_data, test_label = pipe_num_data()
    rf = RandomForestRegressor(n_estimators=500)
    rf.fit(train_data, train_label)
    
    ypred = rf.predict(test_data)
    
    return mean_absolute_error(ypred, test_label)
rf = RandomForestRegressor(n_estimators=300)
rf.fit(train_data, train_label)
    
ypred = rf.predict(test_data)
e = np.sqrt(mean_squared_error(np.log(ypred), np.log(test_label)))
print("Random Forest log mean root-mean-squared error is:", e)

Random Forest log mean root-mean-squared error is: 0.1569721014205277


## 3. Bagged Linear Regression

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor

rg = LinearRegression()
lg = BaggingRegressor(rg, n_estimators=500)

lg.fit(train_data, train_label)
ypred = lg.predict(test_data)
e = np.sqrt(mean_squared_error(np.log(ypred), np.log(test_label)))
print("Random Forest log mean root-mean-squared error is:", e)

Random Forest log mean root-mean-squared error is: 0.17152811209670601


## 4. Submittion

In [4]:
train_data, train_label, test_data, test_label = pipe_num_data(ratio=1.0)
train_data, train_label, test_data, test_label = pipe_num_data()
rf = RandomForestRegressor(n_estimators=500)
rf.fit(train_data, train_label)

submit_data = fit_nan(num_col(load_data(data_path=DATA_PATH, file_name='test.csv')))
length = submit_data.shape[0]

pred = rf.predict(submit_data)

In [5]:
submit = pd.DataFrame({'Id': np.arange(1461, 1461+length), 'SalePrice' : pred})
submit.to_csv('01122019.csv', index=False)

## 5 Kaggle Result
Our result is 0.15323