# Data Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_table("data/airfoil_self_noise.dat", 
                        header=None, 
                        names=['frequency', 'angle_of_attack', 'chord_length', 'free_stream_velocity', 
                               'suction_side_displacement_thickness', 'scaled_sound_pressure_level'])
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [3]:
X

array([[8.00000e+02, 0.00000e+00, 3.04800e-01, 7.13000e+01, 2.66337e-03],
       [1.00000e+03, 0.00000e+00, 3.04800e-01, 7.13000e+01, 2.66337e-03],
       [1.25000e+03, 0.00000e+00, 3.04800e-01, 7.13000e+01, 2.66337e-03],
       ...,
       [4.00000e+03, 1.56000e+01, 1.01600e-01, 3.96000e+01, 5.28487e-02],
       [5.00000e+03, 1.56000e+01, 1.01600e-01, 3.96000e+01, 5.28487e-02],
       [6.30000e+03, 1.56000e+01, 1.01600e-01, 3.96000e+01, 5.28487e-02]])

In [4]:
y

array([126.201, 125.201, 125.951, ..., 106.604, 106.224, 104.204])

## Feature Scaling

In [5]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = sc.fit_transform(X)

## Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Training 

In [13]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [10, 100, 1000],
    'criterion': ['friedman_mse', 'mse', 'mae'],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
    'loss': ['ls', 'lad', 'huber', 'quantile'],
    'random_state': [0]
}

regressors = GridSearchCV(estimator=GradientBoostingRegressor(), param_grid=params, verbose=10, n_jobs=-1)
regressors.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  

GridSearchCV(estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'criterion': ['friedman_mse', 'mse', 'mae'],
                         'loss': ['ls', 'lad', 'huber', 'quantile'],
                         'max_features': [None, 'auto', 'sqrt', 'log2'],
                         'n_estimators': [10, 100, 1000], 'random_state': [0]},
             verbose=10)

## Predicting the Test set results

In [14]:
regressor = regressors.best_estimator_
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[129.89 130.79]
 [119.37 119.54]
 [110.81 110.32]
 [116.95 117.4 ]
 [127.17 127.62]
 [121.35 121.66]
 [119.43 118.69]
 [127.89 130.03]
 [113.98 114.73]
 [125.33 124.21]
 [125.49 125.84]
 [131.75 131.52]
 [131.29 130.7 ]
 [116.59 117.81]
 [125.51 126.66]
 [114.39 113.14]
 [127.64 126.76]
 [133.41 133.38]
 [117.22 118.08]
 [115.07 116.15]
 [123.84 123.46]
 [130.15 130.09]
 [129.47 129.93]
 [127.08 120.66]
 [132.04 134.06]
 [121.91 122.09]
 [116.18 117.09]
 [139.28 138.76]
 [129.93 130.83]
 [130.82 128.24]
 [130.31 128.95]
 [121.88 122.53]
 [103.87 103.38]
 [128.73 126.41]
 [132.45 131.8 ]
 [130.66 130.96]
 [118.4  119.25]
 [112.3  109.64]
 [126.4  127.78]
 [128.3  129.01]
 [131.34 132.54]
 [124.59 125.48]
 [129.16 138.27]
 [122.25 123.13]
 [109.61 111.03]
 [133.59 135.96]
 [128.02 127.12]
 [128.81 129.67]
 [130.65 125.65]
 [127.81 129.24]
 [126.08 123.21]
 [125.86 127.  ]
 [119.07 119.91]
 [126.48 126.56]
 [121.54 121.53]
 [121.88 122.23]
 [114.73 113.3 ]
 [129.38 128.34]
 [121.73 121.7

## Measuring performance

In [15]:
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

In [16]:
r2_score(y_test, y_pred)

0.9234856271537567

In [17]:
mean_squared_error(y_test, y_pred)

3.6293943443107173

In [18]:
median_absolute_error(y_test, y_pred)

0.9988991031968553