### Selecting k best features
   - `f_regression` is one of the available scoring functions; it returns an F-value (F-statistic) for each feature
   - Linear model for testing the individual effect of each of many regressors
   - Precision - ability of classifier NOT to label as positive a sample that is negative
   - Recall - ability of the classifier to find all the positive samples
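   - F-score (F1) = 2*(precision * recall)/(precision + recall) - a classification metric; it is not the same thing as the F-value (F-statistic) that f_regression computes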

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn import datasets
from sklearn.feature_selection import SelectKBest, f_regression 

from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score, scorer



In [None]:
# Load the Boston dataset object that ships with scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the installed version (or switch datasets) before running.
boston_dataset = datasets.load_boston()

In [None]:
# Pull the full feature matrix and the regression target out of the dataset.
X_full, y = boston_dataset.data, boston_dataset.target

# Report both shapes, one per line.
print(X_full.shape, y.shape, sep="\n")

In [None]:
# Show the names of the feature columns in the dataset.
print(boston_dataset.feature_names)

## SelectKBest(score_func, k = 10)
 - Select features according to the k highest scores

In [None]:
# Select the top two features to use for Linear Regression:
# every column of X_full is scored against y with f_regression and the
# k=2 highest-scoring columns are kept.

selector = SelectKBest(f_regression, k=2)
selector.fit(X_full, y)

In [None]:
# Mask over the input features marking which ones the selector kept.
selector.get_support()

In [None]:
# Names of the selected features: the support mask indexes the name array.
print(boston_dataset.feature_names[selector.get_support()])

In [None]:
# Per-feature scores produced by the scoring function during fit.
selector.scores_

In [None]:
# Keep only the selected columns of the full feature matrix.
X = X_full[:, selector.get_support()]
print(X.shape)

In [None]:
def plot_scatter(X, Y, R=None):
    """Scatter-plot Y against X and show the figure.

    X -- values for the horizontal axis.
    Y -- values drawn as hollow black circles.
    R -- optional second series plotted over the same X in red
         (omitted when None).
    """
    # Marker styling for the primary series.
    hollow_black = dict(s=32, marker='o', facecolors='none', edgecolors='k')
    plt.scatter(X, Y, **hollow_black)

    overlay_requested = R is not None
    if overlay_requested:
        plt.scatter(X, R, color='red', linewidth=0.5)

    plt.show()

In [None]:
# First selected feature against the target.
plot_scatter(X[:,0], y)

In [None]:
# Second selected feature against the target.
plot_scatter(X[:,1], y)

In [None]:
# Fit ordinary least squares on the two selected features, then predict on the
# same (training) data.
# The former `normalize=True` argument is intentionally dropped: it was
# deprecated in scikit-learn 1.0 and removed in 1.2, and for an unpenalized
# LinearRegression it does not change the predictions or the coefficients
# reported in the original feature units.
regressor = LinearRegression().fit(X, y)
y_pred = regressor.predict(X)

In [None]:
# First selected feature: true targets with the model predictions overlaid in red.
plot_scatter(X[:,0], y, y_pred)

In [None]:
# Second selected feature: true targets with the model predictions overlaid in red.
plot_scatter(X[:,1], y, y_pred)

In [None]:
# R-squared of the predictions against the true targets, printed to four
# decimals. Note this is measured on the same data the model was fitted on.
print("R-squared score: {:.4f}".format(
    r2_score(y, y_pred)))