# The Weights and Height Regression Project

In this second part of the project, I use the same data to solve a regression
problem: predicting weight from gender and height.

Let's first import all the needed libraries

In [1]:
from matplotlib import pyplot

import numpy
from pandas import read_csv,get_dummies, set_option
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

"""Regression Algorithms"""

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [2]:
# Read the raw data from a CSV file located in the working directory.
filename = 'weight-height.csv'
Dataset = read_csv(filepath_or_buffer=filename)

The Gender feature needs to be encoded, so let's apply the label encoder.

In [3]:
# Replace the Gender labels with the numeric codes produced by LabelEncoder
# so the column can be fed to the estimators.
encoder = LabelEncoder()
Dataset['Gender'] = encoder.fit_transform(Dataset['Gender'])

# Split the table by position: every column except the last becomes a
# feature, and the last column becomes the regression target.
# NOTE(review): this relies on the target being the last column of the CSV
# (presumably the weight) - confirm against the file.
array = Dataset.values
X, Y = array[:, :-1], array[:, -1]

## The machine learning part

In [4]:
# Experiment settings used by this cell and the cells that follow.
seed = 7                            # seed for the random number generators
n_folds = 10                        # number of cross-validation folds
validation_size = 0.2               # fraction of rows held out for validation
scoring = 'neg_mean_squared_error'  # scorer name used during cross-validation

# Hold out part of the data for the final check; the remaining rows are used
# for model selection.
split = train_test_split(X, Y, test_size=validation_size, random_state=seed)
X_train, X_validation, Y_train, Y_validation = split

In [5]:
# Candidate regressors, each paired with the short label used in the report.
models = [
    ('LR', LinearRegression()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    # Seeded so that the tree (and therefore its score) is repeatable.
    ('CART', DecisionTreeRegressor(random_state=seed)),
    ('SVR', SVR(gamma='auto')),
]

# Build the splitter once so every model is scored on exactly the same folds.
# random_state is deliberately not passed: KFold only uses it when
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is
# set while shuffle keeps its default of False. The folds produced are the
# same contiguous, unshuffled folds as before.
kfold = KFold(n_splits=n_folds)

# Evaluate each model in turn and report the mean and standard deviation of
# its cross-validated score (negated MSE, so values closer to 0 are better).
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f'{name}: {cv_results.mean()}, {cv_results.std()}')

LR: -99.59864464664562, 5.169689458450589
LASSO: -106.06155775373921, 6.00807679425553
EN: -132.39396582957892, 7.953290603226527
KNN: -120.18831116719147, 7.12724022934497
CART: -200.04302637382483, 7.92584387774294
SVR: -115.8841658789775, 8.506809479938362


It looks like linear regression is the best-performing algorithm, but I'm going
to tune the Lasso algorithm to make sure we are seeing its best performance.

In [6]:
## Lasso Algorithm tuning
# Grid-search the Lasso regularisation strength (alpha) with the same
# cross-validation settings and scorer used for the model comparison.
alpha_values = numpy.array([0.1,0.3,0.5,0.7,0.9,1.0,1.3,1.5,1.7,2.0])
param_grid = dict(alpha=alpha_values)
model = Lasso()

# random_state is deliberately not passed: KFold only uses it when
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is
# set while shuffle keeps its default of False. The folds are unchanged.
kfold = KFold(n_splits=n_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, Y_train)

# Report the winning setting first, then the score of every candidate.
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean} ({stdev}) with: {param}")


Best: -99.66352255776815 using {'alpha': 0.1}
-99.66352255776815 (5.208276111732531) with: {'alpha': 0.1}
-100.18081765615591 (5.317747092665417) with: {'alpha': 0.3}
-101.214828431896 (5.468405617195191) with: {'alpha': 0.5}
-102.7655548849884 (5.657736314159434) with: {'alpha': 0.7}
-104.83351589710358 (5.882979414563681) with: {'alpha': 0.9}
-106.06155775373918 (6.00807679425553) with: {'alpha': 1.0}
-110.51960757910737 (6.430416534616414) with: {'alpha': 1.3}
-114.13747021460814 (6.747422882571577) with: {'alpha': 1.5}
-118.27230635373597 (7.090345835854821) with: {'alpha': 1.7}
-125.44322979889597 (7.649535416840401) with: {'alpha': 2.0}


Close (a mean score of about -99.66 for the best Lasso versus -99.60 for linear regression), but linear regression is still the best-performing algorithm.

## Prediction Time

In [7]:
# Fit the selected model on the whole training split.
model = LinearRegression().fit(X_train, Y_train)

# Predict the held-out validation rows and report R^2. The features are used
# as-is; no additional transformation is applied before predicting.
predictions = model.predict(X_validation)
validation_r2 = r2_score(Y_validation, predictions)
print(validation_r2)

0.8974414869124636
