# Hands on

In [1]:
%pylab inline
import pandas as pd
import os
import seaborn as sns
from sklearn.preprocessing import StandardScaler

Populating the interactive namespace from numpy and matplotlib


## Download the dataset

In [None]:
! conda install -n MLPB wget
wget https://doi.org/10.1371/journal.pone.0183228.s006 -O Metabolite_patterns_predicting_sex_and_age.xlsx
! mkdir ../input
! mv Metabolite_patterns_predicting_sex_and_age.xlsx ../input

## Load the data into memory

In [11]:
# Path of the spreadsheet that the download step moved into ../input.
filename = '../input/Metabolite_patterns_predicting_sex_and_age.xlsx'

In [14]:
# Target vector: the `age` column, read from the first three spreadsheet columns.
Y = pd.read_excel(
    filename,
    usecols=[0, 1, 2],
    skiprows=[0, 1, 2, 3, 5],
    skipfooter=7,
)['age'].values
Y

In [15]:
# Feature matrix: spreadsheet columns 4..440 (0-based), header taken from row 3.
X = pd.read_excel(
    filename,
    header=3,
    skiprows=[4, 5],
    skipfooter=7,
    usecols=range(4, 441),
)
X

# EDA
Spend a few minutes on getting familiar with the dataset.

In [None]:
# info is a method: it must be called to print the column/dtype/non-null summary.
X.info()

In [None]:
# The per-column dtype attribute is `dtypes`; `types` does not exist on a DataFrame.
X.dtypes

In [None]:
# Column-wise mean of every feature.
X.mean(axis=0)

In [None]:
# Column-wise standard deviation of every feature.
X.std(axis=0)

In [None]:
# Per column: does it contain at least one missing value?
X.isna().any()

In [None]:
# fillna returns a new frame, so the check has to run on that result;
# calling it on X afterwards would still report the original NaNs.
# X itself is deliberately left untouched here.
X.fillna(0).isna().any()

## Plot histograms

### Option 1

In [None]:
# Histogram of the target with 20 bins.
hist(x=Y, bins=20)
grid()
ylabel('Count')
title('Histogram of the target value $y$')

### Option 2

In [None]:
# Seaborn's combined histogram / density view of the target.
# NOTE(review): distplot is deprecated in recent seaborn releases — consider histplot once the environment allows.
sns.distplot(a=Y)

In [None]:
# Scatter individual features against the target on a 6x5 grid with a shared y-axis.
fig, axes = subplots(6, 5, figsize=(10, 4), sharey=True)
axes = axes.flatten()

df = X.copy()
df['y'] = Y

# Only as many features as there are panels can be drawn, so take the first
# len(axes) columns of X.
column_names = X.columns[:len(axes)]

for i, col in enumerate(column_names):
    # x/y are passed by keyword: current seaborn rejects them as positional arguments.
    sns.scatterplot(x=col, y='y', ax=axes[i], data=df, s=10)

suptitle('Relationship of features with $y$', y=1.03)

#### Replace the missing values in X with the column-wise median

In [None]:
# Which columns still contain missing values?
X.isna().any()

In [None]:
# Number of missing values per column.
X.isna().sum()

Replace values with median

In [None]:
# Impute each missing entry with the median of its own column.
X = X.fillna(value=X.median(axis=0))

Check values are replaced:

In [None]:
# After imputation every entry here should be False.
X.isna().any()

#### Scale the features in X so that the column-wise mean is 0 and the standard deviation is 1.
Use scikit-learn `StandardScaler()` for this with the code given below.

In [233]:
# Standardise every column and write the result back into the existing frame,
# so the column labels and index of X are kept.
scaled_values = StandardScaler().fit_transform(X)
X.loc[:, :] = scaled_values

#### Validate that the scaling worked.

In [None]:
# StandardScaler divides by the population standard deviation (ddof=0), while
# pandas defaults to the sample estimate (ddof=1). Use ddof=0 so a correctly
# scaled column reports exactly 1.
X.std(ddof=0)

In [None]:
# Column means after scaling; each should be numerically zero.
X.mean(axis=0)

In [None]:
# Box plot of the per-column means after scaling.
sns.boxplot(data=X.mean(axis=0))
ylabel('Value')
xlabel('Mean')

### Train a Linear Model using a train-test split of 0.8 / 0.2

In [43]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=23463,
)

# Report the shape of each resulting subset
print(f'X_train: {X_train.shape} y_train: {y_train.shape}')  # training subsets
print(f'X_test:  {X_test.shape} y_test:  {y_test.shape}')   # test subsets

In [45]:
# Linear regression estimator with its default settings.
model = LinearRegression()

In [46]:
# Fit the estimator on the training subset; fit() hands back the fitted estimator.
model = model.fit(X=X_train, y=y_train)

#### Plot the prediction over the true values for the test set

In [None]:
# Predict the target for the held-out test samples
y_predict = model.predict(X=X_test)
y_predict

In [None]:
figure(figsize=(5, 5))

# True target on the x-axis, model output on the y-axis
scatter(x=y_test, y=model.predict(X_test), label='Test')

ylabel('Prediction')
xlabel('True')
title('Linear Regression')

In [51]:
# Define rmse
def rmse(true, pred):
    """Return the root-mean-squared error between `true` and `pred`.

    Written against numpy directly so it does not rely on the bare `sqrt`
    name that `%pylab` injects into the namespace.

    Parameters
    ----------
    true : array-like
        Reference values, 1-D or 2-D (samples x outputs).
    pred : array-like
        Predicted values with the same number of samples and outputs.

    Returns
    -------
    numpy.float64
        Square root of the mean of the squared differences.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes do not match.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)

    # Treat a 1-D vector and an (n, 1) column as the same thing.
    if true.ndim == 1:
        true = true.reshape(-1, 1)
    if pred.ndim == 1:
        pred = pred.reshape(-1, 1)

    if true.shape != pred.shape:
        raise ValueError(
            f'true and pred have mismatched shapes: {true.shape} vs {pred.shape}'
        )
    if true.size == 0:
        raise ValueError('rmse requires at least one sample')

    return np.sqrt(np.mean((true - pred) ** 2))

In [None]:
# Score the fitted model on both subsets and report the test RMSE
test_score = model.score(X=X_test, y=y_test)
train_score = model.score(X=X_train, y=y_train)

print(f'Train score: {train_score:8.2f}\nTest score:  {test_score:8.2f}')
print('Test RMSE: {:2.2f}'.format(rmse(y_test, y_predict)))

### Train a Lasso model and vary the regularization parameter $\alpha$ from 0 to 5

In [63]:
from sklearn.linear_model import Lasso



#### Plot the model scores over alphas

#### Retrain a model with the best value for $\alpha$

#### Create a dataset from non-zero features and add the values for age
Which features are most important to predict the age?

In [65]:
df_reduced = X.loc[:, model.coef_ > 0].copy()
df_reduced['age'] = Y

In [None]:
df_reduced

In [None]:
df_reduced.columns.tolist()[np.argmax(model.coef_[model.coef_ > 0])]

## Plot the correlation

In [None]:
import seaborn as sns
# Pairwise correlations of the df_reduced columns on a fixed [-1, 1] colour scale.
figure(figsize=(15, 12))
sns.heatmap(data=df_reduced.corr(), cmap=cm.bwr, vmin=-1, vmax=1)
title('Correlation plot')