In [3]:
from sklearn import datasets
import numpy as np
import pandas as pd
import bokeh
from bokeh.plotting import output_notebook

from datascienceutils import analyze
from datascienceutils import explain
from datascienceutils import predictiveModels as pm

# Route bokeh plots to inline notebook output.
output_notebook()

# Fetch the diabetes dataset bundled with scikit-learn.
diabetes = datasets.load_diabetes()

# Keep a single feature (column index 2) as a 2-D column so it can be
# passed to estimators that expect an (n_samples, n_features) matrix.
diabetes_X = diabetes.data[:, 2:3]

# Hold out the last 20 rows for testing; everything before them is training data.
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]

# Apply the same 20-row hold-out to the targets so rows stay aligned.
diabetes_y_train, diabetes_y_test = diabetes.target[:-20], diabetes.target[-20:]


In [5]:
# Display the raw feature matrix of the loaded dataset.
diabetes.data

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ..., 
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

In [None]:
# Keep the regression targets under a short name.
target = diabetes.target

# Label every column of the full feature matrix so it can be referenced by name.
df = pd.DataFrame(
    data=diabetes.data,
    columns=['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'],
)

# Run the correlation analysis between the 'age' and 'bmi' columns.
analyze.correlation_analyze(df, 'age', 'bmi')

In [None]:

# Build the 'LinearRegression' model from the training split.
lin_model = pm.train(diabetes_X_train, diabetes_y_train, 'LinearRegression')

print('Coefficients: \n', lin_model.coef_)

# Mean squared error on the held-out rows.
lin_residuals = lin_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(lin_residuals ** 2))

# Model score on the held-out rows (labelled a variance score; 1 is a perfect prediction).
lin_score = lin_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % lin_score)

# NOTE(review): df holds all ten feature columns while lin_model was trained on a
# single feature -- confirm this is what explain.interpret expects.
explain.interpret(df, lin_model)

In [None]:
# Build the 'logisticRegression' model from the training split.
log_model = pm.train(diabetes_X_train, diabetes_y_train, 'logisticRegression')

#print('Coefficients: \n', log_model.coef_)

# Mean squared error on the held-out rows.
log_residuals = log_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(log_residuals ** 2))

# NOTE(review): if pm.train returns a classifier here, score() is presumably a
# classification accuracy rather than a variance score -- confirm before comparing.
log_score = log_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % log_score)

In [None]:
# Build the 'randomForest' model from the training split.
rf_model = pm.train(diabetes_X_train, diabetes_y_train, 'randomForest')

#print('Coefficients: \n', rf_model.coef_)

# Mean squared error on the held-out rows.
rf_residuals = rf_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(rf_residuals ** 2))

# Model score on the held-out rows (labelled a variance score; 1 is a perfect prediction).
rf_score = rf_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % rf_score)

In [None]:
# Build the 'sgd' model from the training split.
sgd_model = pm.train(diabetes_X_train, diabetes_y_train, 'sgd')
# NOTE(review): the LinearRegression cell uses the object returned by pm.train
# without calling fit, so this explicit fit may be a redundant second training
# pass -- confirm against pm.train.
sgd_model.fit(diabetes_X_train, diabetes_y_train)

# Mean squared error on the held-out rows.
sgd_residuals = sgd_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(sgd_residuals ** 2))

# Model score on the held-out rows (labelled a variance score; 1 is a perfect prediction).
sgd_score = sgd_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % sgd_score)

In [None]:
# Build the 'xgboost' model from the training split.
xgb_model = pm.train(diabetes_X_train, diabetes_y_train, 'xgboost')
# NOTE(review): the LinearRegression cell uses the object returned by pm.train
# without calling fit, so this explicit fit may be a redundant second training
# pass -- confirm against pm.train.
xgb_model.fit(diabetes_X_train, diabetes_y_train)

# Mean squared error on the held-out rows.
xgb_residuals = xgb_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(xgb_residuals ** 2))

# Model score on the held-out rows (labelled a variance score; 1 is a perfect prediction).
xgb_score = xgb_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % xgb_score)

In [None]:
# Build the 'svm' model from the training split.
svm_model = pm.train(diabetes_X_train, diabetes_y_train, 'svm')
# NOTE(review): the LinearRegression cell uses the object returned by pm.train
# without calling fit, so this explicit fit may be a redundant second training
# pass -- confirm against pm.train.
svm_model.fit(diabetes_X_train, diabetes_y_train)

# Mean squared error on the held-out rows.
svm_residuals = svm_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(svm_residuals ** 2))

# Model score on the held-out rows (labelled a variance score; 1 is a perfect prediction).
svm_score = svm_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % svm_score)

In [None]:
# Build the 'bernoulliNB' model from the training split.
bnb_model = pm.train(diabetes_X_train, diabetes_y_train, 'bernoulliNB')
# NOTE(review): the LinearRegression cell uses the object returned by pm.train
# without calling fit, so this explicit fit may be a redundant second training
# pass -- confirm against pm.train.
bnb_model.fit(diabetes_X_train, diabetes_y_train)

# Mean squared error on the held-out rows.
bnb_residuals = bnb_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(bnb_residuals ** 2))

# NOTE(review): if pm.train returns a classifier here, score() is presumably a
# classification accuracy rather than a variance score -- confirm before comparing.
bnb_score = bnb_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % bnb_score)

In [None]:
# Build the 'knn' model from the training split.
knn_model = pm.train(diabetes_X_train, diabetes_y_train, 'knn')
# NOTE(review): the LinearRegression cell uses the object returned by pm.train
# without calling fit, so this explicit fit may be a redundant second training
# pass -- confirm against pm.train.
knn_model.fit(diabetes_X_train, diabetes_y_train)

# Mean squared error on the held-out rows.
knn_residuals = knn_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(knn_residuals ** 2))

# Model score on the held-out rows (labelled a variance score; 1 is a perfect prediction).
knn_score = knn_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % knn_score)

In [None]:
# Build the 'kde' model from the training split.
kde_model = pm.train(diabetes_X_train, diabetes_y_train, 'kde')
# NOTE(review): the LinearRegression cell uses the object returned by pm.train
# without calling fit, so this explicit fit may be a redundant second training
# pass -- confirm against pm.train.
kde_model.fit(diabetes_X_train, diabetes_y_train)

# Mean squared error on the held-out rows.
# NOTE(review): assumes the 'kde' model exposes predict() -- confirm against pm.train.
kde_residuals = kde_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(kde_residuals ** 2))

# Model score on the held-out rows (labelled a variance score; meaning for a
# density model is unverified -- TODO confirm).
kde_score = kde_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % kde_score)

In [None]:
# NOTE(review): this cell repeats the preceding 'kde' cell verbatim and rebinds
# kde_model -- consider dropping one of the two.
kde_model = pm.train(diabetes_X_train, diabetes_y_train, 'kde')
# The explicit fit mirrors the other cells; it may be redundant if pm.train
# already returns a fitted model -- confirm.
kde_model.fit(diabetes_X_train, diabetes_y_train)

# Mean squared error on the held-out rows.
kde_residuals = kde_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(kde_residuals ** 2))

# Model score on the held-out rows (labelled a variance score).
kde_score = kde_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % kde_score)

In [None]:
# Build the 'multinomialNB' model from the training split.
mnb_model = pm.train(diabetes_X_train, diabetes_y_train, 'multinomialNB')

# NOTE(review): assumes the returned model exposes coef_ -- confirm for the
# installed library version.
print('Coefficients: \n', mnb_model.coef_)

# Mean squared error on the held-out rows.
mnb_residuals = mnb_model.predict(diabetes_X_test) - diabetes_y_test
print("Mean squared error: %.2f" % np.mean(mnb_residuals ** 2))

# NOTE(review): if pm.train returns a classifier here, score() is presumably a
# classification accuracy rather than a variance score -- confirm before comparing.
mnb_score = mnb_model.score(diabetes_X_test, diabetes_y_test)
print('Variance score: %.2f' % mnb_score)

In [None]:
# Load the dataset again, this time directly as a (features, targets) pair.
X, y = datasets.load_diabetes(return_X_y=True)

# Show the dimensions of the feature matrix.
np.shape(X)

## Linear Regression comes out on top with MSE: 2548.07
## That is expected, since this is a linear-regression dataset in the first place

## Of the non-linear models:
## xgboost clearly takes the cake with MSE: 4906 (runs in 5.94s),
## followed by knn with MSE: 5640.65


## I heard about [lightgbm](https://github.com/ArdalanM/pyLightGBM) and wanted to try it.
## Result: MSE: 5066.17, and it runs in 194ms

## Wow, that's roughly 30x faster than xgboost (194ms vs 5.94s) with only about 3% more error (MSE 5066.17 vs 4906). Maybe lightgbm works very well for linear patterns. I need to check other patterns; if it keeps similar trade-offs there, it will change the market.