# Dữ liệu chẩn đoán bệnh tiểu đường

## .. _diabetes_dataset:
## 
## Diabetes dataset
## ----------------
## 
## Ten baseline variables, age, sex, body mass index, average blood
## pressure, and six blood serum measurements were obtained for each of n =
## 442 diabetes patients, as well as the response of interest, a
## quantitative measure of disease progression one year after baseline.
## 
## **Data Set Characteristics:**
## 
##   :Number of Instances: 442
## 
##   :Number of Attributes: First 10 columns are numeric predictive values
## 
##   :Target: Column 11 is a quantitative measure of disease progression one year after baseline
## 
##   :Attribute Information:
##       - age     age in years
##       - sex
##       - bmi     body mass index
##       - bp      average blood pressure
##       - s1      tc, total serum cholesterol
##       - s2      ldl, low-density lipoproteins
##       - s3      hdl, high-density lipoproteins
##       - s4      tch, total cholesterol / HDL
##       - s5      ltg, possibly log of serum triglycerides level
##       - s6      glu, blood sugar level
## 
## Note: Each of these 10 feature variables has been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).
## 
## Source URL:
## https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
## 
## For more information see:
## Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.
## (https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)

Import các thư viện

In [None]:
# import thư viện matplotlib để vẽ biểu đồ
import matplotlib.pyplot as plt
# import thư viện numpy để xử lý số học
import numpy as np
from sklearn import datasets, linear_model, feature_selection
from sklearn.metrics import mean_squared_error, r2_score


## Hồi quy tuyến tính (Linear Regression)

In [None]:
# Load the diabetes dataset directly as a (features, target) pair
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)


In [None]:
# Display the feature matrix as the cell output
diabetes_X


In [None]:
# Keep only the feature at column index 2 (listed third, as "bmi", in the
# dataset description). np.newaxis keeps the result two-dimensional, with
# one row per sample and a single feature column.
diabetes_X = diabetes_X[:, np.newaxis, 2]


In [None]:
# Display the selected feature laid out as a single row. The reshaped array
# is not assigned, so diabetes_X itself keeps its shape.
diabetes_X.reshape(1,len(diabetes_X))

In [None]:
# Display the target values as the cell output
diabetes_y


In [18]:
# Pearson correlation between the selected feature and the target,
# computed by scikit-learn (one coefficient per feature column)
feature_selection.r_regression(diabetes_X,diabetes_y)

array([0.58645013])

In [19]:
# Same coefficient computed with the hand-written correlation() helper.
# The cell that defines correlation() must have been run before this one.
correlation(diabetes_X,diabetes_y)

array([0.58645013])

In [None]:
def correlation(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    The coefficient is ``sum((x_i - mean_x) * (y_i - mean_y))`` divided by
    ``sqrt(sum((x_i - mean_x)**2) * sum((y_i - mean_y)**2))``.

    Only ``len()``, iteration and element-wise arithmetic are used, so the
    inputs may be plain sequences of numbers or array-like objects whose
    elements support ``+``, ``-``, ``*``, ``/`` and ``**``. The type of the
    result follows from that arithmetic (a ``float`` for lists of numbers).

    Args:
        x: Sequence of observations.
        y: Sequence of observations paired with ``x``; must have the same
            length as ``x``.

    Returns:
        The correlation coefficient of the paired observations.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length or are empty.
        ZeroDivisionError: For plain Python numbers, if either input has
            zero variance (the denominator is then zero).
    """
    n = len(x)
    # Pairs are matched by position, so unequal lengths would either drop
    # observations silently or run off the end of the shorter input.
    if n != len(y):
        raise ValueError(
            "x and y must have the same length, got %d and %d" % (n, len(y))
        )
    if n == 0:
        raise ValueError("x and y must not be empty")

    mean_x = sum(x) / float(n)
    mean_y = sum(y) / float(n)
    dev_x = [xi - mean_x for xi in x]  # x_i - mean_x
    dev_y = [yi - mean_y for yi in y]  # y_i - mean_y

    numerator = sum(dx * dy for dx, dy in zip(dev_x, dev_y))
    # Sums of squared deviations (not standard deviations): the 1/n factors
    # of the covariance and the variances cancel in the ratio.
    sum_sq_x = sum(dx ** 2.0 for dx in dev_x)
    sum_sq_y = sum(dy ** 2.0 for dy in dev_y)
    denominator = (sum_sq_x * sum_sq_y) ** 0.5
    return numerator / denominator

In [None]:
# Display the single-feature matrix again before splitting it
diabetes_X


In [None]:
# Split the data into training/testing sets: the last 20 samples are held
# out for testing, everything before them is used for training
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]


In [None]:
# Split the targets into training/testing sets with the same cut as the
# features (last 20 samples for testing) so rows stay aligned
diabetes_y_train = diabetes_y[:-20]
diabetes_y_test = diabetes_y[-20:]


In [None]:
# Create the linear regression estimator with its default settings
regr = linear_model.LinearRegression()


In [None]:
# Train the model on the training features and targets
regr.fit(diabetes_X_train, diabetes_y_train)


In [None]:
# Predict the targets for the held-out test samples
diabetes_y_pred = regr.predict(diabetes_X_test)


In [None]:
# Report the fitted coefficients of the linear model
print("Coefficients: \n", regr.coef_)
# Mean squared error between the true and predicted test targets
print("Mean squared error: %.2f" %
      mean_squared_error(diabetes_y_test, diabetes_y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" %
      r2_score(diabetes_y_test, diabetes_y_pred))

# Plot outputs: test samples as black points, model predictions as a blue line
plt.scatter(diabetes_X_test, diabetes_y_test, color="black")
plt.plot(diabetes_X_test, diabetes_y_pred, color="blue", linewidth=3)

# Passing empty tuples removes the tick marks and labels on both axes
plt.xticks(())
plt.yticks(())

plt.show()


## Hồi quy tăng cường Gradient 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [None]:
# Load the full dataset object this time (its description and feature names
# are used further down), then pull out the feature matrix and the target
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target


In [None]:
# Print the description text stored with the dataset
print(diabetes['DESCR'])


In [None]:
# Hold out 10% of the samples for testing; the fixed random_state makes the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13
)
# Keyword arguments for the gradient boosting regressor built in the next
# cell; "n_estimators" is also reused there as the number of boosting
# iterations to plot
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}


In [None]:
# Build the gradient boosting regressor from the params dict and fit it on
# the training split
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

# Evaluate the fitted model on the held-out test split
mse = mean_squared_error(y_test, reg.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))


In [None]:
# Test-set error after each boosting iteration. staged_predict yields the
# model's prediction after every stage, so one entry is filled per stage.
# The error is computed with mean_squared_error rather than the estimator's
# `loss_` attribute, which was deprecated in scikit-learn 1.1 and removed in
# 1.3; for loss="squared_error" both give the same value.
test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = mean_squared_error(y_test, y_pred)

# 1-based iteration numbers shared by both curves
boosting_iterations = np.arange(params["n_estimators"]) + 1

# Plot training and test error against the boosting iteration
fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title("Deviance")
plt.plot(
    boosting_iterations,
    reg.train_score_,
    "b-",
    label="Training Set Deviance",
)
plt.plot(boosting_iterations, test_score, "r-", label="Test Set Deviance")
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Deviance")
fig.tight_layout()
plt.show()


In [None]:
# Left panel: the fitted model's built-in feature importances, sorted in
# ascending order so the most important feature ends up as the top bar
feature_importance = reg.feature_importances_
sorted_idx = np.argsort(feature_importance)
# Bar centres at 0.5, 1.5, ... (one per feature)
pos = np.arange(sorted_idx.shape[0]) + 0.5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])
plt.title("Feature Importance (MDI)")

# Right panel: permutation importance measured on the test split, with 10
# repeats per feature and a fixed seed for reproducibility
result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
# sorted_idx is reassigned: features are now ordered by mean permutation
# importance instead of by the model's own importances
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
# Transposed so each box summarises the repeats of one feature
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(diabetes.feature_names)[sorted_idx],
)
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Don't plot the sex data. Build a new list instead of calling .remove() on
# diabetes['feature_names']: that list belongs to the dataset object, so
# mutating it would drop 'sex' for every other cell that reads the feature
# names and would raise ValueError when this cell is run a second time.
all_features = list(diabetes['feature_names'])
features = [name for name in all_features if name != 'sex']

# Plot each remaining feature against the target on a 3x3 grid
fig, axs = plt.subplots(3, 3)
fig.suptitle('Diabetes Dataset')
for ax, feature in zip(axs.flat, features):
    # diabetes['data'] is a NumPy array (the dataset was loaded without
    # as_frame=True), so a column is selected by position, not by name
    column = all_features.index(feature)
    ax.scatter(diabetes['data'][:, column], diabetes['target'], s=1)
    ax.set_xlabel(feature)
    ax.set_ylabel('target')
plt.tight_layout()
plt.show()
