<a href="https://colab.research.google.com/github/natnuo/algoverse-ai-intro/blob/main/AlgoverseLab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Lab Specification: https://docs.google.com/document/d/1e-zwfNoTXTPPL6OA6uHI2j3IRI7by6ULjJY4WL4HgJ8/edit

Partner Project with Yashwant

In [12]:
from sklearn.datasets import load_diabetes

# Load the diabetes dataset that ships with scikit-learn.
diabetes_data = load_diabetes()

# Print the description text bundled with the dataset.
print(diabetes_data["DESCR"])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes_data.data,
    diabetes_data.target,
    test_size=0.20,
    random_state=42,
)

In [14]:
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [15]:
# One seed shared by every estimator that accepts `random_state`, so that
# repeated runs of the comparison produce the same numbers.
RANDOM_STATE = 42


def _poly_pipeline(step_name, estimator):
    """Wrap ``estimator`` behind degree-2 polynomial features and standard scaling.

    Args:
        step_name: Name given to the final (estimator) step of the pipeline.
        estimator: Unfitted regressor placed after the feature-engineering steps.

    Returns:
        An unfitted ``Pipeline`` with the steps
        ``poly_features`` -> ``scaler`` -> ``step_name``.
    """
    return Pipeline([
        ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
        ('scaler', StandardScaler()),
        (step_name, estimator),
    ])


# Candidate regressors keyed by their display name.
models = {
    "Linear Regression": LinearRegression(),
    "Linear Regression With Feature Engineering": _poly_pipeline('lin_reg', LinearRegression()),
    "Stochastic Gradient Descent": SGDRegressor(random_state=RANDOM_STATE, max_iter=10000),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "Random Forest With Feature Engineering": _poly_pipeline(
        'rf', RandomForestRegressor(random_state=RANDOM_STATE)
    ),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Elastic Net Regression": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "Support Vector Regression": SVR(kernel='rbf', C=1.0, epsilon=0.1),
    "Extra Trees Regression": ExtraTreesRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting Regression": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "AdaBoost Regression": AdaBoostRegressor(random_state=RANDOM_STATE),
    "K-Nearest Neighbors Regression": KNeighborsRegressor(n_neighbors=5)
}

In [16]:
import pandas as pd

In [17]:
# Fit every candidate on the training split and score it on the held-out test split.
# Metric rows are gathered in a plain list and converted to a DataFrame once at the
# end, rather than enlarging the DataFrame with `.loc` on every iteration.
metric_rows = []

for model_name, model in models.items():
  print("Running", model_name)

  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  metric_rows.append({
      'Model': model_name,
      'R^2': r2_score(y_test, y_pred),
      'MSE': mean_squared_error(y_test, y_pred),
      'MAE': mean_absolute_error(y_test, y_pred),
  })

# Passing `columns` explicitly fixes the column order and keeps the expected
# columns even when `models` is empty.
results = pd.DataFrame(metric_rows, columns=['Model', 'R^2', 'MSE', 'MAE'])

print("Done")

Running Linear Regression
Running Linear Regression With Feature Engineering
Running Stochastic Gradient Descent
Running Decision Tree
Running Random Forest
Running Random Forest With Feature Engineering
Running Ridge Regression
Running Lasso Regression
Running Elastic Net Regression
Running Support Vector Regression
Running Extra Trees Regression
Running Gradient Boosting Regression
Running AdaBoost Regression
Running K-Nearest Neighbors Regression
Done


In [18]:
def highlight(s, f):
    """Return one CSS string per element of ``s``, marking those equal to ``f()``.

    Args:
        s: Series-like of values, compared element-wise against ``f()`` with ``==``.
        f: Zero-argument callable producing the reference value (e.g. ``s.max``).

    Returns:
        A list with a bold / yellow-background style for each matching element
        and an empty string for every other element.
    """
    marked_style = 'font-weight: bold; background-color: yellow;'
    matches = s == f()
    styles = []
    for is_match in matches:
        styles.append(marked_style if is_match else '')
    return styles

def highlight_max(s):
    """Return per-element styles for ``s`` that mark its maximum value(s)."""
    largest_of = s.max  # passed uncalled; `highlight` invokes it
    return highlight(s, largest_of)

def highlight_min(s):
    """Return per-element styles for ``s`` that mark its minimum value(s)."""
    smallest_of = s.min  # passed uncalled; `highlight` invokes it
    return highlight(s, smallest_of)

In [19]:
# Mark the largest value in the second column and the smallest value in each
# remaining column of the results table.
styled_results = (
    results.style
    .apply(highlight_max, subset=results.columns[1])
    .apply(highlight_min, subset=results.columns[2:])
)
styled_results

Unnamed: 0,Model,R^2,MSE,MAE
0,Linear Regression,0.452603,2900.193628,42.794095
1,Linear Regression With Feature Engineering,0.408236,3135.252859,43.650379
2,Stochastic Gradient Descent,0.459821,2861.948433,43.053987
3,Decision Tree,0.060654,4976.797753,54.52809
4,Random Forest,0.442823,2952.010589,44.053034
5,Random Forest With Feature Engineering,0.462027,2850.262342,44.402022
6,Ridge Regression,0.419153,3077.415939,46.138858
7,Lasso Regression,0.471855,2798.193485,42.854428
8,Elastic Net Regression,0.098654,4775.466767,60.349843
9,Support Vector Regression,0.182114,4333.285955,56.023724


In [20]:
import pandas as pd

# Put the features and the target in one frame so the correlation matrix
# includes each feature's correlation with the target.
df = pd.DataFrame(
    diabetes_data.data,
    columns=diabetes_data.feature_names,
).assign(target=diabetes_data.target)

# Pairwise correlations between all columns (features and target).
df.corr()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
age,1.0,0.173737,0.185085,0.335428,0.260061,0.219243,-0.075181,0.203841,0.270774,0.301731,0.187889
sex,0.173737,1.0,0.088161,0.24101,0.035277,0.142637,-0.37909,0.332115,0.149916,0.208133,0.043062
bmi,0.185085,0.088161,1.0,0.395411,0.249777,0.26117,-0.366811,0.413807,0.446157,0.38868,0.58645
bp,0.335428,0.24101,0.395411,1.0,0.242464,0.185548,-0.178762,0.25765,0.39348,0.39043,0.441482
s1,0.260061,0.035277,0.249777,0.242464,1.0,0.896663,0.051519,0.542207,0.515503,0.325717,0.212022
s2,0.219243,0.142637,0.26117,0.185548,0.896663,1.0,-0.196455,0.659817,0.318357,0.2906,0.174054
s3,-0.075181,-0.37909,-0.366811,-0.178762,0.051519,-0.196455,1.0,-0.738493,-0.398577,-0.273697,-0.394789
s4,0.203841,0.332115,0.413807,0.25765,0.542207,0.659817,-0.738493,1.0,0.617859,0.417212,0.430453
s5,0.270774,0.149916,0.446157,0.39348,0.515503,0.318357,-0.398577,0.617859,1.0,0.464669,0.565883
s6,0.301731,0.208133,0.38868,0.39043,0.325717,0.2906,-0.273697,0.417212,0.464669,1.0,0.382483


In [21]:
from sklearn.tree import DecisionTreeClassifier

# Split the combined frame back into a feature matrix and a target vector.
# NOTE(review): this cell imports DecisionTreeClassifier, while the dataset
# description calls the target a quantitative measure — confirm a regressor is
# not what is intended wherever X and y are consumed.
X = df.drop('target', axis=1)  # every column except 'target'
y = df['target']