# MLOps homework assignment 2 - Diabetes

First, let's start by installing (if needed) and importing the necessary libraries.

In [1]:
try:
    import numpy as np
    print('NumPy already installed, only imported')
except:
    !pip install numpy
    import numpy as np
    print('NumPy was not installed, installed and imported')
      
# pyplot as plt
try:
    import matplotlib.pyplot as plt
    print('PyPlot already installed, only imported')
except:
    !pip install matplotlib
    import matplotlib.pyplot as plt
    print('PyPlot was not installed, installed and imported')

# pandas as pd   
try:
    import pandas as pd
    print('pandas already installed, only imported')
except:
    !pip install pandas
    import pandas as pd
    print('pandas was not installed, installed and imported')

try:
    import sklearn
    print('sklearn already installed, only imported')
except:
    !pip install scikit-learn
    import sklearn
    print('sklearn was not installed, installed and imported')

try:
    import seaborn as sns
    print('seaborn already installed, only imported')
except:
    !pip install seaborn
    import seaborn as sns
    print('seaborn was not installed, installed and imported')
    
from sklearn.pipeline import Pipeline

NumPy already installed, only imported
PyPlot already installed, only imported
pandas already installed, only imported
sklearn already installed, only imported
seaborn already installed, only imported


Now, let's load the diabetes dataset and check it out

In [2]:
# The TSV uses upper-case headers; the rest of the notebook works with
# lower-case feature names and calls the target column 'progression'.
feature_headers = ['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
column_names = {header: header.lower() for header in feature_headers}
column_names['Y'] = 'progression'

diabetes_DF = pd.read_csv('diabetes.tsv', sep='\t')
diabetes_DF = diabetes_DF.rename(columns=column_names)

diabetes_DF.head(5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,progression
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


## Find top 3 predictive features according to 3 different methods of measuring predictiveness

### Method #1: Feature importance

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler

# Features are the first ten columns (age ... s6); the target is 'progression'.
X, y = diabetes_DF.iloc[:, 0:10], diabetes_DF['progression']

# The raw TSV columns are on very different scales (e.g. s5 is around 4-5 while
# s1 is in the hundreds), so the size of an unscaled coefficient mostly reflects
# the column's units. Standardise first so coefficient magnitudes are comparable.
X_scaled = StandardScaler().fit_transform(X)

# Ridge regression with the penalty strength chosen by cross-validation.
ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X_scaled, y)
importance = np.abs(ridge.coef_)
feature_names = np.array(X.columns)
plt.bar(height=importance, x=feature_names)
plt.title("Feature importances via coefficients")
plt.show()

### Method #2: Correlation matrix

In [None]:
# Correlate the numeric columns only. Once the categorical `bp_category` /
# `bmi_category` columns have been added to `diabetes_DF`, calling `.corr()` on
# the whole frame can fail because their labels are not numbers.
correlation_matrix = diabetes_DF.select_dtypes(include='number').corr()

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')

### Method #3: Univariate feature selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

X = diabetes_DF.iloc[:, 0:10]
y = diabetes_DF['progression']

selector = SelectKBest(score_func=f_regression, k=5)  # keep the 5 best-scoring features
X_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support(indices=True)]

# `get_support` lists the kept columns in their original column order, which
# does not show which of them scored best. Rank every feature by its F-score so
# the top three can be read off directly.
f_scores = pd.Series(selector.scores_, index=X.columns).sort_values(ascending=False)

# Print selected features from univariate selection
print("Selected Features from Univariate Selection:")
print(selected_features)
print("F-scores (best first):")
print(f_scores)

### Method #4: Mutual Information Scores

In [None]:
from sklearn.feature_selection import mutual_info_regression

# The estimator adds a small amount of random noise to continuous features, so
# the scores (and potentially the ranking) change from run to run unless the
# seed is fixed. 42 matches the seed used for the train/test split.
mi_scores = mutual_info_regression(X, y, random_state=42)
mi_scores = pd.Series(mi_scores, index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)

# Print mutual information scores
print("Mutual Information Scores:")
print(mi_scores)

Now that we have tried various methods for determining the best features, let's take the top three from each one of them and then determine which three appear the most!

Method 1: s1, s5, bmi<br>
Method 2: bmi, s5, bp<br>
Method 3: bmi, s5, bp<br>
Method 4: bmi, s5, s6

From this, we can conclude that the top 3 features are: **bmi**, **s5** and **bp**

## Design 2 new features

Given that our top three features are bmi, s5 and bp, it is only appropriate that the two new features be derived from them.

For the 2 new features, I want to choose blood pressure category and bmi category. For the blood pressure category, I want to add 'Normal', 'Elevated', 'High' and 'Extremely High' categories. As for the bmi category, I want to add 'Underweight', 'Normal Weight', 'Overweight' and 'Obese' categories.

In [3]:
# Blood-pressure category. pd.cut's default bins are right-closed, so the
# classes are (0, 80], (80, 90], (90, 120] and (120, 1000].
diabetes_DF['bp_category'] = pd.cut(diabetes_DF['bp'], bins=[0, 80, 90, 120, 1000],
                           labels=['Normal', 'Elevated', 'High', 'Extremely High'])


# BMI category. The standard adult BMI classes are lower-inclusive: 18.5 is
# already 'Normal Weight', 25 is 'Overweight' and 30 is 'Obese'. `right=False`
# makes the bins left-closed ([18.5, 25), [25, 30), ...) so values that fall
# exactly on a threshold land in the correct class instead of the one below.
diabetes_DF['bmi_category'] = pd.cut(diabetes_DF['bmi'], bins=[0, 18.5, 25, 30, 100],
                            labels=['Underweight', 'Normal Weight', 'Overweight', 'Obese'],
                            right=False)

In [4]:
# Preview the first five rows (the default for `head`), now with the two category columns.
diabetes_DF.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,progression,bp_category,bmi_category
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151,High,Obese
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75,Elevated,Normal Weight
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141,High,Obese
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206,Elevated,Overweight
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135,High,Normal Weight


## Pre-process necessary features

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer

In [16]:
# Missing values per column (`isna` is the same check as `isnull`).
diabetes_DF.isna().sum()

age             0
sex             0
bmi             0
bp              0
s1              0
s2              0
s3              0
s4              0
s5              0
s6              0
progression     0
bp_category     0
bmi_category    0
dtype: int64

In [134]:
 numeric_features = ['bmi', 's5', 'bp']
#numeric_features = ['s5']
numeric_transformer = Pipeline(steps = [("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])

In [123]:
#categorical_features = ['bp_category', 'bmi_category']
#categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder(handle_unknown="ignore")), ("selector", SelectPercentile(chi2, percentile=50))])

In [135]:
# Only the numeric branch is wired in; no categorical branch is active here.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
    ]
)

## Predict how disease progresses 1 year from datapoint

In [136]:
# Preprocessing followed by ordinary least-squares regression. The final step
# is a regressor even though it is labelled "classifier".
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LinearRegression()),
    ]
)

In [137]:
# Predictors: the three selected features. Target: disease progression.
X = diabetes_DF.loc[:, ['bmi', 's5', 'bp']]
y = diabetes_DF.loc[:, 'progression']

In [138]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [139]:
# Fit the preprocessing + regression pipeline on the training split only.
model.fit(X_train, y_train)

In [140]:
# Take the first test row as a one-row DataFrame, so its column names are kept.
data_point = X_test.head(1)
data_point

Unnamed: 0,bmi,s5,bp
287,25.8,4.9972,90.0


In [141]:
# Predict the progression for the single selected data point; the result is a
# one-element array.
y_pred = model.predict(data_point)
print(f'The predicted progression of the disease after 1 year of the datapoint is: {y_pred}')

The predicted progression of the disease after 1 year of the datapoint is: [159.05042578]


## Report score/accuracy in at least 2 different formats

In [142]:
# Re-use `y_pred` for predictions on the whole test split; the metrics below
# compare it against `y_test`.
y_pred = model.predict(X_test)

In [143]:
from sklearn.metrics import mean_squared_error

# Average squared difference between the true and predicted progression values.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error: ", mse)

Mean Squared Error:  2891.0372112919654


In [144]:
# For a pipeline ending in LinearRegression, `score` is the coefficient of
# determination (R^2) on the given data.
score = model.score(X_test, y_test)
print(f'The R-Squared score is: {score}')

The R-Squared score is: 0.45433099153843415


**Everything below repeats the analysis on the other dataset: scikit-learn's built-in diabetes data (`load_diabetes`).**

In [None]:
from sklearn.datasets import load_diabetes

# Load scikit-learn's bundled diabetes dataset; the cells below read its
# `.data`, `.target`, `.feature_names` and `.DESCR` attributes.
diabetes_data = load_diabetes()

Let's take a look at the dataset's description.

In [None]:
# DESCR is the description text shipped with the dataset.
diabetes_desc = diabetes_data.DESCR
print(diabetes_desc)

Let's convert the data into a dataframe and read out the first five rows within the data

In [None]:
# transfer the data into a separate dataframe
diabetes_DF = pd.DataFrame(diabetes_data.data, columns=diabetes_data.feature_names)

#append the dependent variable to the dataframe
diabetes_DF['progression'] = diabetes_data.target

To get a quick look at the data, let's display the first five rows!

In [None]:
# First five rows (the default for `head`).
diabetes_DF.head()

## Pre-process necessary features

From the looks of it, scikit-learn has already applied some pre-processing to this dataset. According to the scikit-learn documentation, each feature has been mean-centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).

Aside from that, perhaps we can check if there are any missing values and if so, remove them.

In [None]:
# Missing values per column (`isnull` is the same check as `isna`).
diabetes_DF.isnull().sum()

## Find top 3 predictive features according to 3 different methods of measuring predictiveness

### Method #1: Feature importance

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import RidgeCV

# First ten columns are the features; 'progression' is the target.
X = diabetes_DF.iloc[:, 0:10]
y = diabetes_DF['progression']

# Ridge regression with the penalty strength picked by cross-validation from
# five log-spaced candidates between 1e-6 and 1e6.
alpha_grid = np.logspace(-6, 6, num=5)
ridge = RidgeCV(alphas=alpha_grid)
ridge.fit(X, y)

# The absolute coefficient size serves as the importance measure.
importance = np.abs(ridge.coef_)
feature_names = np.array(diabetes_data.feature_names)
plt.bar(x=feature_names, height=importance)
plt.title("Feature importances via coefficients")
plt.show()

### Method #2: Correlation matrix

In [None]:
# Pairwise correlations between all columns, including the target.
correlation_matrix = diabetes_DF.corr()

# Annotated heatmap of the correlation matrix.
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)

### Method #3: Univariate feature selection

In [None]:
# `f_regression` is imported here as well: the original cell only imported
# `SelectKBest` and relied on an earlier cell having brought `f_regression`
# into scope.
from sklearn.feature_selection import SelectKBest, f_regression

X = diabetes_DF.iloc[:, 0:10]
y = diabetes_DF['progression']

selector = SelectKBest(score_func=f_regression, k=5)  # keep the 5 best-scoring features
X_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support(indices=True)]

# `get_support` lists the kept columns in their original column order, which
# does not show which of them scored best. Rank every feature by its F-score so
# the top three can be read off directly.
f_scores = pd.Series(selector.scores_, index=X.columns).sort_values(ascending=False)

# Print selected features from univariate selection
print("Selected Features from Univariate Selection:")
print(selected_features)
print("F-scores (best first):")
print(f_scores)

### Method #4: Mutual Information Scores

In [None]:
from sklearn.feature_selection import mutual_info_regression

# The estimator adds a small amount of random noise to continuous features, so
# the scores (and potentially the ranking) change from run to run unless the
# seed is fixed. 42 matches the seed used for the train/test split.
mi_scores = mutual_info_regression(X, y, random_state=42)
mi_scores = pd.Series(mi_scores, index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)

# Print mutual information scores
print("Mutual Information Scores:")
print(mi_scores)

Now that we have tried various methods for determining the best features, let's take the top three from each one of them and then determine which three appear the most!

Method 1: s1, s5, bmi<br>
Method 2: bmi, s5, bp<br>
Method 3: bmi, s5, bp<br>
Method 4: bmi, s5, s6

From this, we can conclude that the top 3 features are: **bmi**, **s5** and **bp**

## Design 2 new features

Now, let's design **two** new features for our dataset!

In [None]:
# Read the tab-separated file again, keeping its original upper-case headers.
df = pd.read_csv('diabetes.tsv', delimiter='\t')

df.head()

In [None]:
# Bucket blood pressure into four ordered classes. pd.cut's default bins are
# right-closed: (0, 80], (80, 90], (90, 120] and (120, 1000].
df['BP_CATEGORY'] = pd.cut(
    x=df['BP'],
    bins=[0, 80, 90, 120, 1000],
    labels=['Normal', 'Elevated', 'High', 'Hypertensive Crisis'],
)

In [None]:
# BMI category. The standard adult BMI classes are lower-inclusive: 18.5 is
# already 'Normal Weight', 25 is 'Overweight' and 30 is 'Obese'. `right=False`
# makes the bins left-closed ([18.5, 25), [25, 30), ...) so values that fall
# exactly on a threshold land in the correct class instead of the one below.
df['BMI_CATEGORY'] = pd.cut(df['BMI'], bins=[0, 18.5, 25, 30, 100],
                            labels=['Underweight', 'Normal Weight', 'Overweight', 'Obese'],
                            right=False)


In [None]:
# Show the first 30 rows to inspect the two new category columns.
df.head(30)

In [None]:
# FIXME(review): `X_original` is not defined in any visible cell of this
# notebook, so evaluating it raises NameError on Restart & Run All. The
# expression is left disabled until the intended frame is identified.
# X_original

## Predict how disease progresses 1 year from datapoint

Now, let's split our data into X and y variables. We will use **progression** as the dependent y variable and the **three selected features** as the independent X variables.

In [None]:
# converting the values to a numpy array (2D)
diabetes_arr = diabetes_DF.values

# X = diabetes_arr[:, 0:10]
# y = diabetes_arr[:, 10]

X = diabetes_DF[['bmi', 's5', 'bp']]
y = diabetes_DF['progression']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer

# Single-step pipeline: a plain linear regression with no preprocessing step.
# The step is a regressor even though it is labelled 'classifier'.
pipeline = Pipeline(steps=[('classifier', LinearRegression())])

pipeline.fit(X_train, y_train)

Let's pick a data point to run our regression model on. For example, let's take the first row of the test set.

In [None]:
# Keep the first test row as a one-row DataFrame (a slice instead of
# `.values.reshape(1, -1)`). The pipeline was fitted on a DataFrame, and passing
# a bare NumPy array makes scikit-learn warn that feature names are missing.
data_point = X_test.iloc[0:1]

In [None]:
# Predict the progression for the single selected data point; the result is a
# one-element array.
y_pred = pipeline.predict(data_point)
print(f'The predicted progression of the disease after 1 year of the datapoint is: {y_pred}')

## Report score/accuracy in at least 2 different formats

### Mean squared error (MSE)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict for the whole test split before scoring. The `y_pred` left over from
# the single-data-point cell holds one value, and comparing it with the full
# `y_test` makes `mean_squared_error` raise a ValueError (inconsistent numbers
# of samples).
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)

### R-Squared (R2 / coefficient of determination)

In [None]:
# For a pipeline ending in LinearRegression, `score` is the coefficient of
# determination (R^2) on the given data.
score = pipeline.score(X_test, y_test)
print(f'The R-Squared score is: {score}')