# Machine Learning Assignment

**Dataset**:       DIABETES


## Imports

Add imports here as needed.

Remember to **re-run the cell whenever you add imports**, so that the new modules are loaded into the notebook's kernel!

In [1]:
# Data and Datasets
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler

# Clustering
from sklearn.cluster import DBSCAN

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature selection & feature engineering
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.tree import export_graphviz

# Utils
import pprint
import numpy as np
from time import time
import openpyxl

## Loading the dataset

In [2]:
# Load the raw diabetes dataset.
# pandas is already imported as `pd` in the imports cell at the top of the notebook,
# so it is not re-imported here.
df = pd.read_csv('data/diabetes_dataset_100k.csv')

# Summarise columns, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called bare: wrapping it in print() appends a stray "None" line to the output.
df.info()

# Preview the first rows (the last expression of a cell is rendered as its output)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   year                 99090 non-null   float64
 1   gender               99097 non-null   object 
 2   age                  99095 non-null   float64
 3   location             99096 non-null   object 
 4   race                 99099 non-null   float64
 5   hypertension         99085 non-null   float64
 6   heart_disease        99102 non-null   float64
 7   smoking_history      99096 non-null   object 
 8   bmi                  99159 non-null   float64
 9   hbA1c_level          79027 non-null   float64
 10  blood_glucose_level  69458 non-null   float64
 11  diabetes             100000 non-null  int64  
dtypes: float64(8), int64(1), object(3)
memory usage: 9.2+ MB
None


Unnamed: 0,year,gender,age,location,race,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020.0,Female,32.0,Alabama,5.0,0.0,0.0,never,27.32,,100.0,0
1,2015.0,Female,29.0,Alabama,2.0,0.0,0.0,never,19.95,,90.0,0
2,2015.0,Male,18.0,Alabama,5.0,0.0,0.0,never,23.76,4.8,160.0,0
3,2015.0,Male,41.0,Alabama,3.0,0.0,0.0,never,27.32,4.0,,0
4,2016.0,Female,52.0,Alabama,1.0,0.0,0.0,never,23.75,6.5,,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,year,age,race,hypertension,heart_disease,bmi,hbA1c_level,blood_glucose_level,diabetes
count,99090.0,99095.0,99099.0,99085.0,99102.0,99159.0,79027.0,69458.0,100000.0
mean,2018.360067,41.885719,2.995146,0.074734,0.039464,25.936491,5.527778,138.019868,0.085
std,1.345802,22.518101,1.416911,0.262963,0.194698,8.811963,1.069702,40.60844,0.278883
min,2015.0,0.08,1.0,0.0,0.0,0.0,3.5,80.0,0.0
25%,2019.0,24.0,2.0,0.0,0.0,22.63,4.8,100.0,0.0
50%,2019.0,43.0,3.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,2019.0,60.0,4.0,0.0,0.0,29.22,6.2,159.0,0.0
max,2022.0,80.0,5.0,1.0,1.0,95.69,9.0,300.0,1.0


## Exploratory data analysis

Section for exploratory data analysis, to address **Tasks 1.1 and 1.2**.

**Note:** You may need to do some data cleaning before your full exploratory data analysis, although some of the functions covered in this unit can handle missing values and non-numeric data themselves.

**Create more cells as needed!**

## Task 2.1 - ML Workflow to Critically Evaluate

In [4]:
# Baseline pre-processing for the workflow under evaluation (behaviour intentionally left as given).

# Keep only complete rows: any row containing at least one missing value is discarded.
# NOTE(review): the df.info() output above shows hbA1c_level and blood_glucose_level are
# missing for roughly 21% and 31% of the rows, so this removes a large share of the data.
df = df.dropna()

# Integer-encode every non-numeric (object) feature, one column at a time:
# cast the column to 'category', then replace each category by its integer code.
# NOTE(review): the resulting codes are category indices, not a meaningful numeric scale --
# worth considering when these columns are fed to a linear model.
cat_columns = df.select_dtypes(include=['object']).columns
for column_name in cat_columns:
    df[column_name] = df[column_name].astype('category').cat.codes

In [5]:
# Baseline hold-out experiment for the workflow under evaluation (behaviour intentionally left as given).

# Feature matrix (X) = every column except the last; label vector (y) = the last column
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold-out validation: 80% of the rows for training, 20% for testing, fixed seed.
# NOTE(review): stratify=None means the class ratio is not preserved across the two splits;
# df.describe() reports a mean of 0.085 for `diabetes`, i.e. the positive class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=None
)

# Create a Logistic Regression model with default settings and train it (fit returns the estimator).
# NOTE(review): the recorded output of this cell shows the solver stopping at its iteration
# limit, with scikit-learn suggesting a higher max_iter or scaled features.
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

# Report the confusion matrix and the test-set accuracy.
# NOTE(review): with a rare positive class, accuracy alone can look high even when many
# positives are missed -- read it together with the confusion matrix.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(conf_matrix)
print("\nAccuracy (Testing):  %0.2f " % test_accuracy)

[[10025   114]
 [  347   577]]

Accuracy (Testing):  0.96 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Task 2.3 - Evaluation of Improved ML Workflow

Add code for running your **improved** machine learning experiments below.


In [6]:
# Loading the dataset again, for you to do your own pre-processing (instead of what was done above).
# Re-reading the CSV replaces `df`, which the Task 2.1 cells overwrote with the
# row-dropped, integer-encoded version of the data.
df = pd.read_csv('data/diabetes_dataset_100k.csv')