<a href="https://colab.research.google.com/github/vbosstech/disease-diagnostic-from-symptoms/blob/master/heart_disease_risk_assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Heart Disease using UCI-repository Data-Set

## In lieu of using the available processed data with 14 attributes in CSV format, I am starting with the complete dataset of 76 attributes. 

   ### Task 1: Put data in dataframe and remove irrelevant columns. 
   ### Task 2: Feature Selection/Addressing missing values
   ### Task 3: Explore data - see which features are relevant. 
   ### Task 4: Check for imbalanced class set.
   ### Task 5: Exploring models 
   ### Task 6: Hyperparameter optimization - fine tuning the selected model.
   ### Task 7: Test the selected model
   ### Task 8: Assess the selected model

In [0]:
from google.colab import drive
# drive.mount('/content/gdrive')
drive.mount("/content/gdrive", force_remount=True)

### Task 1. Getting data together

In [0]:
import io
import requests
import pandas as pd
import numpy as np 
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/hungarian.data'

df = pd.read_csv(url)
df.head(10) #This is not in the right format!

In [0]:
# Download the raw Hungarian file and fail loudly on an HTTP error instead of
# silently parsing an error page.
r = requests.get(url)
r.raise_for_status()

# Flatten every newline to a space, then start a new line after each " name "
# token (presumably the end-of-record marker in this file - confirm against
# the dataset documentation) so that one record becomes one row.
data = r.text.replace('\n', ' ').replace(' name ', ' name\n')

# Raw string: '\s' is not a valid escape sequence in a normal string literal.
hungary = pd.read_table(io.StringIO(data), sep=r'\s+', header=None)
hungary.head()

In [0]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/long-beach-va.data'
# Fail loudly on an HTTP error instead of silently parsing an error page.
r = requests.get(url)
r.raise_for_status()

# Flatten every newline to a space, then start a new line after each " name "
# token so that one record becomes one row.
data1 = r.text.replace('\n', ' ').replace(' name ', ' name\n')

# Raw string: '\s' is not a valid escape sequence in a normal string literal.
lb = pd.read_table(io.StringIO(data1), sep=r'\s+', header=None)

In [0]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/switzerland.data'
# Fail loudly on an HTTP error instead of silently parsing an error page.
r = requests.get(url)
r.raise_for_status()

# Flatten every newline to a space, then start a new line after each " name "
# token so that one record becomes one row.
data2 = r.text.replace('\n', ' ').replace(' name ', ' name\n')

# Raw string: '\s' is not a valid escape sequence in a normal string literal.
switz = pd.read_table(io.StringIO(data2), sep=r'\s+', header=None)

In [0]:
# Merge all 3 dataframes; the columns are labelled in a later cell.
frames = [hungary, lb, switz]
# ignore_index=True gives every row a unique label; without it each source
# frame's own 0..n-1 index is repeated in the combined frame.
df = pd.concat(frames, ignore_index=True)
df.head(5)

## Task 2. Feature Selection

In [0]:
# Positions of the raw columns that are kept. Per the original note, the
# columns eliminated for now are blank ones (there are a lot of them).
col_list = [2,3,8,9,11,14,15,16,17,18,31,32,33,34,37,39,40,43,50,57]

hungary = hungary[col_list]
lb = lb[col_list]
switz = switz[col_list]

frames = [hungary, lb, switz]
# ignore_index=True gives every row a unique label; without it each source
# frame's own 0..n-1 index is repeated, so label-based lookups and alignment
# on `df` would match several rows at once.
df = pd.concat(frames, ignore_index=True)

In [0]:
df.columns=['age','sex','cp_type','rest_bp','chol','yrs_smoke','fbs','hist_dm','hist_cad','rest_ecg','max_hr','rest_hr',
                'ex_bp1','ex_bp2','exang','oldpeak','slope','ca','thal','outcome']
#These are the new column labels

In [0]:
df.head(5) 

#### Looking for Null values or missing values

In [0]:
df.info() #So far everything is not null, but I'll come back to this as some of the negative numbers above are concerning

In [0]:
max_hr_neg = df[df.max_hr <= 0] 
print(max_hr_neg) 
#Nothing is null so maybe some are negative? #Yup, there are a lot of negative values which don't make
#sense in this context as you can't have a negative heart rate or bp. I will explore/address this later on.

## Task 3. Exploring the data

#### Are age and maximum heart rate related to heart disease?

In [0]:
# Two stacked scatter panels: age vs outcome on top, max heart rate vs outcome below.
fig, (ax1, ax3) = plt.subplots(2, 1, figsize=(6,4))

ax1.scatter(df.age, df.outcome)
ax1.set_xlabel('age')
ax1.set_ylabel('outcome')
ax1.set_title('Does Age predict outcome?')

ax3.scatter(df.max_hr, df.outcome)
ax3.set_xlabel('max HR')
ax3.set_ylabel('outcome')
ax3.set_title('Does max_HR relate to outcome?')

fig.tight_layout()

In [0]:
# Hard to see much with the traditional scatter plot, so add some jitter.
import seaborn as sns

# (column, x-axis label, panel title) for each panel of the 3x3 grid, in
# plotting order.
# NOTE(review): 'ca' appears twice (panels 5 and 9), as in the original cell.
strip_panels = [
    ('yrs_smoke', 'yrs_smoke', 'Yrs_smoke vs outcome'),
    ('hist_dm', 'hist_dm', 'History of diabetes v outcome'),
    ('hist_cad', 'hist_cad', 'History of heart disease v outcome'),
    ('slope', 'slope', 'Slope of peak exercise v outcome'),
    ('ca', 'ca', 'Malformed arteries v outcome'),
    ('thal', 'thal', 'Blood disorders v outcome'),
    ('sex', 'sex', 'Sex v outcome'),
    ('cp_type', 'CP_type', 'CP_type v outcome'),
    ('ca', 'Ca', 'Ca v outcome'),
]

fig = plt.figure(figsize=(20,16))
for position, (column, x_label, panel_title) in enumerate(strip_panels, start=1):
    ax = fig.add_subplot(3, 3, position)
    # x/y must be passed by keyword: recent seaborn releases accept only
    # `data` positionally.
    sns.stripplot(x=df[column], y=df.outcome, jitter=True, ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel('outcome')
    ax.set_title(panel_title)

# One layout pass once every panel exists.
fig.tight_layout()

In [0]:
#Plotting one-by-one seems tedious. Lets try something different -
# from pandas.tools.plotting import scatter_matrix
from pandas.plotting import scatter_matrix
plot = scatter_matrix(df, figsize=(40, 40))

In [0]:
# Get a feel for the negative values and for what max_hr looks like once the
# negative values are excluded.

fig = plt.figure(figsize=(6,4))

# Rows with a non-negative max_hr, used only for the last panel. Long term we
# don't want to eliminate whole rows for negative values, just replace them with NaN.
max_hr_pos = df[df.max_hr >= 0]

# (values, panel title, label of the measured quantity)
hist_panels = [
    (df.max_hr, 'Max HR - frequency distribution', 'HR (beats/min)'),    # still need to find and drop the 0s
    (df.rest_hr, 'Rest HR - frequency distribution', 'HR (beats/min)'),  # still need to find and drop the 0s
    (df.age, 'Age - frequency distribution', 'age'),
    (max_hr_pos.max_hr, 'New Histogram of max_HR', 'HR (beats/min)'),
]

for position, (values, panel_title, value_label) in enumerate(hist_panels, start=1):
    ax = fig.add_subplot(2, 2, position)
    ax.hist(values)
    ax.set_title(panel_title)
    # In a histogram the measured quantity is on the x-axis; the y-axis is the count.
    ax.set_xlabel(value_label)
    ax.set_ylabel('count')

fig.tight_layout()

# Something funny is going on with the HRs - why are there negative numbers? Going to explore this


In [0]:
rest_hr_pos = df[df.rest_hr >= 0] #Again just filtering out the negative ones for graphing purposes

In [0]:
# Divide the data by outcome group to see whether there are obvious
# differences in these features between the groups.
chd_pos = df[df.outcome >= 1]
chd_neg = df[df.outcome == 0]

# Outcome values and the label shown for each one on the y-axis.
outcome_levels = [0, 1, 2, 3, 4]
outcome_labels = ['Heart Disease NO', 'Heart Disease 1', 'Heart Disease 2',
                  'Heart Disease 3', 'Heart Disease 4']

# Looking at cholesterol and max HR first: (column, panel title).
group_panels = [
    ('chol', 'Spread of Cholesterol levels in those with and without heart disease'),
    ('max_hr', 'Spread of Max Heart Rate in those with and without heart disease'),
]

fig = plt.figure(figsize =(12,8))
for position, (column, panel_title) in enumerate(group_panels, start=1):
    ax = fig.add_subplot(2, 2, position)
    ax.scatter(chd_pos[column], chd_pos.outcome)
    # `color`, not `cmap`: a colormap has no effect unless per-point values are given.
    ax.scatter(chd_neg[column], chd_neg['outcome'], color='gray')
    ax.set_ylim(-.2, 4.2)
    ax.set_title(panel_title)
    # Fix the tick positions before labelling them, so each label sits on the
    # outcome value it names instead of on whatever ticks matplotlib picked.
    ax.set_yticks(outcome_levels)
    ax.set_yticklabels(outcome_labels)

fig.tight_layout()

In [0]:
#Now I'm exploring some of those columns that seemed to have all negative #s 
#like yrs_smoke, hist_dm, hist_cad, slope, ca, and thal
#I'm going to explore some of this columns with -9s to see if they take up the entire column. 

## Task 2 - revisited) Looking for Null values or missing values 
### Now i've determined there are no null values but there are a lot of "-9"s in lieu of null values. 

In [0]:
#I'm going to try a few different ways of feature/selection missing values to see what impact it has on the data

## option 1 - regular data, -9s and 0s present, all data included 
## option 2 - thresholded 50%; discrete variables: change to ohe coded, impute continuous variables
## option 3 - thresholded 50; discrete variables: keep as label coded, impute continuous variables
## option 4 - normalized data - some classifiers like regression and svm require normalized data

#### Option 1:  Regular data - 0s and -9s included instead of NaNs

In [0]:
features = df.iloc[:, :-1]
target = df.outcome

##### Option 2: thresholded at 50%; discrete variables: changed to OHE coding; continuous variables imputed

In [0]:
# Non-categorical features: both -9s and 0s are treated as missing and turned into NaNs.
collist1=['age','rest_bp','chol','yrs_smoke','max_hr','rest_hr','ex_bp1','ex_bp2']
# Mostly categorical features (oldpeak is handled as continuous later on):
# only -9s are turned into NaNs, so 0 is kept as a real value for these columns.
collist2 = ['sex','cp_type','fbs','hist_dm','hist_cad','rest_ecg','exang','slope','ca','thal','oldpeak']

In [0]:
# Keep strictly positive entries; every -9 and 0 becomes NaN.
df_notcat = df[collist1]
df_notcat = df_notcat.where(df_notcat > 0)

In [0]:
# Keep entries above -9; every -9 becomes NaN while 0 stays a value.
df_cat = df[collist2]
df_cat = df_cat.where(df_cat > -9)

In [0]:
frames = [df_notcat, df_cat]

In [0]:
df_hasnull = pd.concat(frames, axis=1)
df_hasnull.head()

In [0]:
print (df_hasnull.isnull().sum())
print (len(df_hasnull)) 

In [0]:
# Exclude columns with more than 50% missing values: even if we imputed
# them, it wouldn't buy us much.
# The threshold is derived from the row count instead of a hard-coded number
# (the original used 308, presumably half of the row count printed above),
# so it stays correct if the input data changes.
min_non_null = len(df_hasnull) // 2
df_hasnull = df_hasnull.dropna(axis=1, thresh=min_non_null)
df_hasnull.info()

In [0]:
cont_col_missing = ['rest_bp','chol','max_hr','rest_hr','ex_bp1','ex_bp2','oldpeak']

In [0]:
#I will now go through the columns of missing values and see if there is any relation between the column with missing values
# and any other columns 
def plot_features(x, title):
    """Scatter every column listed in ``cont_col_missing`` against column ``x``.

    Reads the notebook-level ``df_hasnull`` frame and ``cont_col_missing``
    list, and draws one panel per listed column on a new 7x2 figure.

    Parameters
    ----------
    x : str
        Name of the ``df_hasnull`` column plotted on the y-axis of every panel.
    title : str
        %-format string with two ``%s`` slots, filled with
        ``(panel column, x)`` to build each panel title.
    """
    plt.figure(figsize=(10,14))
    for position, col in enumerate(cont_col_missing, start=1):
        plt.subplot(7, 2, position)
        plt.scatter(df_hasnull[col], df_hasnull[x])
        plt.title(title % (col, x))
    # Lay the figure out once, after all panels exist, rather than per panel.
    plt.tight_layout()

In [0]:
for x in cont_col_missing:
    plot_features(x, 'Relationship of %s and %s')
#columns with missing valuesplot_features('rest_bp', 'Relationship of %s and rest bp')

In [0]:
# Looking for possible linear relationships from graphs above

# !pip3 install --upgrade -q statsmodels

# import statsmodels.formula.api as smf
# import statsmodels.api as sm

## TODO
# from pandas.stats.api import ols
# res = ols(y=df_hasnull.ex_bp2, x=df_hasnull.ex_bp1).fit()
# res

In [0]:
## TODO
# from pandas.stats.api import ols
# res = ols(y=df_hasnull.rest_hr, x=df_hasnull.max_hr)
# res

In [0]:
cat_feat = ['sex','cp_type','fbs', 'rest_ecg', 'exang', 'slope']

In [0]:
# Mean-impute every column that is not listed in cat_feat.
# Assign the filled column back explicitly: an inplace fillna on a
# `.loc[:, col]` selection is chained assignment, which may only modify a
# temporary object and leave df_hasnull unchanged.
for col in list(df_hasnull.columns):
    if col not in cat_feat:
        df_hasnull[col] = df_hasnull[col].fillna(df_hasnull[col].mean())

In [0]:
df_hasnull_ohe = df_hasnull.copy()

In [0]:
# Zero-fill only the columns that are NOT one-hot encoded. Filling the
# categorical columns with 0 first would merge missing values into the real
# category 0 and leave the dummy_na indicator columns constant; leaving
# their NaNs in place lets get_dummies(dummy_na=True) flag them in the
# `<col>_nan` columns.
non_cat_cols = [col for col in df_hasnull_ohe.columns if col not in cat_feat]
df_ohe = df_hasnull_ohe.copy()
df_ohe[non_cat_cols] = df_ohe[non_cat_cols].fillna(0)
df_ohe = pd.get_dummies(df_ohe, columns=cat_feat, dummy_na=True)
df_ohe.head()

In [0]:
features_ohe = df_ohe

##### Option 3: thresholded at 50%; discrete variables: label coding; continuous variables imputed

In [0]:
#Typically, random forest methods and other methods encourage two ways of handling missing values if we don't know
#anything about the data: The first step is to look for relationships with the data - can one variable predict another? 
#a) drop data points with missing values (not recommended); 
#b) fill in missing values with the median (for numerical values) or mode (for categorical values). 

In [0]:
df_hasnull_label = df_hasnull.copy() #Already thresholded and continous variables have been imputed

In [0]:
cat_col_missing = ['fbs','rest_ecg','exang','slope'] #list of the categorical features with missing values 

In [0]:
# Mode-impute the categorical columns listed in cat_col_missing.
# Assign the filled column back explicitly: an inplace fillna on a
# `.loc[:, col]` selection is chained assignment, which may only modify a
# temporary object and leave df_hasnull_label unchanged.
for col in cat_col_missing:
    most_frequent = df_hasnull_label[col].mode().iloc[0]
    df_hasnull_label[col] = df_hasnull_label[col].fillna(most_frequent)

In [0]:
features_label = df_hasnull_label

##### Option 4: normalized data - some classifiers like regression and svm require normalized data

In [0]:
collist1=['age','rest_bp','chol','max_hr','rest_hr',
                'ex_bp1','ex_bp2','oldpeak']
df_norm1 = df_hasnull_label[collist1]

In [0]:
collist2 = ['sex','cp_type','fbs','rest_ecg','exang','slope']
df_categorical = df_hasnull_label[collist2]

In [0]:
df_norm2 = (df_norm1 - df_norm1.mean()) /  (df_norm1.std())
df_norm2.head()

In [0]:
df_categorical.head()

In [0]:
normdata = [df_categorical, df_norm2]

In [0]:
df_norm = pd.concat(normdata, axis=1)
df_norm.head()

In [0]:
features_norm = df_norm

## Further exploration - what features are important for distinguishing heart disease from no heart disease?

In [0]:
#I'm just going to turn this into a binary problem where 0 is no heart disease and 1 means 
# the patient developed heart disease (irrespective of the type of heart disease)

In [0]:
target_binary = target.replace(to_replace=[2,3,4], value=1,)
target_binary.value_counts()

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [0]:
# Goal here is to find the important features that distinguish heart disease
# from no heart disease; accuracy matters less at this point.
dtree = DecisionTreeClassifier(max_depth=3)
log = LogisticRegression(class_weight='balanced')
# sklearn.cross_validation was removed from scikit-learn; the same helpers
# live in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [0]:
X = features_ohe
y = target_binary
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)

In [0]:
clf = dtree.fit(X_train, y_train)
cross_val_score(clf, X_train, y_train, cv=5)

In [0]:
clf.feature_importances_

In [0]:
feature_cols = features_ohe.columns

In [0]:
len(X_train)

In [0]:
from sklearn.tree import export_graphviz
# export_graphviz writes text, so the file must be opened in text mode
# ('wb' raises a TypeError on Python 3). The return value is not needed when
# out_file is given, so it is no longer bound to `f`.
# NOTE(review): the output path is specific to one Google Drive layout.
with open("/content/gdrive/My Drive/machine-learning/disease-diagnostic-from-symptoms/dot-files/tree.dot", 'w') as f:
    export_graphviz(clf, out_file=f, feature_names=list(feature_cols))

<img src="tree2.png">

## Task 4. Check for imbalanced class set.

In [0]:
df.outcome.value_counts(normalize=True)

## Task 5. Exploring models 

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# sklearn.cross_validation was removed from scikit-learn; the same helpers
# live in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (auc, roc_curve, roc_auc_score,
                             accuracy_score, precision_score,
                             recall_score, f1_score, )
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

In [0]:
logistic = LogisticRegression(C=1, multi_class='ovr', solver='lbfgs', class_weight='balanced')
treeclf = DecisionTreeClassifier(class_weight='balanced')
forest = RandomForestClassifier(class_weight='balanced')
svc = SVC(class_weight='balanced')
svc_lin = SVC(class_weight='balanced', kernel='linear')
knn = KNeighborsClassifier()

In [0]:
X = features
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)
#This is using option 1 above - all data left in, no imputation

In [0]:
def classifier_metrics_accuracy(clf, title):
    """Fit ``clf`` on a training split and print two accuracy figures.

    Uses the notebook-level ``X`` and ``y``, split 80/20 with
    ``random_state=1``. Prints the accuracy of the fitted classifier on the
    training split itself, then the mean 5-fold cross-validated accuracy on
    that same training split.

    Parameters
    ----------
    clf : estimator
        Classifier to fit; it is fitted in place.
    title : str
        Accepted for symmetry with ``classifier_metrics_roc``; not used here.
    """
    report_line = '%30s: %s'

    split = train_test_split(X, y, test_size=.2, random_state=1)
    X_train, y_train = split[0], split[2]

    clf.fit(X_train, y_train)

    train_accuracy = clf.score(X_train, y_train)
    print (report_line % ('Default score (accuracy)', train_accuracy) )

    cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print (report_line % ('Cross val score', cv_scores.mean()) )

In [0]:
def classifier_metrics_roc(clf, title):
    """Print per-class cross-validated AUC, precision, recall and F1 for ``clf``.

    The notebook-level labels ``y`` are one-hot encoded over the classes
    0..4 and split 80/20 together with the notebook-level ``X``
    (``random_state=1``). For each class column, a one-vs-rest wrapper around
    ``clf`` is scored with 5-fold cross-validation on the training split.

    Parameters
    ----------
    clf : estimator
        Base classifier wrapped in ``OneVsRestClassifier``.
    title : str
        Heading printed before the metrics.
    """
    # Use the same notebook-level `y` as classifier_metrics_accuracy, so both
    # helpers always score the same labels against `X`.
    y1 = label_binarize(y, classes=[0, 1, 2, 3, 4])
    n_classes = y1.shape[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=.2, random_state=1)

    # No upfront fit: cross_val_score clones and refits the estimator on every
    # fold, so a fit here would be thrown away.
    clf1 = OneVsRestClassifier(clf)

    # (printed name, scikit-learn scoring string), in print order.
    metrics = (
        ('AUC', 'roc_auc'),
        ('Precision', 'precision_weighted'),
        ('Recall', 'recall_weighted'),
        ('F1', 'f1_weighted'),
    )

    print (title)
    for i in range(n_classes):
        for metric_name, scoring in metrics:
            mean_score = cross_val_score(clf1, X_train, y_train[:, i], cv=5, scoring=scoring).mean()
            print ( '%s for class %s is %s:' % (metric_name, i, mean_score) )
        print ( '\n' )
        

In [0]:
for clf in [logistic, treeclf, forest, svc, knn, svc_lin]:
    title = 'Data includes missing values - no imputation' 
    print ('Metrics for %s' % clf)
    print ('=' * 50)
    classifier_metrics_roc(clf, title)
    classifier_metrics_accuracy(clf, title)
    print ('\n')

In [0]:
model = DecisionTreeClassifier()

In [0]:
maxdepth = []
for i in range(1,20):
    model = DecisionTreeClassifier(max_depth=i)
    model.fit(X_train, y_train)
    maxdepth.append(model.score(X_train,y_train))

In [0]:
fig = plt.figure(figsize=(6,4))
plt.plot(range(1,20),maxdepth)
plt.title('Overfitting of decision tree')
plt.xlabel('max depth')
plt.ylabel('accuracy score')
plt.show()

In [0]:
X = features_ohe
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)
#This is using option 2 above - ohe coding, continuous data imputed

In [0]:
for clf in [logistic, treeclf, forest, svc, knn]:
    title = 'OHE labeled, continuous data imputed' 
    print ('Metrics for %s' % clf)
    print ('=' * 50)
    classifier_metrics_accuracy(clf, title)
    print ('\n')
    classifier_metrics_roc(clf, title)
    print ('\n')

In [0]:
X = features_label
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)
#This is using option 3 above - label coding, continuous data imputed

In [0]:
for clf in [logistic, treeclf, forest, svc, knn]:
    title = 'Label coded, continuous data imputed' 
    print ('Metrics for %s' % clf)
    print ('=' * 50)
    classifier_metrics_roc(clf, title)
    classifier_metrics_accuracy(clf, title)
    print ('\n')

In [0]:
X = features_norm
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)
#This is using option 4 above - normalized continous data, imputed discrete data

In [0]:
for clf in [logistic, treeclf, forest, svc, knn]:
    title = 'Normalized data' 
    print ('Metrics for %s' % clf)
    print ('=' * 50)
    classifier_metrics_roc(clf, title)
    classifier_metrics_accuracy(clf, title)
    print ('\n')

In [0]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
from scipy import interp
from itertools import cycle

### *Moving forward with OHE labeled data and RF model*

## Task 6. Hyperparameter optimization

In [0]:
## Tuning selected model

In [0]:
X = features_ohe
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)

In [0]:
# Search space for the random forest. 'auto' is dropped from max_features:
# for a classifier it meant the same as 'sqrt', and current scikit-learn
# rejects it.
PARAMETERS = {'max_features':['sqrt',0.2], 'max_leaf_nodes':[None,5,6,7,8,9,10,50],
              'min_samples_leaf':[1,2,4,50], 'criterion':['gini','entropy'], 'n_estimators':[6,8,10,20,25]}
SCORING = 'accuracy'

# sklearn.grid_search was removed from scikit-learn; GridSearchCV lives in
# sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

#Grid Search
model = RandomForestClassifier(class_weight='balanced')
# cv is stated explicitly because the default fold count differs between
# scikit-learn versions; 5 matches the cross_val_score calls in this notebook.
clf = GridSearchCV(model, PARAMETERS, scoring=SCORING, cv=5, verbose=True)
clf.fit(X_train, y_train)

#After completion, show the final best results and scores
print (clf.best_estimator_)
print (clf.best_score_)

## Task 7. Test the selected model

In [0]:
rf_model = RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [0]:
X = features_ohe
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)

In [0]:
clf = rf_model.fit(X_train, y_train)
y_eval = clf.predict(X_test)

In [0]:
clf.feature_importances_

In [0]:
features_ohe.columns

In [0]:
y1 = label_binarize(y_test, classes=[0, 1, 2, 3, 4])
n_classes = y1.shape[1]

In [0]:
y2 = label_binarize(y_eval, classes=[0, 1, 2, 3, 4])

## Task 8. Assessing Model: 

#### Accuracy

In [0]:
print ("Overall model accuracy is %s" % (accuracy_score(y_test, y_eval)) )

In [0]:
for i in range(n_classes):
    print ("Class %s" % i )
    print (accuracy_score(y1[:, i], y2[:, i]) )
    print ('\n' )

#### Classification Report

In [0]:
from sklearn.metrics import classification_report

In [0]:
for i in range(n_classes):
    print ("Class %s" % i )
    print (classification_report(y1[:, i], y2[:, i], labels=[1, 0]) )

In [0]:
print (f1_score(y_test, y_eval, average=None) )

In [0]:
# NOTE(review): these per-class F1 values are pasted literals (presumably
# copied from an earlier run) - they are not recomputed from the current model.
f1_test = [0.76470588, 0.11764706, 0.34042553, 0.4, 0.13333333]
f1_train = [.7745, .7101, .7842, .7956, .9206]

# One x position per class. Kept in its own name so the notebook-level
# target variable `y` is not overwritten by plotting coordinates.
class_ids = [0, 1, 2, 3, 4]

fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)

ax.scatter(class_ids, f1_test, color='red', label='Test')
ax.scatter(class_ids, f1_train, color='green', label='Train')
# Fix the tick positions before labelling them, so each label is tied to its
# class rather than to whatever ticks matplotlib happened to choose.
ax.set_xticks(class_ids)
ax.set_xticklabels(['Class ' + str(i) for i in class_ids], fontsize=8)
ax.set_title('F1 score for training and test data')
ax.set_ylabel('F1 score')
ax.legend(loc='lower left')
plt.show()

#### ROC

In [0]:
# y1 and y2 are the label_binarize encodings of y_test and of the predicted
# labels y_eval, so each AUC below is computed from 0/1 predictions rather
# than from scores or probabilities.
for i in range(n_classes):
    print ("Class %s" % i )
    print (roc_auc_score(y1[:,i], y2[:,i], average='weighted') )
    print ('\n' )

###### Confusion matrix

In [0]:
values, counts = np.unique(y_eval, return_counts=True)
print (values)
print (counts)

In [0]:
values, counts = np.unique(y_test, return_counts=True)
print (values)
print (counts)

In [0]:
from sklearn.metrics import confusion_matrix

In [0]:
X = features_ohe
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)
#This is using option 2 above - ohe coding, continuous data imputed

In [0]:
# Pin the label set so the matrix is always 5x5 and lines up with the class
# ticks used when plotting, even if a class is absent from both y_test and y_eval.
conf_forest = confusion_matrix(y_test, y_eval, labels=[0, 1, 2, 3, 4])

In [0]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix; rows are true labels, columns are predicted
        labels (matching the axis labels set below).
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        If True, each row is divided by its sum, so cells show the fraction of
        each true class, printed with two decimals. Rows that sum to zero are
        shown as zeros.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    if normalize:
        cm = np.asarray(cm)
        row_totals = cm.sum(axis=1, keepdims=True)
        # `where` skips empty rows so they stay 0 instead of dividing by zero.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [0]:
# conf_forest holds the random-forest predictions, so the title names that model.
plot_confusion_matrix(conf_forest, classes=[0, 1, 2, 3, 4], title = 'Confusion matrix - Random Forest')
plt.show()