# Data Exploration 05

In [None]:
import datetime
# Stamp the notebook with the wall-clock time at which this cell was executed
now = datetime.datetime.now()
timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
print(f'Version: {timestamp}')

# Modelling

In [None]:
# import dataset
import pandas as pd

# Location of the benchmark table; the same path is handed to the loader in a later cell
dataset = './data/dataset_benchmark.csv'

# Parse the whole CSV into a DataFrame for the state-change overview
df = pd.read_csv(filepath_or_buffer=dataset)

In [None]:
# recall the number of state changes within the total dataset
import numpy as np
import seaborn as sns

def get_state_changes(df):
    """Build a 'from -> to' label for each row whose diagnosis changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``diagnosis`` and ``target_diagnosis``.
        The labels are built with ``+``, so string-valued columns are expected.

    Returns
    -------
    pandas.Series
        Aligned with ``df``: ``'<diagnosis> -> <target_diagnosis>'`` on rows where
        the two columns differ, NaN on rows where they are equal.
    """
    current = df['diagnosis']
    target = df['target_diagnosis']
    transition = current + ' -> ' + target
    # keep the label only on rows that actually change state
    return transition.where(current != target, np.nan)

# One transition label per row of df (NaN where the diagnosis does not change)
df_changes = get_state_changes(df)

# notna() counts only the rows that carry a transition label
print(f'total n state changes: {df_changes.notna().sum()}')
print(df_changes.value_counts())

# Pass the series by keyword: from seaborn 0.12 on, the only positional argument of
# countplot is `data`, so a positional series is no longer interpreted as `x`.
sns.countplot(x=df_changes)

In [None]:
# select featureset and import the training data
from utils import utils

# featureset = './features/features_banchmark.yaml'
featureset = './features/features_custom_2.yaml'

# Note: `df` is rebound here and replaces the frame read directly from the CSV.
(df, x, y,
 x_train, x_test,
 y_train, y_test) = utils.load_benchmark_dataset(
    dataset,
    featureset,
    dropna=True,
    only_state_changes=False,
)

# Display names handed to the yellowbrick classification visualizers.
# Assumes this order matches the label encoding of y -- TODO confirm.
classes = ['CN', 'MCI', 'AD']

# Classification

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

C = 1
# C = 1000000

# Standardise the features, then classify with a class-weighted SVC;
# probability=True is required for the predict_proba call in the next cell.
scaled_svc = Pipeline(steps=[
    ('scaler', StandardScaler()),
    (
        'svc',
        SVC(
            C=C,
            class_weight='balanced',
            random_state=42,
            probability=True,
        ),
    ),
])

# Last expression of the cell: the fitted pipeline is also shown as the cell output
scaled_svc.fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# Class-membership probabilities predicted for x_test
preds = scaled_svc.predict_proba(x_test)

# One-vs-one multiclass ROC AUC; as the last expression it becomes the cell output
roc_auc_score(y_true=y_test, y_score=preds, multi_class='ovo')

In [None]:
from yellowbrick.classifier import ROCAUC
from yellowbrick.datasets import load_spam

# ROC curves for the SVC pipeline, with macro averaging switched off
visualizer = ROCAUC(
    scaled_svc,
    classes=classes,
    macro=False,
)

visualizer.fit(X=x_train, y=y_train)
visualizer.score(X=x_test, y=y_test)   # curves are computed on the test split
visualizer.show()

In [None]:
# pr statistics - compare weighted squared f1 not accuracy across models
from yellowbrick.classifier import ClassificationReport

# Per-class report for the SVC pipeline; support=True is passed so the report
# includes the support column
visualizer = ClassificationReport(
    scaled_svc,
    support=True,
    classes=classes,
)

visualizer.fit(X=x_train, y=y_train)    # fit step on the training split
visualizer.score(X=x_test, y=y_test)    # metrics are evaluated on the test split
visualizer.show()

In [None]:
# pr curve
from yellowbrick.classifier import PrecisionRecallCurve

# Precision-recall curve for the SVC pipeline, labelled with the diagnosis names
visualizer = PrecisionRecallCurve(
    scaled_svc,
    classes=classes,
)

visualizer.fit(X=x_train, y=y_train)
visualizer.score(X=x_test, y=y_test)   # curve is computed on the test split
visualizer.show()

In [None]:
# look at the predictions
from yellowbrick.classifier import ClassPredictionError

# Wrap the SVC pipeline in the class-prediction-error visualizer
visualizer = ClassPredictionError(
    scaled_svc,
    classes=classes,
)

visualizer.fit(X=x_train, y=y_train)
visualizer.score(X=x_test, y=y_test)   # predictions are compared on the test split
visualizer.show()

In [None]:
from yellowbrick.model_selection import RFECV

# Recursive feature elimination with cross-validation, driven by a linear-kernel SVC.
# NOTE(review): unlike scaled_svc, this estimator sees the features without a
# StandardScaler step -- confirm that is intended.
visualizer = RFECV(SVC(kernel='linear', C=1))

# Select features on the training split only: running the elimination on
# x_test / y_test would leak the evaluation data into feature selection.
visualizer.fit(x_train, y_train)
visualizer.show()

# Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

C = 1
# C = 1000000

# Standardise the features, then regress with an SVR.
# The estimator step keeps the label 'svc' even though it wraps an SVR.
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVR(C=C)),
])

# Last expression of the cell: the fitted pipeline is also shown as the cell output
model.fit(x_train, y_train)

In [None]:
import seaborn as sns
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# SVR outputs for x_test. Note: this rebinds `preds`, which an earlier cell
# used for the classifier's class probabilities.
preds = model.predict(x_test)

# sns.distplot is deprecated (seaborn >= 0.11); histplot with a KDE overlay and
# density scaling draws the equivalent histogram-plus-KDE view of the predictions.
sns.histplot(preds, kde=True, stat='density')

In [None]:
from yellowbrick.regressor import ResidualsPlot

# Residuals plot for the SVR pipeline
visualizer = ResidualsPlot(model)

visualizer.fit(X=x_train, y=y_train)    # fit step on the training split
visualizer.score(X=x_test, y=y_test)    # scoring step on the test split
visualizer.show()