In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import datasets
from math import sqrt

import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.optimize as opt
from scipy import stats
import seaborn as sns
import pandas as pd
import numpy as np
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides any warnings raised by the libraries used
# below (e.g. solver or deprecation warnings) — confirm that is intended.
warnings.filterwarnings("ignore")
# Apply seaborn's "ticks" style to all figures drawn afterwards.
sns.set_style("ticks")
%matplotlib inline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score

In [None]:
# Load the cleaned feature table; the path is relative to the notebook's
# working directory.
fs_data = pd.read_csv('../data/clean/bcdr_d0G_medc.csv')
# Preview the first rows as the cell output.
fs_data.head()

In [None]:
# Predictor matrix: the ten feature columns used by the models below.
x_values = fs_data.loc[:, [
    'mammography_nodule',
    'i_mean',
    'i_skewness',
    's_x_center_mass',
    's_y_center_mass',
    's_solidity',
    's_extent',
    't_corr',
    't_homo',
    't_senth',
]]
# Target vector.
y_values = fs_data["diagnosis"]
# Cell output: number of samples per diagnosis class.
y_values.value_counts()

In [None]:
# Data Split
from sklearn.model_selection import train_test_split
# Fixed seed so the split is reproducible; later cells reuse `seed`.
seed = 1234
# Hold out 30% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.3, random_state=seed)

## Metrics Used

In [None]:
def print_metrics(y_test, y_pred):
    """Draw a confusion-matrix heatmap and print classification metrics.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.

    Side effects
    ------------
    Draws the heatmap on the current matplotlib axes and prints the
    classification report (classes displayed as '0' and '1') followed by
    the accuracy. Returns ``None``.
    """
    cm = confusion_matrix(y_test, y_pred=y_pred)
    # fmt="d" prints the integer counts verbatim; seaborn's default ".2g"
    # renders counts of 100 or more in scientific notation (e.g. 1.2e+02).
    ax = sns.heatmap(cm, annot=True, fmt="d")
    # confusion_matrix places true labels on rows and predictions on columns.
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    target_names = ['0', '1']
    print(classification_report(y_test, y_pred, target_names=target_names))
    print('Accuracy: {0}'.format(accuracy_score(y_test, y_pred)))
    
def plot_roc(y_test, y_pred, model):
    """Plot a ROC curve, annotated with its AUC, on the current axes.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Values handed to ``roc_curve`` and ``roc_auc_score`` as the score
        argument.
    model : str
        Name interpolated into the plot title.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_pred)
    area = roc_auc_score(y_test, y_pred)

    axes = plt.gca()
    # Diagonal reference line from (0, 0) to (1, 1).
    axes.plot([0, 1], [0, 1], color='red', linestyle='--')
    axes.plot(false_pos_rate, true_pos_rate, marker='.',
              label='AUC: %0.2f' % area, color='green')
    axes.set_title('{0} (ROC + AUC)'.format(model))
    axes.set_xlabel('False Positive Rate (x)')
    axes.set_ylabel('True Positive Rate (y)')
    axes.legend(loc="lower right")
    axes.grid()
    plt.show()

## Logistic Regression (Not Normalized)

In [None]:
# Baseline: fit on the raw (unscaled) training features, then evaluate the
# hard-label predictions on the held-out split.
lr_model = LogisticRegression().fit(x_train, y_train)
y_pred = lr_model.predict(x_test)
print_metrics(y_test, y_pred)

In [None]:
# ROC/AUC need a continuous score: pass the predicted probability of the
# positive class (second column of predict_proba) instead of the hard labels
# from predict(), which collapse the curve to a single operating point.
plot_roc(y_test, lr_model.predict_proba(x_test)[:, 1], 'Logistic Regression - NN')

## Logistic Regression (Normalized Z-Score)

In [None]:
from sklearn import preprocessing
from scipy.stats import zscore
# Z-score every column except the target, then re-attach the untouched target.
# NOTE(review): the statistics are computed on the full table, i.e. before any
# train/test split — confirm that is acceptable for the comparison.
fs_data_n = (
    fs_data
    .drop(columns=["diagnosis"])
    .apply(zscore)
    .assign(diagnosis=fs_data['diagnosis'])
)
fs_data_n.head()

In [None]:
# Use the same ten predictors as the non-normalized model so the two runs
# differ only in scaling. (This selection used to be overwritten immediately
# by "every column except diagnosis", which made it dead code and could feed
# the normalized model a different feature set.)
nx_values = fs_data_n[
    ['mammography_nodule', 'i_mean',     'i_skewness', 's_x_center_mass',
    's_y_center_mass',    's_solidity', 's_extent',   't_corr','t_homo', 't_senth']]
ny_values = fs_data_n["diagnosis"]
# Same test fraction and seed as the raw-feature split.
nx_train, nx_test, ny_train, ny_test = train_test_split(nx_values, ny_values, test_size=0.3, random_state=seed)

In [None]:
# Refit on the z-scored training features (rebinds `lr_model`), then evaluate
# the hard-label predictions on the z-scored held-out split.
lr_model = LogisticRegression().fit(nx_train, ny_train)
ny_pred = lr_model.predict(nx_test)
print_metrics(ny_test, ny_pred)

In [None]:
# ROC/AUC need a continuous score: pass the predicted probability of the
# positive class (second column of predict_proba) instead of the hard labels
# from predict(), which collapse the curve to a single operating point.
plot_roc(ny_test, lr_model.predict_proba(nx_test)[:, 1], 'Logistic Regression(Normalized)')

## Learning Curves w/Cross Validation

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``learning_curve``.
    title : str
        Figure title.
    X, y : array-like
        Features and targets forwarded to ``learning_curve``.
    ylim : tuple, optional
        Unpacked into the y-axis limits when given.
    cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the new figure current.
    """
    plt.figure()
    axes = plt.gca()
    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    axes.grid()

    # (mean, std, colour, legend label) per curve, aggregated over the CV
    # splits (axis 1).
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for centre, spread, colour, _ in curves:
        axes.fill_between(sizes, centre - spread, centre + spread,
                          alpha=0.1, color=colour)
    for centre, _, colour, label in curves:
        axes.plot(sizes, centre, 'o-', color=colour, label=label)

    axes.legend(loc="best")
    return plt

In [None]:
# Learning curve for logistic regression on the raw feature matrix (x_values).
title = "Learning Curves (Logistic Regression NN)"
# 100 random shuffles, each holding out 30% for validation; the fixed
# random_state keeps the splits reproducible.
cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=0)

estimator = LogisticRegression()
# The value returned by plot_learning_curve becomes this cell's output.
plot_learning_curve(estimator, title, x_values, y_values, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

In [None]:
# Learning curve for logistic regression on the z-scored feature matrix
# (nx_values).
title = "Learning Curves (Logistic Regression)"
# 100 random shuffles, each holding out 30% for validation; the fixed
# random_state keeps the splits reproducible.
cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=0)

estimator = LogisticRegression()
# The value returned by plot_learning_curve becomes this cell's output.
plot_learning_curve(estimator, title, nx_values, ny_values, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

# Polynomial Degree Curve

In [None]:
def plot_curve(train_error, test_error):
    """Plot train and test error against model complexity.

    Parameters
    ----------
    train_error, test_error : numpy.ndarray
        1-D arrays in which index ``i`` holds the error at complexity ``i``.
        Index 0 is treated as a placeholder and is not plotted.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Entry i belongs at x = i, so the axis starts at 1. It previously started
    # at 0, which shifted every point one step to the left.
    complexity = np.arange(1, train_error.shape[0])
    plt.plot(complexity, train_error[1:], label = 'train error')
    plt.plot(complexity, test_error[1:], label = 'test error', color= 'green')
    plt.title('Training Set')
    plt.xlabel('Polynomial Complexity (x)')
    plt.ylabel('MSE (y)')
    plt.grid()
    plt.legend()
    plt.show()

    
def obtain_error(x_train, y_train, x_test, y_test):
    """Fit a default ``LogisticRegression`` and measure its error on both splits.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels used to fit the classifier.
    x_test, y_test : array-like
        Held-out features and labels.

    Returns
    -------
    tuple
        ``(train_error, test_error)``: the mean squared error between the
        true and the predicted labels on the training and test split.
    """
    # The unused locals (shape unpacking, zero-initialised errors, hyp_model)
    # were dropped; none of them influenced the result.
    classifier = LogisticRegression().fit(x_train, y_train)
    train_error = mean_squared_error(y_train, classifier.predict(x_train))
    test_error = mean_squared_error(y_test, classifier.predict(x_test))
    return train_error, test_error


def create_curve(fs_data, seed):
    """Plot train/test error as leading feature columns are added one by one.

    The table is split 70/30; for ``i = 1 .. n`` a model is fitted on the
    first ``i`` feature columns and its train/test errors are recorded, then
    both error series are passed to ``plot_curve``.

    Parameters
    ----------
    fs_data : pandas.DataFrame
        Table whose last column is the target and whose remaining columns
        are the features, in the order they should be added.
    seed : int
        ``random_state`` for the train/test split.
    """
    train, test = train_test_split(fs_data, test_size=0.3, random_state=seed)

    # Last column is the target; everything before it is a feature.
    nx_train = train.iloc[:, :-1].values
    ny_train = train.iloc[:, -1].values
    nx_test = test.iloc[:, :-1].values
    ny_test = test.iloc[:, -1].values

    n = nx_train.shape[1]
    # Slot i stores the errors for the first i features; slot 0 is an unused
    # placeholder (plot_curve skips it), hence n + 1 entries.
    train_error = np.zeros(n + 1)
    test_error = np.zeros(n + 1)

    # Run through n inclusive so the model using every feature is evaluated
    # too; range(1, n) stopped one feature short.
    for i in range(1, n + 1):
        train_error[i], test_error[i] = obtain_error(
            nx_train[:, 0:i], ny_train, nx_test[:, 0:i], ny_test)
    plot_curve(train_error, test_error)

In [None]:
# Raw features with the target placed last, the column layout create_curve
# slices on.
new_data = fs_data.loc[:, [
    'mammography_nodule',
    'i_mean',
    'i_skewness',
    's_x_center_mass',
    's_y_center_mass',
    's_solidity',
    's_extent',
    't_corr',
    't_homo',
    't_senth',
    'diagnosis',
]]
create_curve(new_data, seed)
print(new_data.shape)

In [None]:
# Z-scored features with the target placed last, the column layout
# create_curve slices on.
nnew_data = fs_data_n.loc[:, [
    'mammography_nodule',
    'i_mean',
    'i_skewness',
    's_x_center_mass',
    's_y_center_mass',
    's_solidity',
    's_extent',
    't_corr',
    't_homo',
    't_senth',
    'diagnosis',
]]
create_curve(nnew_data, seed)