In [1]:
#Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [2]:
def get_w (X_train, y_train, X_test, classifiers, probability=False):
    """Fit every base model and stack their predictions column by column.

    Each estimator in ``classifiers`` is fitted on (X_train, y_train) and then
    asked to predict both X_train and X_test. Every estimator contributes one
    column to ``w_train`` and one column to ``w_test``, in the order the
    estimators are given.

    If probability=True the column holds ``predict_proba(...)[:, 0]`` (only the
    first probability column is kept); otherwise it holds ``predict(...)``.

    Returns the tuple (w_train, w_test). Both entries are None when
    ``classifiers`` is empty.
    """
    train_columns, test_columns = [], []

    for model in classifiers:
        model.fit(X_train, y_train)

        if probability:
            # Only the first probability column is used as the feature.
            train_out = model.predict_proba(X_train)[:, 0]
            test_out = model.predict_proba(X_test)[:, 0]
        else:
            train_out = model.predict(X_train)
            test_out = model.predict(X_test)

        # Turn each 1-D prediction vector into a single column.
        train_columns.append(np.reshape(train_out, (-1, 1)))
        test_columns.append(np.reshape(test_out, (-1, 1)))

    if not train_columns:
        return (None, None)

    w_train = np.concatenate(train_columns, axis=1)
    w_test = np.concatenate(test_columns, axis=1)
    return (w_train, w_test)

In [3]:
def get_w_metrics (clf, X_train, y_train, X_test, y_test, probability=False):
    """Evaluate a stacked model whose final classifier ``clf`` is trained on "w".

    Four base models (LDA, logistic regression, an SVC built with
    probability=True, and a decision tree) are handed to get_w, which returns
    their stacked outputs for the training and the test rows. ``clf`` is then
    fitted on the training outputs and used to predict the test rows.

    ``probability`` is forwarded unchanged to get_w.

    Returns the tuple (confusion matrix, F1 score), both computed by comparing
    y_test with the predictions of ``clf``.
    """
    base_models = [
        LinearDiscriminantAnalysis(),
        LogisticRegression(),
        svm.SVC(probability=True),
        tree.DecisionTreeClassifier(),
    ]
    w_train, w_test = get_w(X_train, y_train, X_test, base_models, probability=probability)

    # The final classifier only ever sees the base models' outputs.
    clf.fit(w_train, y_train)
    predicted = clf.predict(w_test)

    matrix = confusion_matrix(y_test, predicted)
    score = f1_score(y_test, predicted)
    return (matrix, score)

# get_w_metrics on Breast Cancer data:

Let's now apply our code to the breast cancer data, using each of four classifiers (LDA, logistic regression, SVM and a decision tree) in turn as the final model.

In [4]:
# Load the data set and drop every row that contains a missing value
dataset = pd.read_csv('./BreastCancer.csv').dropna()

# Feature columns, selected by position: 1-5 and 8-9 (positions 6 and 7 are left out).
# Alternative kept from the original notebook: np.r_[1:10] selects positions 1-9.
X_columns = np.r_[1:6, 8:10]
X = dataset.iloc[:, X_columns].values

# The target is the last column of the frame
y = dataset.iloc[:, -1].values

In [5]:
def _ordinal(n):
    """Return n followed by its English ordinal suffix (1 -> '1st', 12 -> '12th')."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '%d%s' % (n, suffix)


# Final (stacking) classifiers to compare, with the label printed for each one.
classifiers = [LinearDiscriminantAnalysis(), LogisticRegression(), svm.SVC(probability=True), tree.DecisionTreeClassifier()]
classifier_names = ['LDA', 'LR', 'SVM', 'DT']  # same order as `classifiers`

num_tries = 5
test_size = 0.1

for i in range(num_tries):
    print ('************************************************************************')
    # Splits are numbered from 1, and the label sits on the same line as the
    # rest of the sentence (the old trailing-comma prints broke the line).
    print (_ordinal(i + 1), 'split of data into training and testing')
    print ('************************************************************************')

    # Seeding with the split index gives a different split on every pass while
    # keeping the splits identical across re-runs. The models themselves are
    # not seeded here, so scores can still vary slightly between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=i)

    for name, clf in zip(classifier_names, classifiers):
        print (name + ' -------------------------')
        print(get_w_metrics (clf, X_train, y_train, X_test, y_test))
        print ( '---------------------------\n')


************************************************************************
0 th
split of data into training and testing
************************************************************************
LDA -------------------------
(array([[47,  3],
       [ 1, 18]], dtype=int64), 0.9)
---------------------------

LR -------------------------
(array([[47,  3],
       [ 1, 18]], dtype=int64), 0.9)
---------------------------

SVM -------------------------
(array([[47,  3],
       [ 2, 17]], dtype=int64), 0.8717948717948718)
---------------------------

DT -------------------------
(array([[47,  3],
       [ 2, 17]], dtype=int64), 0.8717948717948718)
---------------------------

************************************************************************
1st
split of data into training and testing
************************************************************************
LDA -------------------------
(array([[38,  3],
       [ 2, 26]], dtype=int64), 0.912280701754386)
---------------------------

LR ----

There are very few mistakes in predicting y_test! Stacking does an amazing job!

# Does balancing data and/or using probability for classification help?

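We would like to see whether balancing the data set improves performance, and whether feeding the final classifier the base models' predicted probabilities (instead of their hard class predictions) helps.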

In [6]:
def balance(df, target, random_state=None):
    """Oversample the smaller classes until every class is as frequent as the largest.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to balance. It is not modified.
    target : column label
        Column of ``df`` that holds the class of each row.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a value to get the same extra
        rows on every call. The default (None) keeps the sampling unseeded.

    Returns
    -------
    pandas.DataFrame
        A new frame holding all rows of ``df`` followed by the randomly sampled
        extra rows. Sampled rows keep their original index labels, so the
        index of the result contains duplicates.

    Notes
    -----
    Extra rows are drawn without replacement whenever the class has enough rows
    to supply them. When a class needs more extra rows than it contains,
    sampling falls back to drawing with replacement, because sampling without
    replacement cannot return more rows than the class has.
    """
    class_sizes = df[target].value_counts()
    if class_sizes.empty:
        # No labelled rows, hence nothing to balance.
        return df.copy(deep=True)
    max_freq = int(class_sizes.max())

    # Collect the pieces first and concatenate once at the end.
    pieces = [df]
    for label in df[target].unique():
        df_i = df[df[target] == label]
        needed_i = max_freq - len(df_i)
        if needed_i == 0:
            continue  # already as frequent as the largest class
        to_add = df_i.sample(
            n=needed_i,
            replace=needed_i > len(df_i),
            random_state=random_state,
        )
        pieces.append(to_add)

    if len(pieces) == 1:
        # Already balanced: still hand back an independent copy.
        return df.copy(deep=True)
    return pd.concat(pieces)

In [7]:
def avg_f1_score (X, y, clf, probability=False, num_tries=100, random_state=None):
    """Average the F1 score of the stacked model over repeated random splits.

    For each of ``num_tries`` rounds the data is split into a training part and
    a test part (10% of the rows), get_w_metrics is run with ``clf`` as the
    final classifier, and the F1 score it returns is accumulated.

    Parameters
    ----------
    X, y : array-like
        Features and target, passed to train_test_split.
    clf : estimator
        Final classifier handed to get_w_metrics.
    probability : bool
        Forwarded to get_w_metrics.
    num_tries : int
        Number of random splits to average over; must be at least 1.
    random_state : int, optional
        When given, round ``i`` is split with seed ``random_state + i`` so the
        splits are repeatable. The default (None) leaves the splits unseeded.
        Only the splits are seeded; the models built in get_w_metrics are not.

    Returns
    -------
    float
        Mean F1 score over all rounds.

    Raises
    ------
    ValueError
        If ``num_tries`` is smaller than 1.
    """
    if num_tries < 1:
        raise ValueError("num_tries must be at least 1, got %r" % (num_tries,))

    test_size = 0.1
    f1_score_sum = 0

    for i in range (num_tries):
        # A different seed per round keeps the rounds distinct.
        split_seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=split_seed)
        f1_score_sum += get_w_metrics (clf, X_train, y_train, X_test, y_test, probability=probability)[1]

    return f1_score_sum/num_tries

In [8]:
classifiers = [LinearDiscriminantAnalysis(), LogisticRegression(), svm.SVC(probability=True), tree.DecisionTreeClassifier()]
# Header printed before the results of each classifier (same order as above)
headers = [
    'LDA -------------------------',
    'LR -------------------------',
    'SVM -------------------------',
    'DT -------------------------',
]

# Original data: selected feature columns and the last column as target
X = dataset.iloc[:, X_columns].values
y = dataset.iloc[:, -1].values

# Balanced data: same columns, taken from the oversampled frame
dataset_b = balance (dataset, 'Class')
X_b = dataset_b.iloc[:, X_columns].values
y_b = dataset_b.iloc[:, -1].values

for j, clf in enumerate(classifiers):
    print (headers[j])

    # Final classifier trained on the base models' hard predictions
    f1_original = avg_f1_score(X, y, clf)
    f1_balanced = avg_f1_score(X_b, y_b, clf)
    print ('average F1 on original dataset vs. balanced dataset when probaility=False')
    print ("%.2f, %.2f" % (f1_original, f1_balanced))

    # Final classifier trained on the base models' predicted probabilities
    f1_original_proba = avg_f1_score(X, y, clf, probability=True)
    f1_balanced_proba = avg_f1_score(X_b, y_b, clf, probability=True)
    print ('average F1 on original dataset vs. balanced dataset when probaility=True')
    print ("%.2f, %.2f" % (f1_original_proba, f1_balanced_proba))

    print ( '---------------------------\n')

LDA -------------------------
average F1 on original dataset vs. balanced dataset when probaility=False
0.93, 0.97
average F1 on original dataset vs. balanced dataset when probaility=True
0.93, 0.97
---------------------------

LR -------------------------
average F1 on original dataset vs. balanced dataset when probaility=False
0.91, 0.97
average F1 on original dataset vs. balanced dataset when probaility=True
0.91, 0.97
---------------------------

SVM -------------------------
average F1 on original dataset vs. balanced dataset when probaility=False
0.91, 0.96
average F1 on original dataset vs. balanced dataset when probaility=True
0.91, 0.97
---------------------------

DT -------------------------
average F1 on original dataset vs. balanced dataset when probaility=False
0.92, 0.96
average F1 on original dataset vs. balanced dataset when probaility=True
0.92, 0.97
---------------------------



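In all cases, F1 on the balanced dataset is higher; using probabilities in classification, however, doesn't significantly improve the F1-score. One caveat: the data is oversampled before it is split into training and test sets, so copies of the same row can end up on both sides of the split, which makes the scores on the balanced dataset optimistic.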
Perhaps a plot would be nice here...!