In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler as ss
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc, roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
import operator
import math
from sklearn.metrics import f1_score
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [2]:
#***************************************************Feature Selection Method************************************************
def correlation_filter(data_duplicate, threshold,
                       heatmap_path=r'C:\Users\Ren\Desktop\Test2_heatmap.png'):
    """
    Remove features that are highly correlated with an earlier, retained column and
    draw a heatmap of the correlations between the columns that remain.

    Columns are scanned in their correlation-matrix order. A column is deleted when its
    correlation with any earlier column that has not itself been deleted is greater than
    or equal to ``threshold``. The signed correlation is compared, so strongly negative
    correlations never trigger a deletion.

    :param data_duplicate : copied version of the original data-set; it is modified in place
    :param threshold : correlation value at or above which the later column of a pair is deleted
    :param heatmap_path : file the heatmap is saved to (defaults to the original location);
                          pass None to skip saving
    :return data_duplicate : the same object that was passed in, with the correlated columns deleted

    """
    col_corr = set()  # Set of all the names of deleted columns
    corr_matrix = data_duplicate.corr()
    column_names = corr_matrix.columns
    # Delete every column whose correlation with an earlier, still-kept column reaches the threshold
    for i in range(len(column_names)):
        for j in range(i):
            if (corr_matrix.iloc[i, j] >= threshold) and (column_names[j] not in col_corr):
                colname = column_names[i]
                col_corr.add(colname)
                if colname in data_duplicate.columns:
                    del data_duplicate[colname]  # deleting the column from the dataset
                break  # column i is already removed; further comparisons cannot change that

    corr_matrix_newdata = data_duplicate.corr()
    attribute_names_newdata = corr_matrix_newdata.columns.tolist()
    # Figure to see the heatmap of filtered features
    figure_2 = plt.figure(figsize=(5, 5))
    plt.title('Correlation between reduced features', fontsize=20)
    new_feature_data = sns.heatmap(corr_matrix_newdata, vmin=0, vmax=1, annot=True, annot_kws={"size": 6}, fmt='.2g',
                                   cmap='PiYG',
                                   linewidth=2, linecolor='black', cbar=True, xticklabels=attribute_names_newdata,
                                   yticklabels=attribute_names_newdata)
    new_feature_data.set_xticklabels(
        new_feature_data.get_xticklabels(),
        rotation=45,
        horizontalalignment='right',
        fontweight='light',
        fontsize=7)
    new_feature_data.set_yticklabels(
        new_feature_data.get_yticklabels(),
        fontweight='light',
        fontsize=7)
    # Widen the y-limits by 0.75 at both ends.
    # NOTE(review): presumably the usual workaround for heatmap rows being clipped by some
    # matplotlib releases - confirm it is still needed.
    x, y = plt.ylim()
    x += 0.75
    y -= 0.75
    plt.ylim(x, y)
    # Save before showing so the file is written while the figure is still the live one
    if heatmap_path is not None:
        figure_2.savefig(heatmap_path)
    plt.show()
    return data_duplicate

In [3]:
#*************************************************Random Forest Classifier***************************************************

def RandomForest_Classifier(x_train, x_test, y_train, y_test, random_state=None):
    """
    Fit two random-forest configurations, compare their test accuracy and report the better one.

    The two options are a forest with library defaults ("Default") and a hand-set
    configuration ("Parameter_Tuning": entropy criterion, 10 trees, min_samples_leaf=2).
    A bar chart of both accuracies and a heatmap of the selected option's confusion matrix
    are shown, and that confusion matrix is printed.

    Precision is computed for class 1 from scikit-learn's confusion-matrix layout
    (rows = true label, columns = predicted label): TP = cm[1][1], FP = cm[0][1].

    :param x_train : training features
    :param x_test : test features
    :param y_train : training labels (binary; the confusion matrix is indexed as 2x2)
    :param y_test : test labels
    :param random_state : optional seed passed to both forests; None keeps the unseeded behaviour
    :return : tuple (accuracy, precision, option name, confusion matrix), all taken from the
              option with the highest accuracy (the first one wins a tie)
    """
    # 'max_features="auto"' and 'min_impurity_split' are no longer accepted by current
    # scikit-learn; 'sqrt' is what 'auto' meant for classifiers.
    candidates = {
        "Default": RandomForestClassifier(random_state=random_state),
        "Parameter_Tuning": RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                                                   max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                                                   min_samples_leaf=2,
                                                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                   n_jobs=None, oob_score=False, random_state=random_state,
                                                   n_estimators=10, verbose=0, warm_start=False),
    }

    Dictionary2 = {}  # option name -> accuracy (also drives the bar chart order)
    precisions = {}
    matrices = {}
    for option, model in candidates.items():
        model.fit(x_train, y_train)
        cm = confusion_matrix(y_test, model.predict(x_test))
        true_positive = cm[1][1]
        false_positive = cm[0][1]
        predicted_positive = true_positive + false_positive
        Dictionary2[option] = cm.diagonal().sum() / cm.sum()
        # No positive predictions at all: report 0.0 instead of dividing by zero
        precisions[option] = float(true_positive / predicted_positive) if predicted_positive else 0.0
        matrices[option] = cm

    best_option = max(Dictionary2, key=Dictionary2.get)
    best_cm = matrices[best_option]

    print("\nConfusion Matrix for RFC\n", best_cm)
    plt.title('Plot showing the accuracy for each Options of RFC')
    plt.bar(range(len(Dictionary2)), list(Dictionary2.values()), align='center')
    plt.xticks(range(len(Dictionary2)), list(Dictionary2.keys()))
    plt.xlabel('Different Options used in Random Forest Classifier')
    plt.ylabel(' Accuracy')
    plt.show()
    sns.heatmap(best_cm, annot=True, cmap='YlOrRd')
    plt.title('Confusion Matrix for Random Forest Classifier', fontsize=20)
    # Widen the y-limits by 0.75 at both ends.
    # NOTE(review): presumably a workaround for clipped heatmap rows - confirm it is still needed.
    x, y = plt.ylim()
    x += 0.75
    y -= 0.75
    plt.ylim(x, y)
    plt.show()
    return Dictionary2[best_option], precisions[best_option], best_option, best_cm

In [5]:
#*********************************************************SVM***************************************************************
def KSVM_Classifier(X_train, X_test, y_train, y_test):
    """
    Fit support-vector classifiers with the rbf, sigmoid and degree-8 polynomial kernels,
    compare their test accuracy and report the best kernel.

    A bar chart of the three accuracies and a heatmap of the selected kernel's confusion
    matrix are shown, and that confusion matrix is printed.

    Precision is computed for class 1 from scikit-learn's confusion-matrix layout
    (rows = true label, columns = predicted label): TP = cm[1][1], FP = cm[0][1].

    :param X_train : training features
    :param X_test : test features
    :param y_train : training labels (binary; the confusion matrix is indexed as 2x2)
    :param y_test : test labels
    :return : tuple (accuracy, precision, kernel name, confusion matrix), all taken from the
              kernel with the highest accuracy (earlier entries win a tie: rbf, sigmoid, poly)
    """
    candidates = {
        "rbf": SVC(kernel='rbf'),
        "sigmoid": SVC(kernel='sigmoid'),
        "poly": SVC(kernel='poly', degree=8),
    }

    Dictionary1 = {}  # kernel name -> accuracy (also drives the bar chart order)
    precisions = {}
    matrices = {}
    for kernel_name, model in candidates.items():
        model.fit(X_train, y_train)
        cm = confusion_matrix(y_test, model.predict(X_test))
        true_positive = cm[1][1]
        false_positive = cm[0][1]
        predicted_positive = true_positive + false_positive
        Dictionary1[kernel_name] = float(cm.diagonal().sum()) / len(y_test)
        # No positive predictions at all: report 0.0 instead of dividing by zero
        precisions[kernel_name] = float(true_positive / predicted_positive) if predicted_positive else 0.0
        matrices[kernel_name] = cm

    SVC_max_accuracy = max(Dictionary1, key=Dictionary1.get)
    best_cm = matrices[SVC_max_accuracy]

    print("\nConfusion matrix for kSVM\n ", best_cm)
    plt.title('Plot showing the accuracy for each kernal model')
    plt.bar(range(len(Dictionary1)), list(Dictionary1.values()), align='center')
    plt.xticks(range(len(Dictionary1)), list(Dictionary1.keys()))
    plt.xlabel('Different Kernels used in SVM')
    plt.ylabel(' Accuracy')
    plt.show()
    sns.heatmap(best_cm, annot=True, cmap='YlOrRd')
    plt.title('Confusion Matrix for KSVM', fontsize=20)
    # Widen the y-limits by 0.75 at both ends.
    # NOTE(review): presumably a workaround for clipped heatmap rows - confirm it is still needed.
    x, y = plt.ylim()
    x += 0.75
    y -= 0.75
    plt.ylim(x, y)
    plt.show()
    return Dictionary1[SVC_max_accuracy], precisions[SVC_max_accuracy], SVC_max_accuracy, best_cm


In [6]:
#**********************************************************KNN**************************************************************
def KNN_Classifier(x_train, x_test, y_train, y_test, cv=None,
                   accuracy_plot_path=r'C:\Users\Ren\Desktop\Test2_fig5_accuracy.png',
                   confusion_matrix_plot_path=r'C:\Users\Ren\Desktop\Test2_fig6_knn_confusionMatrix.png'):
    """
    Pick the number of neighbours for a KNN classifier from the odd values 1..49, refit with
    that value and report its test metrics.

    With the default ``cv=None`` the value of K is the one with the highest accuracy on the
    test split (the first such K on ties).
    NOTE(review): choosing K on the test split makes the reported accuracy optimistic. Pass an
    integer ``cv`` to choose K by mean cross-validated accuracy on the training split instead.
    Earlier revisions ran 10-, 15- and 20-fold cross-validation for every K but never used
    the scores; that unused work has been removed.

    Author's note from an earlier run (not verified here): cross-validated accuracy was the
    same at K=3, 7 and 13 with cv=10, and pointed to K=13 again with cv=15.

    Precision is computed for class 1 from scikit-learn's confusion-matrix layout
    (rows = true label, columns = predicted label): TP = cm[1][1], FP = cm[0][1].

    :param x_train : training features
    :param x_test : test features
    :param y_train : training labels (binary; the confusion matrix is indexed as 2x2)
    :param y_test : test labels
    :param cv : None, or the number of folds used to select K on the training split
    :param accuracy_plot_path : file for the accuracy-vs-K plot; None skips saving
    :param confusion_matrix_plot_path : file for the confusion-matrix heatmap; None skips saving
    :return : tuple (accuracy, precision, selected K, confusion matrix) of the refitted model
    """
    k_values = list(range(1, 50, 2))
    accuracy_knn = []   # test accuracy for every candidate K (this is what gets plotted)
    cv_accuracy = []    # mean cross-validated accuracy, only filled when cv is given
    for k in k_values:
        candidate = KNeighborsClassifier(n_neighbors=k)
        candidate.fit(x_train, y_train)
        accuracy_knn.append(accuracy_score(y_test, candidate.predict(x_test)))
        if cv is not None:
            fold_scores = cross_val_score(candidate, x_train, y_train, cv=cv, scoring='accuracy')
            cv_accuracy.append(fold_scores.mean())

    figure_4 = plt.figure(figsize=(5, 5))
    plt.title('Relation between K and corresponding accuracy of KNN model', fontsize=20)
    plt.plot(k_values, accuracy_knn, color='#4b0082')
    plt.xlabel('Value of K for KNN')
    plt.ylabel(' Accuracy')
    if accuracy_plot_path is not None:
        figure_4.savefig(accuracy_plot_path)

    selection_scores = accuracy_knn if cv is None else cv_accuracy
    c = k_values[selection_scores.index(max(selection_scores))]

    KNN_Classifier_Cancer = KNeighborsClassifier(n_neighbors=c, metric='minkowski', p=2)
    KNN_Classifier_Cancer.fit(x_train, y_train)
    KNN_y_pred = KNN_Classifier_Cancer.predict(x_test)

    KNN_Confusion_Matrix = confusion_matrix(y_test, KNN_y_pred)
    print('\nConfusion Matrix for KNN\n', KNN_Confusion_Matrix)
    KNN_True_Positive = KNN_Confusion_Matrix[1][1]
    KNN_False_Positive = KNN_Confusion_Matrix[0][1]
    predicted_positive = KNN_True_Positive + KNN_False_Positive

    KNN_accuracy = KNN_Confusion_Matrix.diagonal().sum() / KNN_Confusion_Matrix.sum()
    # No positive predictions at all: report 0.0 instead of dividing by zero
    precision_knn = float(KNN_True_Positive / predicted_positive) if predicted_positive else 0.0

    figure_5 = plt.figure(figsize=(10, 10))
    sns.heatmap(KNN_Confusion_Matrix, annot=True, cmap='YlOrRd')
    plt.title('Confusion Matrix for KNN Classifier', fontsize=20)
    # Widen the y-limits by 0.75 at both ends.
    # NOTE(review): presumably a workaround for clipped heatmap rows - confirm it is still needed.
    x, y = plt.ylim()
    x += 0.75
    y -= 0.75
    plt.ylim(x, y)
    # Save through the figure object and before show(): pyplot's savefig after show()
    # targets a fresh, empty figure and writes a blank image.
    if confusion_matrix_plot_path is not None:
        figure_5.savefig(confusion_matrix_plot_path)
    plt.show()
    return KNN_accuracy, precision_knn, c, KNN_Confusion_Matrix

def SVM_Classifier(X_train, X_test, y_train, y_test):
    """
    Train a linear-kernel support-vector classifier and evaluate it on the test split.

    Shows a heatmap of the confusion matrix and prints the matrix.

    :param X_train : training features
    :param X_test : test features
    :param y_train : training labels
    :param y_test : test labels
    :return : tuple (accuracy, precision from sklearn's precision_score, the string "Linear",
              confusion matrix)
    """
    from sklearn import metrics

    linear_model = svm.SVC(kernel='linear')
    linear_model.fit(X_train, y_train)
    predictions = linear_model.predict(X_test)

    # Accuracy = correctly classified samples (matrix diagonal) over all test samples
    matrix = confusion_matrix(y_test, predictions)
    accuracy = float(matrix.diagonal().sum()) / len(y_test)

    sns.heatmap(matrix, annot=True, cmap='YlOrRd')
    plt.title('Confusion Matrix for SVM', fontsize=20)
    # Widen the y-limits by 0.75 at both ends before displaying the heatmap
    bottom, top = plt.ylim()
    plt.ylim(bottom + 0.75, top - 0.75)
    plt.show()

    print("\nConfusion Matrix for SVM \n", matrix)

    precision = metrics.precision_score(y_test, predictions)
    return accuracy, precision, "Linear", matrix

SyntaxError: invalid syntax (<ipython-input-6-205f8b415f8e>, line 1)

In [7]:
#*****************************************************ReportPrint*************************************************************
def Report(list_accuracy):
    """
    Print a four-column results table.

    The first entry of ``list_accuracy`` is the header row (four strings) and is printed
    between two dashed rules. Every following entry is a data row of two strings followed by
    two numbers, printed with 6 and 5 decimal places respectively. Header and data rows share
    the same column widths (18 / 24 / 14 / 14 characters) so the columns line up.

    :param list_accuracy : list of rows; an empty list prints nothing
    """
    if not list_accuracy:
        return
    dash = '-' * 70  # 18 + 24 + 14 + 14, the total table width
    header, *rows = list_accuracy
    print(dash)
    print('{:<18s}{:<24s}{:>14s}{:>14s}'.format(header[0], header[1], header[2], header[3]))
    print(dash)
    for row in rows:
        print('{:<18s}{:<24s}{:>14.6f}{:>14.5f}'.format(row[0], row[1], row[2], row[3]))


In [None]:
# ************************************************Main Code**************************************************
# Settings a reader may want to change, gathered in one place
CORRELATION_THRESHOLD = 0.55
CORRELATION_HEATMAP_PATH = r'C:\Users\Ren\Desktop\Test2_fig1_th55.png'
FILTERED_DATASET_PATH = r'C:\Users\Ren\Desktop\Test3_filtered_hm.csv'

# Read the featured dataset
Original_Dataset = pd.read_csv('wdbc_data.csv')

# drop the ID column from the data-set
Original_Dataset = Original_Dataset.drop(['ID'], axis=1)
# Convert the categorical diagnosis to binary (B -> 0, M -> 1). The result is assigned back to the
# column: calling replace(..., inplace=True) on a column selection does not reliably update the
# frame under pandas copy-on-write. infer_objects() turns the converted column into a numeric dtype.
Original_Dataset['Diagnosis'] = Original_Dataset['Diagnosis'].replace({'B': 0, 'M': 1}).infer_objects()

# creating duplicate copy of the original data-set (the filter below deletes columns in place)
Original_Dataset_duplicate = Original_Dataset.copy()
# finding the correlation between all the features in the original data-set
correlation_ds1 = Original_Dataset.corr()
attribute_names = correlation_ds1.columns.tolist()
# Figure_1 is the heatmap of the correlation between all the features in the data-set
Figure_1 = plt.figure(figsize=(60, 60))
Figure_1.suptitle('Correlation between all the features', fontsize=10)
Feature_correlation_ds1 = sns.heatmap(correlation_ds1,
                                      vmin=0, vmax=1, annot=True, annot_kws={"size": 6}, fmt='.2g',
                                      cmap='winter', linewidth=2, linecolor='black', cbar=True,
                                      xticklabels=attribute_names, yticklabels=attribute_names)
Feature_correlation_ds1.set_xticklabels(
    Feature_correlation_ds1.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    fontweight='light',
    fontsize=7)
Feature_correlation_ds1.set_yticklabels(
    Feature_correlation_ds1.get_yticklabels(),
    fontweight='light',
    fontsize=7)
# Widen the y-limits by 0.75 at both ends.
# NOTE(review): presumably a workaround for clipped heatmap rows - confirm it is still needed.
b, t = plt.ylim()
b += 0.75
t -= 0.75
plt.ylim(b, t)
# saving the image before showing it, while the figure is still the live one
Figure_1.savefig(CORRELATION_HEATMAP_PATH)
plt.show()

# Filter the dataset: drop features whose correlation with an earlier kept column reaches the threshold.
# NOTE(review): the Diagnosis column is part of the frame handed to the filter, so any feature whose
# correlation with Diagnosis reaches the threshold is removed as well - confirm this is intended.
Filtered_Dataset = correlation_filter(Original_Dataset_duplicate, CORRELATION_THRESHOLD)

# saving the filtered data-set as a CSV file (the DataFrame index is written as the first column)
Filtered_Dataset.to_csv(FILTERED_DATASET_PATH)

# Reading the filtered data back; the saved index comes back as an extra leading column
filtered_dataset = pd.read_csv(FILTERED_DATASET_PATH)

# splitting dataset into features and output
# Column 0 is the saved index; column 1 is presumably Diagnosis (verify against the CSV header).
# NOTE(review): the 2:10 slice keeps at most eight feature columns; any further columns left by the
# filter are ignored - confirm this is intended.
features_data = filtered_dataset.iloc[:, 2:10]
output_data = filtered_dataset.iloc[:, 1]

# Split the features_data and output_data into test and train
X_train, X_test, y_train, y_test = train_test_split(features_data, output_data, test_size=0.30, random_state=0)

# Standardise the features; the scaler is fitted on the training split only
scale_data = StandardScaler()
X_train = scale_data.fit_transform(X_train)
X_test = scale_data.transform(X_test)

# calling knn classifier
Accuracy_KNN, Precision_KNN, Parameter_KNN, CM_KNN = KNN_Classifier(X_train, X_test, y_train, y_test)
# calling SVM (linear kernel) classifier
Accuracy_SVM, Precision_SVM, Parameter_SVM, CM_SVM = SVM_Classifier(X_train, X_test, y_train, y_test)
# calling kSVM classifier
Accuracy_KSVM, Precision_KSVM, Parameter_KSVM, CM_KSVM = KSVM_Classifier(X_train, X_test, y_train, y_test)
# calling RFC classifier
Accuracy_RFC, Precision_RFC, Parameter_RFC, CM_RFC = RandomForest_Classifier(X_train, X_test, y_train, y_test)

# One row per classifier; the second column names the selected kernel / K / option
list_accuracy = [
    ['Classifier_Name', 'Parameter/Method', 'Accuracy', 'Precision'],
    ['KSVM', 'Kernel=' + str(Parameter_KSVM), Accuracy_KSVM, Precision_KSVM],
    ['KNN', 'K=' + str(Parameter_KNN), Accuracy_KNN, Precision_KNN],
    ['RFC', 'Option=' + str(Parameter_RFC), Accuracy_RFC, Precision_RFC],
    ['SVM', 'Kernel=' + str(Parameter_SVM), Accuracy_SVM, Precision_SVM],
]

Report(list_accuracy)
