<a href="https://colab.research.google.com/github/nisanuro/CNG562-Assignment-3/blob/master/CNG562_Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn import metrics, datasets, preprocessing
from sklearn.datasets import load_breast_cancer 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [0]:
def correlation_map(df):
    plt.figure(figsize=(20,12)) 

    corr = df.corr()
    sns.heatmap(corr, annot=True, cmap='hot')
    plt.show()

In [0]:
def filter_features(data, feature_indexes):
		# eliminate above column indices from the data and return new set
		filtered_data = np.delete(data, feature_indexes, axis=1)

		return filtered_data

In [0]:
def fourError(X, Y, model, r, future_scaling):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = r, stratify=Y)
    
    if(future_scaling):
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
    
    Train_x, TrainDev_x, Train_y, TrainDev_y = train_test_split(X_train, Y_train, test_size=0.2, random_state=0, stratify=Y_train)
    Dev_x, Test_x, Dev_y, Test_y = train_test_split(X_test, Y_test, test_size=0.5, random_state=0, stratify=Y_test)

    model.fit(Train_x, Train_y)

    y_true, trainDev_pred = TrainDev_y, model.predict(TrainDev_x)

    print("\nTrain-Train Dev,   e1:", metrics.mean_squared_error(TrainDev_y, trainDev_pred))
    print("Accuracy: ", 1 - metrics.mean_squared_error(TrainDev_y, trainDev_pred),"\n")

    y_true, dev_pred = Dev_y, model.predict(Dev_x)
    print("Train-Dev,   e2", metrics.mean_squared_error(Dev_y, dev_pred))
    print("Accuracy: ", 1 - metrics.mean_squared_error(Dev_y, dev_pred),"\n")

    y_true, test_pred = Test_y, model.predict(Test_x)
    print("Train-Test,   e3: ", metrics.mean_squared_error(Test_y, test_pred))
    print("Accuracy: ", 1 - metrics.mean_squared_error(Test_y, test_pred),"\n")

    y_true, devTest_pred = Y_test, model.predict(X_test)
    print("Train-(Dev+Test),   e4: ", metrics.mean_squared_error(Y_test, devTest_pred))
    print("Accuracy: ", 1 - metrics.mean_squared_error(Y_test, devTest_pred),"\n")

In [0]:
def vis_all_feat(data, class_):
    for col_ind in range(data.shape[1]):
		    print("Viewing Feature #{0}".format(str(col_ind)))
		    vis_single_feat(data, class_, col_ind)

In [0]:
def vis_single_feat(data, class_, ind):
	  # create graph of classification and feature values	
	  plt.figure(100) # display two plots on separate figures
	  df = pd.DataFrame(data)
	  feat_vals = df.iloc[:, ind]
	  plt.scatter(feat_vals, class_)
	  plt.title("Plot of Feature {0}".format(str(ind)))
	  plt.xlabel("Feature Value")
	  plt.ylabel("Classification")
  
	  # create bar graph of mean feature values for each classification
	  plt.figure(200)
	  plt.title("Mean Values of Feature {0}".format(str(ind)))
	  plt.xlabel("Classification")
	  plt.ylabel("Mean Feature Value")
	  mean_df = pd.concat([df.iloc[:, ind], pd.Series(class_)], axis=1)
	  mean_df.columns = ["values", "classif"]	
	  mean_df.groupby("classif", as_index=False)["values"].mean().loc[:,"values"].plot(kind='bar')
  
	  plt.show()

In [0]:
def dbScan(X, Y, eps = 0.5, min = 5, distance = "euclidean"):
    dbs = DBSCAN(eps=eps, min_samples=min, metric=distance, algorithm="brute", n_jobs=-1)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0, stratify=Y)
    
    pred = dbs.fit_predict(X_train)

    score = silhouette_score(X_train, pred)

    print("Score: {}".format(score))
    #print(confusion_matrix(Y_train, pred))

In [0]:
def optimize_eps():
    neigh = NearestNeighbors(n_neighbors=4)
    nbrs = neigh.fit(X)
    distances, indices = nbrs.kneighbors(X)

    distances = np.sort(distances, axis=0)
    distances = distances[:,1]
    plt.plot(distances)

In [0]:
def optimize_model(X, Y):
    eps = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    for i in eps:
        dbScan(X, Y, eps=i)
    #eps = 0.8
    
    #for i in range(1, 7):
    #    dbScan(X, Y, eps=0.6, min = i)
    #min = 2,3
    
    metric = ["euclidean", "manhattan"]

    #for i in metric:
    #    dbScan(X, Y, eps=0.8, min=1, distance=i)
    #euclidean, chebyshev

In [0]:
def acc(X_train, X_test, Y_train, Y_test):
    model = DBSCAN(eps=0.8, min_samples=2, metric="euclidean", algorithm="brute", n_jobs=-1)

    #pred = model.predict(X_test)
    #print(confusion_matrix(Y_test, pred))
    prediction = model.fit_predict(X_train)

    correct = 0
    for i in range(len(Y_train)):
        if prediction[i] == Y_train[i]:
            correct += 1

    print(correct/len(X_train))

In [233]:
if __name__ == '__main__':

    breast_cancer = datasets.load_breast_cancer()
    X = breast_cancer.data
    Y = breast_cancer.target
    
    X = filter_features(X, [2, 3, 20, 22, 23, 12, 13])

    #vis_all_feat(X, Y)
    X = filter_features(X, [1, 2, 6, 7, 9, 10, 14, 15])
    scaler = MinMaxScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    #optimize_eps()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)
    #acc(X_train, X_test, Y_train, Y_test)
    optimize_model(X, Y)

Score: 0.4394216507301296
Score: 0.5101290973488284
Score: 0.5101290973488284
Score: 0.5101290973488284
Score: 0.5101290973488284
Score: 0.5101290973488284
