# Project 3
Who knew we'd survive this long?

## K-Means Clustering

In [1]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import itertools
import operator
from sklearn import datasets

In [12]:
# Shared min-max scaler. scale_and_train refits it on every call, so after a
# call it holds the ranges of that call's training split only.
scaler = MinMaxScaler()

def scale_and_train(seed, sample_size, data, n_clusters, dataset, labels=None):
    """Split the data, min-max scale it, and fit K-Means on the training part.

    Parameters
    ----------
    seed : int
        Random state used for both the train/test split and K-Means.
    sample_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``; it sizes the
        held-out *test* split, not the training split.
    data : pandas.DataFrame
        Feature table. When ``dataset == 'bc'`` it must also contain a
        ``'diagnosis'`` column, which is dropped from the features and
        converted into the labels (``'M'`` -> 1, anything else -> 0).
    n_clusters : int
        Number of clusters for K-Means.
    dataset : str
        ``'bc'`` selects the breast-cancer handling described above; any
        other value uses ``data`` as the features and ``labels`` as targets.
    labels : array-like, optional
        Targets aligned with the rows of ``data``; only used when
        ``dataset != 'bc'``.

    Returns
    -------
    tuple
        ``(km, x_train_scaled, x_test_scaled, y_train, y_test)`` where ``km``
        is the K-Means model fitted on the scaled training features.
    """
    if dataset == 'bc':
        x = data.loc[:, data.columns != 'diagnosis']
        # Derive the labels from the frame that was passed in, not from the
        # notebook-level bc_data global, so features and labels always come
        # from the same rows.
        y = np.where(data['diagnosis'] == 'M', 1, 0)
    else:
        x = data
        y = labels
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=sample_size, random_state=seed
    )
    # Fit the scaler on the training split only; the test split reuses the
    # training min/max so no test information leaks into the scaling.
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    km = KMeans(n_clusters=n_clusters, random_state=seed).fit(x_train_scaled)
    return km, x_train_scaled, x_test_scaled, y_train, y_test

In [3]:
def error_indices(predictions, y_test):
    """Return one record per position where prediction and truth disagree.

    The two sequences are compared position by position (first with first,
    second with second, ...). Iterating instead of indexing with ``[i]``
    keeps the comparison positional even for a pandas Series whose index is
    not 0..n-1, e.g. after a shuffled train/test split.

    Parameters
    ----------
    predictions : sequence
        Predicted labels.
    y_test : sequence
        True labels, at least as long as ``predictions``.

    Returns
    -------
    list of dict
        ``{'index': position, 'p': predicted, 'a': actual}`` for every
        mismatching position, in ascending position order.

    Raises
    ------
    IndexError
        If ``y_test`` has fewer elements than ``predictions``.
    """
    if len(y_test) < len(predictions):
        raise IndexError(
            "y_test has fewer elements than predictions "
            f"({len(y_test)} < {len(predictions)})"
        )
    return [
        {'index': position, 'p': predicted, 'a': actual}
        for position, (predicted, actual) in enumerate(zip(predictions, y_test))
        if predicted != actual
    ]

### Breast Cancer
This section analyzes the breast cancer dataset.

In [4]:
# Path is relative to the notebook's working directory. scale_and_train later
# reads a 'diagnosis' column from this frame.
BC_DATA_PATH = "../resources/breast-cancer/wdbc.data.csv"
bc_data = pd.read_csv(BC_DATA_PATH)

In [13]:
# Two clusters for the two diagnosis classes. Note that sample_size is used as
# the test fraction, so 65% of the rows are held out for evaluation.
(bc_km, bc_x_train_scaled, bc_x_test_scaled, bc_y_train, bc_y_test) = scale_and_train(
    seed=12345,
    sample_size=.65,
    data=bc_data,
    n_clusters=2,
    dataset='bc',
)


In [14]:
# Cluster id that the fitted model assigned to each training row.
bc_km.labels_

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0], dtype=int32)

In [15]:
bc_test_clusters = bc_km.predict(bc_x_test_scaled)

# K-Means numbers its clusters arbitrarily, so a raw cluster id is not a class
# label. Translate each cluster id into the label that is most common among
# that cluster's training rows before comparing against bc_y_test.
bc_cluster_to_label = np.array([
    np.bincount(bc_y_train[bc_km.labels_ == cluster]).argmax()
    for cluster in range(bc_km.n_clusters)
])
bc_predictions = bc_cluster_to_label[bc_test_clusters]

In [16]:
# True labels of the held-out rows (1 = diagnosis 'M', 0 = anything else).
bc_y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,

In [17]:
# Predicted value for each held-out row, in the same row order as bc_y_test.
bc_predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,

In [18]:
# Both arrays hold only 0/1 values, so each squared difference is 1 for a
# mismatch and 0 for a match; the sum is the number of misclassified rows.
np.sum(np.square(bc_predictions - bc_y_test))

33

In [19]:
# Collect every test position where the prediction and the true label differ
# ('p' = predicted, 'a' = actual), then display the list.
bc_diverging_indecies = error_indices(bc_predictions, bc_y_test)
bc_diverging_indecies

[{'index': 19, 'p': 0, 'a': 1},
 {'index': 20, 'p': 0, 'a': 1},
 {'index': 41, 'p': 0, 'a': 1},
 {'index': 42, 'p': 0, 'a': 1},
 {'index': 49, 'p': 0, 'a': 1},
 {'index': 56, 'p': 0, 'a': 1},
 {'index': 64, 'p': 0, 'a': 1},
 {'index': 75, 'p': 0, 'a': 1},
 {'index': 78, 'p': 1, 'a': 0},
 {'index': 82, 'p': 0, 'a': 1},
 {'index': 89, 'p': 0, 'a': 1},
 {'index': 95, 'p': 0, 'a': 1},
 {'index': 113, 'p': 0, 'a': 1},
 {'index': 128, 'p': 1, 'a': 0},
 {'index': 131, 'p': 1, 'a': 0},
 {'index': 133, 'p': 0, 'a': 1},
 {'index': 140, 'p': 0, 'a': 1},
 {'index': 152, 'p': 1, 'a': 0},
 {'index': 165, 'p': 0, 'a': 1},
 {'index': 166, 'p': 1, 'a': 0},
 {'index': 179, 'p': 1, 'a': 0},
 {'index': 186, 'p': 0, 'a': 1},
 {'index': 203, 'p': 0, 'a': 1},
 {'index': 212, 'p': 0, 'a': 1},
 {'index': 224, 'p': 0, 'a': 1},
 {'index': 250, 'p': 1, 'a': 0},
 {'index': 268, 'p': 0, 'a': 1},
 {'index': 271, 'p': 0, 'a': 1},
 {'index': 281, 'p': 1, 'a': 0},
 {'index': 309, 'p': 0, 'a': 1},
 {'index': 312, 'p': 0

In [20]:
# Number of held-out rows that were scored.
len(bc_predictions)


370

Prediction accuracy: the fraction of test samples whose predicted label matches the true label.

In [21]:
# Accuracy = correctly labelled test rows / all test rows.
bc_n_scored = len(bc_predictions)
bc_n_correct = bc_n_scored - len(bc_diverging_indecies)
bc_n_correct / bc_n_scored



0.9108108108108108

### Iris Dataset
This section analyzes the Iris dataset.

In [26]:
# Load scikit-learn's bundled Iris data and wrap the feature matrix in a
# DataFrame whose columns carry the feature names. The class labels stay in
# ir_data_map.target.
ir_data_map = datasets.load_iris()
ir_data = pd.DataFrame(ir_data_map.data, columns=ir_data_map.feature_names)

In [27]:
# Three clusters for the three iris classes. Note that sample_size is used as
# the test fraction, so 65% of the rows are held out for evaluation.
(ir_km, ir_x_train_scaled, ir_x_test_scaled, ir_y_train, ir_y_test) = scale_and_train(
    seed=12345,
    sample_size=.65,
    data=ir_data,
    n_clusters=3,
    dataset='ir',
    labels=ir_data_map.target,
)

In [39]:
ir_predictions_prime = ir_km.predict(ir_x_test_scaled)

# K-Means numbers its clusters arbitrarily, so a hand-written id swap only
# holds for one particular initialisation. Instead, map each cluster id to
# the class that is most common among that cluster's training rows, and
# translate the raw test predictions through that mapping.
ir_cluster_to_class = np.array([
    np.bincount(ir_y_train[ir_km.labels_ == cluster]).argmax()
    for cluster in range(ir_km.n_clusters)
])
ir_predictions = ir_cluster_to_class[ir_predictions_prime].tolist()

In [40]:
# Collect every test position where the prediction and the true class differ
# ('p' = predicted, 'a' = actual).
ir_diverging_indecies = error_indices(ir_predictions, ir_y_test)

In [41]:
# Show the misclassified test rows.
ir_diverging_indecies

[{'index': 9, 'p': 1, 'a': 2},
 {'index': 29, 'p': 1, 'a': 2},
 {'index': 32, 'p': 1, 'a': 2},
 {'index': 33, 'p': 1, 'a': 2},
 {'index': 43, 'p': 1, 'a': 2},
 {'index': 49, 'p': 2, 'a': 1},
 {'index': 77, 'p': 2, 'a': 1},
 {'index': 88, 'p': 2, 'a': 1},
 {'index': 96, 'p': 1, 'a': 2}]

In [45]:
# Accuracy = correctly labelled test rows / all test rows.
ir_n_scored = len(ir_predictions)
ir_n_correct = ir_n_scored - len(ir_diverging_indecies)
ir_n_correct / ir_n_scored

0.9081632653061225