# Project 3
Who knew we'd survive this long?

## Expectation Maximization
This section uses scikit-learn's `GaussianMixture`, which fits a Gaussian mixture model by expectation maximization.

In [7]:
from sklearn.mixture import GaussianMixture
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import itertools
import operator
from sklearn import datasets

In [8]:
scaler = MinMaxScaler()


def scale_and_train(seed, sample_size, data, n_mixtures, dataset, labels=None):
    """Split the data, min-max scale it, and fit a Gaussian mixture on the training part.

    Args:
        seed: Random state used for both the train/test split and the mixture.
        sample_size: Forwarded as ``test_size`` to ``train_test_split``, i.e. the
            share of rows held out for testing (not the training share).
        data: DataFrame of samples. When ``dataset == 'bc'`` it must contain a
            ``diagnosis`` column, from which the labels are derived.
        n_mixtures: Number of mixture components to fit.
        dataset: ``'bc'`` derives features and labels from ``data``; any other
            value uses ``data`` as the features and ``labels`` as the targets.
        labels: Targets for the non-``'bc'`` case.

    Returns:
        Tuple ``(model, x_train_scaled, x_test_scaled, y_train, y_test)``.
    """
    if dataset != 'bc':
        features, targets = data, labels
    else:
        # Every column other than 'diagnosis' is used as a feature.
        # NOTE(review): if the CSV carries an identifier column it is included
        # as a feature too -- confirm against the file's schema.
        feature_mask = data.columns != 'diagnosis'
        features = data.loc[:, feature_mask]
        # 'M' maps to 1, every other diagnosis value maps to 0.
        targets = np.where(data['diagnosis'] == 'M', 1, 0)

    split = train_test_split(features, targets, test_size=sample_size, random_state=seed)
    x_train, x_test, y_train, y_test = split

    # The module-level scaler is re-fitted here on every call, so after this
    # function returns it reflects only the most recent training split.
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    mixture = GaussianMixture(n_components=n_mixtures, random_state=seed)
    mixture.fit(x_train_scaled)
    return mixture, x_train_scaled, x_test_scaled, y_train, y_test

In [39]:

def scale_train_test(seed, sample_size, data, n_mixtures, dataset, labels=None, relable_fn=None):
    """Train a mixture via ``scale_and_train`` and score it on the held-out split.

    Args:
        seed, sample_size, data, n_mixtures, dataset, labels: Passed straight
            through to ``scale_and_train``.
        relable_fn: Optional callable applied to the raw predicted component
            indices before scoring. When ``None`` the raw indices are compared
            with the labels as-is.

    Returns:
        Fraction of test samples whose (optionally relabelled) prediction
        equals the corresponding label.
    """
    model, _, x_test_scaled, _, y_test = scale_and_train(
        seed, sample_size, data, n_mixtures, dataset, labels=labels
    )
    raw_predictions = model.predict(x_test_scaled)
    predictions = raw_predictions if relable_fn is None else relable_fn(raw_predictions)

    mismatches = error_indices(predictions, y_test)
    total = len(predictions)
    return (total - len(mismatches)) / total

In [9]:
def error_indices(predictions, y_test):
    """List every position where a prediction differs from the true label.

    The comparison is positional: element ``i`` of ``predictions`` is compared
    with element ``i`` of ``y_test`` regardless of any pandas index the inputs
    may carry.

    Args:
        predictions: Sequence of predicted labels (list, ndarray, or Series).
        y_test: Sequence of true labels with the same length.

    Returns:
        A list of dicts ``{'index': i, 'p': predicted, 'a': actual}``, one per
        mismatch, in ascending order of ``i``.

    Raises:
        ValueError: If the two inputs do not have the same length.
    """
    if len(predictions) != len(y_test):
        raise ValueError(
            "predictions and y_test must have the same length "
            f"(got {len(predictions)} and {len(y_test)})"
        )
    # Iterating (rather than subscripting with ``[i]``) walks the values in
    # order, so a Series whose index is not 0..n-1 is still compared by position.
    return [
        {'index': i, 'p': predicted, 'a': actual}
        for i, (predicted, actual) in enumerate(zip(predictions, y_test))
        if predicted != actual
    ]

### Breast Cancer
This section analyzes the breast cancer dataset.

In [10]:
# Load the breast cancer data; the path is resolved relative to the notebook's
# working directory. scale_and_train later reads a 'diagnosis' column from this
# frame, so the file's header row must provide it.
bc_data = pd.read_csv("../resources/breast-cancer/wdbc.data.csv")

In [11]:
# seed=12345; .65 is passed on as test_size, so 65% of the rows are held out
# for testing and 35% are used for fitting; 2 mixture components.
bc_em, bc_x_train_scaled, bc_x_test_scaled, bc_y_train, bc_y_test = scale_and_train(12345, .65, bc_data, 2, 'bc')

In [12]:
# Assign each held-out sample to one of the fitted mixture components.
bc_predictions = bc_em.predict(bc_x_test_scaled)

In [13]:
# True labels of the held-out samples (1 where diagnosis == 'M', else 0).
bc_y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,

In [14]:
# Component index predicted for each held-out sample, in the same order as bc_y_test.
bc_predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,

In [15]:
# Number of held-out samples whose prediction differs from the true label.
# Counting mismatches directly stays correct for any label values, whereas a
# sum of squared differences equals the error count only for 0/1 labels.
np.count_nonzero(bc_predictions != bc_y_test)

23

In [16]:
# Collect every test position where the prediction disagrees with the label;
# in each entry 'p' is the predicted value and 'a' is the actual one.
bc_diverging_indecies = error_indices(bc_predictions, bc_y_test)
bc_diverging_indecies

[{'index': 20, 'p': 0, 'a': 1},
 {'index': 42, 'p': 0, 'a': 1},
 {'index': 52, 'p': 0, 'a': 1},
 {'index': 61, 'p': 0, 'a': 1},
 {'index': 75, 'p': 0, 'a': 1},
 {'index': 82, 'p': 0, 'a': 1},
 {'index': 89, 'p': 0, 'a': 1},
 {'index': 113, 'p': 0, 'a': 1},
 {'index': 128, 'p': 1, 'a': 0},
 {'index': 140, 'p': 0, 'a': 1},
 {'index': 143, 'p': 1, 'a': 0},
 {'index': 179, 'p': 1, 'a': 0},
 {'index': 186, 'p': 0, 'a': 1},
 {'index': 192, 'p': 1, 'a': 0},
 {'index': 212, 'p': 0, 'a': 1},
 {'index': 224, 'p': 0, 'a': 1},
 {'index': 252, 'p': 1, 'a': 0},
 {'index': 257, 'p': 0, 'a': 1},
 {'index': 269, 'p': 0, 'a': 1},
 {'index': 293, 'p': 0, 'a': 1},
 {'index': 309, 'p': 0, 'a': 1},
 {'index': 330, 'p': 1, 'a': 0},
 {'index': 334, 'p': 0, 'a': 1}]

In [17]:
# Number of held-out samples that were scored.
len(bc_predictions)


370

Prediction accuracy on the held-out test set:

In [18]:
# Accuracy = (test samples - mismatches) / test samples.
(len(bc_predictions)-len(bc_diverging_indecies))/len(bc_predictions)


0.9378378378378378

In [41]:
# Same pipeline in one call (split, scale, fit, predict, score) with the same
# seed and split share as the step-by-step cells above.
scale_train_test(12345, .65, bc_data, 2, 'bc')


0.9378378378378378

### Iris Dataset
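This section analyzes the Iris dataset. The mixture model numbers its components without reference to the class ids, so the raw accuracy reported below compares unaligned labels: the errors visible in the listing are swaps between labels 1 and 2.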

In [19]:
# Load iris and wrap the feature matrix in a DataFrame with named columns.
# The class labels stay in ir_data_map.target and are passed separately later.
ir_data_map = datasets.load_iris()
ir_data = pd.DataFrame(data=ir_data_map.data, columns=ir_data_map.feature_names)

In [20]:
# Same seed and 65% test share as the breast cancer run, but with 3 mixture
# components. Because dataset is not 'bc', the labels are supplied explicitly.
ir_em, ir_x_train_scaled, ir_x_test_scaled, ir_y_train, ir_y_test = scale_and_train(12345, .65, ir_data, 3, 'ir', labels=ir_data_map.target)

In [33]:
# Assign each held-out iris sample to one of the three fitted components.
ir_predictions = ir_em.predict(ir_x_test_scaled)

In [34]:
# Positions where the predicted component index differs from the class label.
ir_diverging_indecies = error_indices(ir_predictions, ir_y_test)

In [35]:
# Show the mismatches ('p' = predicted component index, 'a' = actual class label).
ir_diverging_indecies

[{'index': 0, 'p': 2, 'a': 1},
 {'index': 2, 'p': 2, 'a': 1},
 {'index': 8, 'p': 2, 'a': 1},
 {'index': 9, 'p': 1, 'a': 2},
 {'index': 11, 'p': 1, 'a': 2},
 {'index': 12, 'p': 2, 'a': 1},
 {'index': 14, 'p': 1, 'a': 2},
 {'index': 15, 'p': 1, 'a': 2},
 {'index': 17, 'p': 1, 'a': 2},
 {'index': 18, 'p': 2, 'a': 1},
 {'index': 19, 'p': 2, 'a': 1},
 {'index': 21, 'p': 2, 'a': 1},
 {'index': 23, 'p': 2, 'a': 1},
 {'index': 25, 'p': 1, 'a': 2},
 {'index': 27, 'p': 2, 'a': 1},
 {'index': 29, 'p': 1, 'a': 2},
 {'index': 32, 'p': 1, 'a': 2},
 {'index': 33, 'p': 1, 'a': 2},
 {'index': 34, 'p': 2, 'a': 1},
 {'index': 36, 'p': 1, 'a': 2},
 {'index': 37, 'p': 2, 'a': 1},
 {'index': 39, 'p': 2, 'a': 1},
 {'index': 41, 'p': 2, 'a': 1},
 {'index': 43, 'p': 1, 'a': 2},
 {'index': 44, 'p': 2, 'a': 1},
 {'index': 45, 'p': 2, 'a': 1},
 {'index': 47, 'p': 2, 'a': 1},
 {'index': 48, 'p': 2, 'a': 1},
 {'index': 49, 'p': 2, 'a': 1},
 {'index': 50, 'p': 1, 'a': 2},
 {'index': 52, 'p': 2, 'a': 1},
 {'index': 5

In [36]:
# Raw agreement between predicted component indices and class labels.
# NOTE(review): no relabelling is applied before this comparison, so a low
# value here may reflect a permutation of component ids rather than poor
# clustering -- confirm by mapping components to classes first.
(len(ir_predictions)-len(ir_diverging_indecies))/len(ir_predictions)

0.3877551020408163

In [40]:
# Same pipeline in one call. relable_fn is left at its default of None, so the
# raw component indices are scored against the class labels unchanged.
scale_train_test(12345, .65, ir_data, 3, 'ir', labels=ir_data_map.target)

0.3877551020408163