In [None]:
## Madeline Hayes ##
## ECE532002 Final Project ##
## Part 1: K-means clustering ##

In [1]:
# package import

import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt 
import pandas as pd
import math
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import seaborn as sns

# set wd

# Folder that holds the wine-quality CSV files on the author's machine.
# Edit this one constant to point at your own copy of the data.
PROJECT_DIR = "\\Users\\madel\\Documents\\Grad School\\Classes\\Fall 2020\\ECE532\\Final Project"

# Only change directory when the folder actually exists, so the notebook
# does not stop with FileNotFoundError on other machines; in that case the
# CSV files are expected to sit in the current working directory.
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
else:
    print('Project folder not found, staying in: ' + os.getcwd())
os.getcwd()

'C:\\Users\\madel\\Documents\\Grad School\\Classes\\Fall 2020\\ECE532\\Final Project'

In [19]:
## note: later cells write a 'label' column into fulldf,
## so if you want to run this code yourself, re-run this block
## to get a fresh fulldf before each iteration (numbered 1-4)

# read both semicolon-separated files and tag every row with its wine color
reddf = pd.read_csv("winequality-red.csv", sep=';').assign(color='red')
whitedf = pd.read_csv("winequality-white.csv", sep=';').assign(color='white')

# stack reds on top of whites and renumber the rows 0..n-1
fulldf = pd.concat([reddf, whitedf], ignore_index=True)

In [None]:
### Hypothesis: Including all features will lead to noisy clustering 
### and a higher error rate than excluding some features

In [3]:
### 1. All Data, No Scaling ###

## this is an unsupervised problem, so the true color label is removed.
## A 'label' column left over from an earlier clustering run is removed too
## (when present), so re-running this cell never feeds old cluster ids
## back into k-means as a feature.
trimdf = (
    fulldf
    .drop(columns='color')
    .drop(columns='label', errors='ignore')
)

In [4]:
## kmeans doesn't handle unscaled data well ##
## to verify that scaling the data is appropriate,
## the problem is first run on the raw, unscaled features ##

unscaled_trimdf = trimdf

# n_clusters=2   : ground truth is two groups, reds and whites
# init='random'  : initial centroids are chosen at random
# n_init=50      : number of separate runs from different starting centroids
#                  (this is NOT the iteration cap; that is max_iter)
# random_state=0 : fixes the random number generator used for initialisation
kmeans = KMeans(n_clusters=2, init='random', n_init=50, random_state=0)
kmeans.fit(unscaled_trimdf)

# summary of the fitted model
print('Cluster Centers: ' , str(kmeans.cluster_centers_))
print('SS distances: ' + str(kmeans.inertia_))
print('Number of Iterations: ' + str(kmeans.n_iter_))

Cluster Centers:  [[7.61753270e+00 4.07887946e-01 2.91205373e-01 3.08405797e+00
  6.56726759e-02 1.84515730e+01 6.36139979e+01 9.94565716e-01
  3.25525274e+00 5.71785083e-01 1.07956816e+01 5.80982679e+00]
 [6.90508451e+00 2.87048800e-01 3.39787350e-01 7.26278626e+00
  4.85997819e-02 3.98373773e+01 1.55951063e+02 9.94797606e-01
  3.19015540e+00 5.00019084e-01 1.02574282e+01 5.82497274e+00]]
SS distances: 8594875.947990824
Number of Iterations: 18


In [5]:
# store the cluster id assigned to every row by run 1
# labels  : array taken from the fitted model
# labels1 : the same ids as a plain list, kept for the evaluation in section 5
labels = kmeans.labels_
labels1 = labels.tolist()

# write the ids next to the true color so the two can be compared
fulldf['label'] = labels1
#fulldf.head()

In [7]:
## check for errors by comparing the 'color' and 'label' columns ##
## cluster ids are arbitrary, so a mapping has to be assumed: Red = 0 ##
## (started with Red = 1 but the error rate was >50%; switching the
## mapping doesn't change the clusters, just the interpretation)

red_id, white_id = 0, 1

# one (row index, true color, cluster id) triple per wine
records = list(zip(fulldf.index.values, fulldf['color'], fulldf['label']))

# row indices split by true color and by whether the cluster id matches it
red_correct = [z for z, color, label in records if color == 'red' and label == red_id]
white_incorrect = [z for z, color, label in records if color == 'white' and label == red_id]
red_incorrect = [z for z, color, label in records if color == 'red' and label == white_id]
white_correct = [z for z, color, label in records if color == 'white' and label == white_id]

# group sizes
n_reds = len(reddf.index)
n_whites = len(whitedf.index)
n_total = len(fulldf.index)

# misclassified fraction per color and overall
red_errors = len(red_incorrect)/n_reds
white_errors = len(white_incorrect)/n_whites
n_errors = (len(red_incorrect)+len(white_incorrect))/n_total

print('Red Error Rate: ' + str(red_errors*100) + '%')
print('White Error Rate: ' + str(white_errors*100) + '%')
print('Full Data Error Rate: ' + str(n_errors*100) + '%')

## true labels (as strings) under the same Red = 0 mapping,
## used for the performance evaluation in section 5

true_code = {'red': '0', 'white': '1'}
true_labels1 = [true_code[color] for color in fulldf['color'] if color in true_code]

Red Error Rate: 5.190744215134459%
White Error Rate: 26.745610453246222%
Full Data Error Rate: 21.440664922271818%


In [9]:
### 2. All Data, Scaled ###

# rescale every feature to the range 0..1 before clustering,
# so that no single column dominates because of its units
X = trimdf

# learn each column's min and max, then apply the rescaling
scaler = MinMaxScaler().fit(X)
scaleddf = pd.DataFrame(scaler.transform(X))

In [10]:
# cluster the scaled features with the same settings as the unscaled run:
# 2 clusters, random starting centroids, 50 restarts, fixed seed
kmeans = KMeans(n_clusters=2, init='random', n_init=50, random_state=0)
kmeans.fit(scaleddf)

# summary of the fitted model
print('Cluster Centers: ' , str(kmeans.cluster_centers_))
print('SS distances: ' + str(kmeans.inertia_))
print('Number of Iterations: ' + str(kmeans.n_iter_))

Cluster Centers:  [[0.29206849 0.18481443 0.18996717 0.09349989 0.08957278 0.10941891
  0.28034535 0.17506207 0.37345947 0.17378814 0.24537775 0.40030422]
 [0.26812159 0.15624969 0.19480099 0.04659765 0.06164338 0.09257737
  0.21328209 0.10477242 0.40512758 0.17642813 0.52788939 0.56974706]]
SS distances: 699.4912445148213
Number of Iterations: 8


In [11]:
# store the cluster id assigned to every row by run 2
# labels  : array taken from the fitted model
# labels2 : the same ids as a plain list, kept for the evaluation in section 5
labels = kmeans.labels_
labels2 = labels.tolist()

# overwrite the previous run's ids with the new ones
fulldf['label'] = labels2
#fulldf.head()

In [13]:
## check for errors by comparing the 'color' and 'label' columns ##
## mapping assumed for this run: Red = 1
## (started with Red = 0 as before, but the error rate was >50% for
## 2/3 metrics (white and total); switching the mapping doesn't change
## the clusters, just the interpretation, so pick the one with less error)

red_id, white_id = 1, 0

# one (row index, true color, cluster id) triple per wine
records = list(zip(fulldf.index.values, fulldf['color'], fulldf['label']))

# row indices split by true color and by whether the cluster id matches it
red_correct = [z for z, color, label in records if color == 'red' and label == red_id]
white_incorrect = [z for z, color, label in records if color == 'white' and label == red_id]
red_incorrect = [z for z, color, label in records if color == 'red' and label == white_id]
white_correct = [z for z, color, label in records if color == 'white' and label == white_id]

# group sizes
n_reds = len(reddf.index)
n_whites = len(whitedf.index)
n_total = len(fulldf.index)

# misclassified fraction per color and overall
red_errors = len(red_incorrect)/n_reds
white_errors = len(white_incorrect)/n_whites
n_errors = (len(red_incorrect)+len(white_incorrect))/n_total

print('Red Error Rate: ' + str(red_errors*100) + '%')
print('White Error Rate: ' + str(white_errors*100) + '%')
print('Full Data Error Rate: ' + str(n_errors*100) + '%')

## true labels (as strings) under the same Red = 1 mapping,
## used for the performance evaluation in section 5

true_code = {'red': '1', 'white': '0'}
true_labels2 = [true_code[color] for color in fulldf['color'] if color in true_code]

Red Error Rate: 62.601626016260155%
White Error Rate: 42.139648836259695%
Full Data Error Rate: 47.17561951670002%


In [15]:
### 3. Remove Some Features, Unscaled ###
## repeat unscaled and scaled comparison
# alcohol content and rating show similar distributions
# between red and white, and might not be predictive

# remove the true color plus quality and alcohol content.
# fulldf also carries the 'label' column written by the previous run;
# drop it when present so old cluster ids are never used as a feature
# (this makes the cell safe to run top-to-bottom without re-importing).
trimdf = (
    fulldf
    .drop(columns=['color', 'quality', 'alcohol'])
    .drop(columns='label', errors='ignore')
)
#trimdf.head()

In [16]:
## kmeans doesn't handle unscaled data well ##
## to verify that scaling the data is appropriate,
## the reduced feature set is first clustered without scaling ##

unscaled_trimdf = trimdf

# same settings as the earlier runs:
# 2 clusters, random starting centroids, 50 restarts, fixed seed
kmeans = KMeans(n_clusters=2, init='random', n_init=50, random_state=0)
kmeans.fit(unscaled_trimdf)

# summary of the fitted model
print('Cluster Centers: ' , str(kmeans.cluster_centers_))
print('SS distances: ' + str(kmeans.inertia_))
print('Number of Iterations: ' + str(kmeans.n_iter_))

Cluster Centers:  [[7.61753890e+00 4.07936704e-01 2.91159830e-01 3.08422914e+00
  6.56803395e-02 1.84478430e+01 6.35975955e+01 9.94566906e-01
  3.25523338e+00 5.71806931e-01]
 [6.90527392e+00 2.87044154e-01 3.39809212e-01 7.26151540e+00
  4.85985282e-02 3.98344235e+01 1.55938539e+02 9.94796626e-01
  3.19018806e+00 5.00021804e-01]]
SS distances: 8581133.746330764
Number of Iterations: 18


In [17]:
# add kmeans cluster labels to df
# labels  : array of cluster ids from the model fitted in run 3
# labels3 : the same ids as a plain list, kept for the evaluation in section 5
# (the two names were previously swapped, which left labels3 as an array
#  and turned 'labels' into a stale list from the previous run)

labels = kmeans.labels_
labels3 = labels.tolist()
fulldf['label'] = labels3
#fulldf.head()

In [18]:
## check for errors by comparing the 'color' and 'label' columns ##
## mapping assumed for this run: Red = 0
## again, the mapping is chosen to minimize errors since the true colors are known

red_id, white_id = 0, 1

# one (row index, true color, cluster id) triple per wine
records = list(zip(fulldf.index.values, fulldf['color'], fulldf['label']))

# row indices split by true color and by whether the cluster id matches it
red_correct = [z for z, color, label in records if color == 'red' and label == red_id]
white_incorrect = [z for z, color, label in records if color == 'white' and label == red_id]
red_incorrect = [z for z, color, label in records if color == 'red' and label == white_id]
white_correct = [z for z, color, label in records if color == 'white' and label == white_id]

# group sizes
n_reds = len(reddf.index)
n_whites = len(whitedf.index)
n_total = len(fulldf.index)

# misclassified fraction per color and overall
red_errors = len(red_incorrect)/n_reds
white_errors = len(white_incorrect)/n_whites
n_errors = (len(red_incorrect)+len(white_incorrect))/n_total

print('Red Error Rate: ' + str(red_errors*100) + '%')
print('White Error Rate: ' + str(white_errors*100) + '%')
print('Full Data Error Rate: ' + str(n_errors*100) + '%')

## true labels (as strings) under the same Red = 0 mapping,
## used for the performance evaluation in section 5

true_code = {'red': '0', 'white': '1'}
true_labels3 = [true_code[color] for color in fulldf['color'] if color in true_code]

Red Error Rate: 5.190744215134459%
White Error Rate: 26.745610453246222%
Full Data Error Rate: 21.440664922271818%


In [20]:
### 4. Remove Some Features, Scaled ###

# as above,
# alcohol content and rating show similar distributions
# between red and white, and might not be predictive

# remove the true color plus quality and alcohol content.
# fulldf also carries the 'label' column written by the previous run;
# drop it when present so old cluster ids are never used as a feature
# (this makes the cell safe to run top-to-bottom without re-importing).
trimdf = (
    fulldf
    .drop(columns=['color', 'quality', 'alcohol'])
    .drop(columns='label', errors='ignore')
)

# rescale every remaining feature to the range 0..1 before clustering,
# so that no single column dominates because of its units

X=trimdf
scaler = MinMaxScaler()
scaler.fit(X)
scaleddf=pd.DataFrame(scaler.transform(X))

In [21]:
# cluster the scaled, reduced feature set with the same settings as before:
# 2 clusters, random starting centroids, 50 restarts, fixed seed
kmeans = KMeans(n_clusters=2, init='random', n_init=50, random_state=0)
kmeans.fit(scaleddf)

# summary of the fitted model
print('Cluster Centers: ' , str(kmeans.cluster_centers_))
print('SS distances: ' + str(kmeans.inertia_))
print('Number of Iterations: ' + str(kmeans.n_iter_))

Cluster Centers:  [[0.25302069 0.1295245  0.20279466 0.08930091 0.06146042 0.1202644
  0.30717106 0.13376085 0.36115965 0.151668  ]
 [0.36793108 0.30083686 0.16016152 0.03027309 0.12697642 0.05051527
  0.09373545 0.18289756 0.4605017  0.24286112]]
SS distances: 410.2867585703211
Number of Iterations: 5


In [22]:
# store the cluster id assigned to every row by run 4
# labels  : array taken from the fitted model
# labels4 : the same ids as a plain list, kept for the evaluation in section 5
labels = kmeans.labels_
labels4 = labels.tolist()

# overwrite the previous run's ids with the new ones
fulldf['label'] = labels4
#fulldf.head()

In [23]:
## check for errors by comparing the 'color' and 'label' columns ##
## mapping assumed for this run: Red = 1
## again, the mapping is chosen to minimize errors since the true colors are known

red_id, white_id = 1, 0

# one (row index, true color, cluster id) triple per wine
records = list(zip(fulldf.index.values, fulldf['color'], fulldf['label']))

# row indices split by true color and by whether the cluster id matches it
red_correct = [z for z, color, label in records if color == 'red' and label == red_id]
white_incorrect = [z for z, color, label in records if color == 'white' and label == red_id]
red_incorrect = [z for z, color, label in records if color == 'red' and label == white_id]
white_correct = [z for z, color, label in records if color == 'white' and label == white_id]

# group sizes
n_reds = len(reddf.index)
n_whites = len(whitedf.index)
n_total = len(fulldf.index)

# misclassified fraction per color and overall
red_errors = len(red_incorrect)/n_reds
white_errors = len(white_incorrect)/n_whites
n_errors = (len(red_incorrect)+len(white_incorrect))/n_total

print('Red Error Rate: ' + str(red_errors*100) + '%')
print('White Error Rate: ' + str(white_errors*100) + '%')
print('Full Data Error Rate: ' + str(n_errors*100) + '%')

## true labels (as strings) for the performance evaluation in section 5.
## They now follow the Red = 1 mapping used above (previously red was
## encoded as '0' here, contradicting the assumption in this cell).
## The section 5 scores do not depend on which id each class gets,
## so those results are unchanged.

true_code = {'red': '1', 'white': '0'}
true_labels4 = [true_code[color] for color in fulldf['color'] if color in true_code]

Red Error Rate: 2.313946216385241%
White Error Rate: 1.8579011841567987%
Full Data Error Rate: 1.970140064645221%


In [41]:
### 5. Evaluating Performance ###

## score every run with homogeneity, completeness, and v-measure
## each score compares a list of true labels with the predicted labels;
## the lists are kept per run because the 0/1 mapping differed between runs

from sklearn import metrics

its = [1, 2, 3, 4]

# (true labels, predicted labels) for each run, in iteration order
runs = [
    (true_labels1, labels1),  # iteration 1: all features, unscaled
    (true_labels2, labels2),  # iteration 2: all features, scaled
    (true_labels3, labels3),  # iteration 3: selected features, unscaled
    (true_labels4, labels4),  # iteration 4: selected features, scaled
]

# one score per run for each of the three metrics
homs = [metrics.homogeneity_score(truth, predicted) for truth, predicted in runs]
coms = [metrics.completeness_score(truth, predicted) for truth, predicted in runs]
vscores = [metrics.v_measure_score(truth, predicted) for truth, predicted in runs]

# keep the individual per-run names available as well
it1hom, it2hom, it3hom, it4hom = homs
it1com, it2com, it3com, it4com = coms
it1v, it2v, it3v, it4v = vscores

# summary table: one row per iteration
vscoredf = pd.DataFrame({
    'Iteration': its,
    'Homogeneity': homs,
    'Completeness': coms,
    'V-Score': vscores,
})
vscoredf.head()


Unnamed: 0,Iteration,Homogeneity,Completeness,V-Score
0,1,0.352393,0.287212,0.316481
1,2,0.001556,0.001283,0.001406
2,3,0.352393,0.287212,0.316481
3,4,0.842913,0.829359,0.836081
