In [1]:
from os.path import join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

%matplotlib widget

matplotlib.rc('font', size=16) 
matplotlib.rc('xtick', labelsize=14) 
matplotlib.rc('ytick', labelsize=14)

# 1) Data exploration

In [2]:
filepath = join("data", "fruits.csv")

# load the dataframe
fruits = pd.read_csv(filepath)

# show the dataframe
fruits.tail()

Unnamed: 0,size,weight,smoothness,red,green,blue,label
1019,5.7,94.3,9.5,180,37,1,apple
1020,7.1,92.9,9.4,223,32,3,apple
1021,7.1,81.8,9.6,223,13,2,apple
1022,3.3,154.5,7.2,224,102,7,orange
1023,7.0,97.8,9.6,235,36,10,apple


In [3]:
# summarize some statistical quantities
fruits.describe()

Unnamed: 0,size,weight,smoothness,red,green,blue
count,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0
mean,6.940527,112.49209,8.488965,220.481445,69.402344,4.073242
std,1.818824,31.632433,1.112475,24.809334,48.036625,3.207321
min,1.8,50.4,5.3,110.0,1.0,0.0
25%,5.7,84.775,7.5,209.0,30.0,2.0
50%,6.9,102.45,9.1,228.0,46.0,3.0
75%,8.2,140.625,9.5,238.0,107.0,6.0
max,13.3,186.7,10.0,255.0,213.0,18.0


In [4]:
# size histogram
plt.figure()
plt.hist(fruits["size"], density=True, bins=32, color=(0.3, 0.7, 0.9))
plt.xlabel("Size (cm)")
plt.ylabel("Frequency")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [5]:
# weight histogram
plt.figure()
plt.hist(fruits["weight"], density=True, bins=32, color=(0.3, 0.7, 0.9))
plt.xlabel("Weight (g)")
plt.ylabel("Frequency")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [6]:
# smoothness histogram
plt.figure()
plt.hist(fruits["smoothness"], density=True, bins=32, color=(0.3, 0.7, 0.9))
plt.xlabel("Smoothness (a.u.)")
plt.ylabel("Frequency")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [7]:
# redness histogram
plt.figure()
plt.hist(fruits["red"], density=True, bins=32, color=(0.3, 0.7, 0.9))
plt.xlabel("Red Channel")
plt.ylabel("Frequency")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [8]:
# greenness histogram
plt.figure()
plt.hist(fruits["green"], density=True, bins=32, color=(0.3, 0.7, 0.9))
plt.xlabel("Green Channel")
plt.ylabel("Frequency")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [9]:
# blueness histogram
plt.figure()
plt.hist(fruits["blue"], density=True, bins=16, color=(0.3, 0.7, 0.9))
plt.xlabel("Blue Channel")
plt.ylabel("Frequency")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# 2) Preprocessing

In [10]:
# split the data in training and testing
train, test = train_test_split(fruits, test_size=0.2, random_state=1)

# separate labels and features
train_label = train.pop("label")
test_label = test.pop("label")
# convert the labels to 0s and 1s
label_encoder = LabelEncoder()
label_encoder.fit(["orange", "apple"])
train_label = label_encoder.transform(train_label)
test_label = label_encoder.transform(test_label)

In [11]:
# scale the training and test data
standard_scaler = StandardScaler()
train_scaled = standard_scaler.fit_transform(train) 
test_scaled = standard_scaler.transform(test)

# append the train and test data for plotting
full_scaled = np.concatenate((train_scaled, test_scaled), axis=0)

# get the number of features
n_f = full_scaled.shape[1]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [12]:
# show training data
train.tail()

Unnamed: 0,size,weight,smoothness,red,green,blue
767,3.6,148.3,7.9,238,116,7
72,2.9,178.1,7.2,246,53,4
908,11.3,136.2,7.8,240,172,1
235,6.1,94.2,9.3,228,47,11
37,7.2,81.1,9.5,219,29,1


In [13]:
print(f"The training data has {train_scaled.shape[0]} samples and {train_scaled.shape[1]} features.")

The training data has 819 samples and 6 features.


In [14]:
# show the testing data
test.tail()

Unnamed: 0,size,weight,smoothness,red,green,blue
770,7.5,125.6,7.8,234,158,6
582,5.4,122.3,7.2,224,30,8
1009,7.6,87.9,10.0,168,28,0
267,3.4,121.9,6.6,227,148,4
974,5.4,92.9,9.8,220,13,6


In [15]:
print(f"The test data has {test_scaled.shape[0]} samples and {test_scaled.shape[1]} features.")

The test data has 205 samples and 6 features.


In [16]:
# histogram of scaled redness
plt.figure()
plt.hist(full_scaled[:, fruits.columns[:-1]=="red"], density=True, bins=32, color=(0.3, 0.7, 0.9))
plt.xlabel("Red Channel (scaled)")
plt.ylabel("Frequency")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [17]:
# histogram of scaled smoothness
plt.figure()
plt.hist(full_scaled[:, fruits.columns[:-1]=="smoothness"], density=True, bins=32, color=(0.3, 0.7, 0.9))
plt.xlabel("Smoothness (scaled)")
plt.ylabel("Frequency")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [18]:
# initialize the PCA model
pca = PCA(n_components=n_f)

# fit the PCA model to the training data
train_pca = pca.fit_transform(train_scaled)
# transform the test data
test_pca = pca.transform(test_scaled)

# get the explained variance ratios
exp_var_n2 = np.sum(pca.explained_variance_ratio_[:2])

print(f"The first two principle components explain {exp_var_n2 * 100:.1f}% of the variance in the data")

The first two principle components explain 64.8% of the variance in the data


In [19]:
# only consider the frist two principal components
train_pca = train_pca[:, :2]
test_pca = test_pca[:, :2]

In [20]:
# show the weight of the features in the components
loadings = pd.DataFrame(pca.components_[:2].T, columns=['PC1', 'PC2'], index=train.columns)
# save the loading
filepath = join("data", "pca_loadings.csv")
loadings.to_csv(filepath, index=True)

# show the loadings
loadings

Unnamed: 0,PC1,PC2
size,0.001397,-0.687627
weight,-0.525131,-0.002579
smoothness,0.538777,8.3e-05
red,-0.413465,0.027993
green,-0.512668,-0.00306
blue,0.013186,0.725513


In [21]:
# scatter plot of the first two features
plt.figure()
plt.scatter(fruits["size"], fruits["weight"],
           edgecolor=(0.2, 0.2, 0.2),
            facecolor=(0.8, 0.8, 0.8), 
            s=20
           )
plt.xlabel("Size (cm)")
plt.ylabel("Weight (g)")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [22]:
# scatter plot of the first three features
fig = plt.figure()
ax = fig.gca(projection="3d")
ax.scatter(fruits["size"], fruits["weight"], fruits["smoothness"],
            edgecolor=(0.2, 0.2, 0.2),
            facecolor=(0.8, 0.8, 0.8), 
            s=20)
ax.view_init(45, 45)
ax.set_xlabel("Size (cm)", labelpad=10)
ax.set_ylabel("Weight (g)", labelpad=10)
ax.set_zlabel("Smoothness (a.u.)", labelpad=10)
ax.set_yticks([60, 100, 140, 180])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.axis.XTick at 0x11be4500518>,
 <matplotlib.axis.XTick at 0x11be44e9b70>,
 <matplotlib.axis.XTick at 0x11be451d8d0>,
 <matplotlib.axis.XTick at 0x11be451dda0>]

In [23]:
print(f"The engineered training data has {train_pca.shape[0]} samples and {train_pca.shape[1]} features.")

The engineered training data has 819 samples and 2 features.


In [24]:
# merge the transformed training and test data for plotting
full_pca = np.concatenate((train_pca, test_pca), axis=0)

# plot the dataset along the principal axis
plt.figure()
plt.scatter(full_pca[:, 0], full_pca[:, 1], 
            edgecolor=(0.2, 0.2, 0.2),
            facecolor=(0.8, 0.8, 0.8), 
            s=20)
plt.xlabel("$X_{PC_1}$")
plt.ylabel("$X_{PC_2}$")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# 3) Machine Learning (Unsupervised)

In [32]:
# show the training samples
plt.figure()
plt.scatter(train_pca[:, 0], train_pca[:, 1], 
            edgecolor=(0.2, 0.2, 0.2),
            facecolor=(0.8, 0.8, 0.8), 
            s=20)
plt.xlabel("$X_{PC_1}$")
plt.ylabel("$X_{PC_2}$")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [33]:
# define maximum number of clusters
n_cluster_max = 8
n_cluster = np.arange(1, n_cluster_max + 1)
scores = np.empty(n_cluster_max)

for ii in range(n_cluster_max):
    # create model
    k_means = KMeans(n_cluster[ii])
    # fit the model to the data
    k_means.fit(train_pca)
    # score the model
    scores[ii] = k_means.score(train_pca)

# plot scores to apply elbow method
plt.figure()
plt.scatter(n_cluster, scores, 
            s=60,
            edgecolor=(1, 0.4, 0.2),
            facecolor=(0.3, 0.7, 0.9), 
            zorder=10)
plt.plot(n_cluster, scores, 
         color=(0.75, 0.9, 1), 
         linestyle="--",
         zorder=5)
plt.xlabel("$n_\\mathrm{cluster}$")
plt.ylabel("Scores")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [34]:
# fit the model to the training data
k_means = KMeans(2)
k_means.fit(train_pca)

# get the k-means
cluster_centers = k_means.cluster_centers_

In [45]:
# predict the training labels
train_label_predict = k_means.predict(train_pca)

# plot the transformed train data
train_pca_0 = train_pca[train_label_predict==0, :]
train_pca_1 = train_pca[train_label_predict==1, :]

# color to mark wrong labels
color_1 = (0.3, 0.7, 0.9)
color_0 = (1, 0.4, 0.2)
facecolor_0 = np.copy(color_0)
facecolor_1 = np.copy(color_1)

# plot the clustered training samples 
plt.figure()
plt.scatter(train_pca_0[:, 0], train_pca_0[:, 1], facecolor=facecolor_0, s=20)
plt.scatter(train_pca_1[:, 0], train_pca_1[:, 1], facecolor=facecolor_1, s=20)
plt.scatter(cluster_centers[0, 0], cluster_centers[0, 1], facecolor=(0, 0, 0), marker="x", s=100)
plt.scatter(cluster_centers[1, 0], cluster_centers[1, 1], facecolor=(0, 0, 0), marker="x", s=100)
plt.xlabel("$X_{PC_1}$")
plt.ylabel("$X_{PC_2}$")
plt.tight_layout()

  from ipykernel import kernelapp as app


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [38]:
# predict the labels of the test data
test_label_predict = k_means.predict(test_pca)

# plot the transformed train data
test_pca_0 = test_pca[test_label_predict==0, :]
test_pca_1 = test_pca[test_label_predict==1, :]

plt.figure()
plt.scatter(test_pca[:, 0], test_pca[:, 1], 
            edgecolor=(0.2, 0.2, 0.2),
            facecolor=(0.8, 0.8, 0.8), 
            s=20)
plt.xlabel("$X_{PC_1}$")
plt.ylabel("$X_{PC_2}$")
plt.tight_layout()

  


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [44]:
# color to mark wrong labels
color_1 = (0.3, 0.7, 0.9)
color_0 = (1, 0.4, 0.2)
facecolor_0 = np.copy(color_0)
facecolor_1 = np.copy(color_1)

# plot the clustered test samples
plt.figure()
plt.scatter(test_pca_0[:, 0], test_pca_0[:, 1], facecolor=facecolor_0, s=20)
plt.scatter(test_pca_1[:, 0], test_pca_1[:, 1], facecolor=facecolor_1, s=20)
plt.scatter(cluster_centers[0, 0], cluster_centers[0, 1], facecolor=(0, 0, 0), marker="x", s=100)
plt.scatter(cluster_centers[1, 0], cluster_centers[1, 1], facecolor=(0, 0, 0), marker="x", s=100)
plt.xlabel("$X_{PC_1}$")
plt.ylabel("$X_{PC_2}$")
plt.tight_layout()

  


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …