In [2]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

## Fish Catch Dataset

In [3]:
# Dataset from http://ww2.amstat.org/publications/jse/jse_data_archive.htm
columns = ['Obs', 'Species', 'Weight', 'Length1', 'Length2', 'Length3', 'Height%', 'Width%', 'Sex']
fish = pd.read_csv('http://ww2.amstat.org/publications/jse/datasets/fishcatch.dat.txt', 
                 delim_whitespace=True, index_col=0, names=columns)
fish.head()

Unnamed: 0_level_0,Species,Weight,Length1,Length2,Length3,Height%,Width%,Sex
Obs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,242.0,23.2,25.4,30.0,38.4,13.4,
2,1,290.0,24.0,26.3,31.2,40.0,13.8,
3,1,340.0,23.9,26.5,31.1,39.8,15.1,
4,1,363.0,26.3,29.0,33.5,38.0,13.3,
5,1,430.0,26.5,29.0,34.0,36.6,15.1,


In [5]:
fish.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159 entries, 1 to 159
Data columns (total 8 columns):
Species    159 non-null int64
Weight     158 non-null float64
Length1    159 non-null float64
Length2    159 non-null float64
Length3    159 non-null float64
Height%    159 non-null float64
Width%     159 non-null float64
Sex        72 non-null float64
dtypes: float64(7), int64(1)
memory usage: 11.2 KB


In [3]:
# Drop NaN column
fish.drop('Sex', axis=1, inplace=True)
fish.head()

Unnamed: 0_level_0,Species,Weight,Length1,Length2,Length3,Height%,Width%
Obs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,242.0,23.2,25.4,30.0,38.4,13.4
2,1,290.0,24.0,26.3,31.2,40.0,13.8
3,1,340.0,23.9,26.5,31.1,39.8,15.1
4,1,363.0,26.3,29.0,33.5,38.0,13.3
5,1,430.0,26.5,29.0,34.0,36.6,15.1


In [4]:
# Drop NaN values
fish.dropna(inplace=True)

In [16]:
# DataFrame to NumPy Array
samples = fish.iloc[:, 1:].values
species = fish.iloc[:, 0].values
samples.shape, species.shape

((158, 6), (158,))

## Scaling fish data for clustering

In [17]:
# Create scaler
scaler = StandardScaler()

In [28]:
# Create KMeans instance
kmeans = KMeans(n_clusters=7)

In [29]:
# Create pipeline
pipeline = make_pipeline(scaler, kmeans)

## Clustering the fish data

In [30]:
# Fit the pipeline to samples
pipeline.fit(samples)

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kmeans', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=7, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0))])

In [31]:
# Calculate the cluster labels
labels = pipeline.predict(samples)

In [32]:
# Create a DataFrame with labels and species as columns
df = pd.DataFrame({'labels': labels, 'species': species})
df.head()

Unnamed: 0,labels,species
0,5,1
1,5,1
2,5,1
3,1,1
4,1,1


In [33]:
# Create crosstab
ct = pd.crosstab(df['labels'], df['species'])

In [34]:
# Display ct
print(ct)

species   1  2   3   4   5   6   7
labels                            
0         0  0   0   0  14   0   0
1        31  0   0   0   0   0   0
2         0  3  20   0   0   0  37
3         0  0   0   0   0  12   0
4         0  3   0   0   0   0  19
5         3  0   0  11   0   0   0
6         0  0   0   0   0   5   0
