In [8]:
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

## Fish Catch Dataset

In [3]:
# Fish Catch dataset, listed in the JSE data archive:
# http://ww2.amstat.org/publications/jse/jse_data_archive.htm
# Column names are supplied through `names`, so pandas does not look for a
# header row; 'Obs' (column 0) is used as the index.
columns = ['Obs', 'Species', 'Weight', 'Length1', 'Length2', 'Length3', 'Height%', 'Width%', 'Sex']

# `sep=r'\s+'` is the documented replacement for the deprecated
# `delim_whitespace=True`: any run of whitespace separates two fields.
fish_raw = pd.read_csv('http://ww2.amstat.org/publications/jse/datasets/fishcatch.dat.txt',
                       sep=r'\s+', index_col=0, names=columns)

# Discard the 'Sex' column (the NaN column), then drop every row that still
# contains a NaN. Chaining returns a new frame instead of mutating in place,
# so `fish_raw` stays available and re-running this cell is side-effect free.
fish = fish_raw.drop(columns='Sex').dropna()
fish.head()

Unnamed: 0_level_0,Species,Weight,Length1,Length2,Length3,Height%,Width%
Obs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,242.0,23.2,25.4,30.0,38.4,13.4
2,1,290.0,24.0,26.3,31.2,40.0,13.8
3,1,340.0,23.9,26.5,31.1,39.8,15.1
4,1,363.0,26.3,29.0,33.5,38.0,13.3
5,1,430.0,26.5,29.0,34.0,36.6,15.1


In [7]:
# Split the frame into NumPy arrays: column 0 ('Species') is the label,
# every remaining column is a measurement used as a feature.
species = fish.iloc[:, 0].to_numpy()
samples = fish.iloc[:, 1:].to_numpy()
samples.shape, species.shape

((158, 6), (158,))

## Scaling fish data for dimension reduction

In [9]:
# Standardize each feature column of 'samples' (centre on the mean, scale to
# unit variance); the keyword arguments spell out scale()'s defaults.
scaled_samples = scale(samples, axis=0, with_mean=True, with_std=True)

## Dimension reduction

In [5]:
# Number of principal components to keep in the reduced representation
N_COMPONENTS = 2

# Build the (not yet fitted) PCA model
pca = PCA(n_components=N_COMPONENTS)

In [10]:
# Learn the principal components from the standardized samples; the call is
# the cell's last expression, so its return value is displayed as the output.
pca.fit(X=scaled_samples)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [12]:
# Project the standardized samples onto the fitted principal components,
# then show the shape of the reduced feature matrix.
pca_features = pca.transform(X=scaled_samples)
pca_features.shape

(158, 2)