<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-0.1"><span class="toc-item-num">0.1&nbsp;&nbsp;</span>Overview</a></span></li></ul></li><li><span><a href="#Load-the-Data" data-toc-modified-id="Load-the-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load the Data</a></span></li><li><span><a href="#Model-the-Data-with-PCA" data-toc-modified-id="Model-the-Data-with-PCA-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Model the Data with PCA</a></span></li></ul></div>

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn

import sys
sys.path.append('../../')
import chemometrics

import numpy as np
import pandas as pd

import watermark
%load_ext watermark

Overview
--------

This is an example of how to use PCA to inspect data for outliers or extreme points.

In [2]:
%watermark -t -m -v --iversions

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.19.0

Compiler    : GCC 7.3.0
OS          : Linux
Release     : 4.15.0-166-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 8
Architecture: 64bit

pandas    : 1.1.4
sklearn   : 0.24.0
numpy     : 1.19.4
watermark : 2.1.0
matplotlib: 3.3.1
sys       : 3.7.9 (default, Aug 31 2020, 12:42:55) 
[GCC 7.3.0]



# Load the Data

In [3]:
# Read the example training set from the CSV file stored under tests/data.
df = pd.read_csv(
    filepath_or_buffer='../tests/data/pca_train.csv',
)

In [4]:
# Display the loaded frame to check its columns; the rendered table is
# truncated (rows and columns elided with "...").
df

Unnamed: 0,Group,Name,Li,B,Na,Mg,Al,K,Ca,V,...,Eu,Gd,Dy,Ho,Er,Tm,Yb,Lu,Pb,U
0,1,jpn_001,0.001287,0.435363,2.567587,143.601117,0.009235,818.961080,36.075419,0.000399,...,1.318100e-04,0.000629,0.000041,0.000108,0.000385,0.000182,0.000295,0.000059,0.000182,0.000174
1,1,jpn_002,0.001474,0.385210,4.595786,276.591018,0.084693,863.273852,50.704790,0.000337,...,1.823270e-04,0.000592,0.000386,0.000051,0.000148,0.000021,0.000003,0.000023,0.000443,0.000723
2,1,jpn_003,0.000748,0.289601,5.806715,117.037380,0.119564,851.174760,46.020288,0.000393,...,7.401510e-07,0.000717,0.000353,0.000192,0.000434,0.000194,0.000054,0.000016,0.001359,0.000029
3,1,jpn_004,0.000882,0.525801,0.554544,335.195531,0.388480,836.126629,45.437616,0.000782,...,1.283760e-04,0.000481,0.000124,0.000041,0.000012,0.000180,0.000264,0.000029,0.000948,0.000236
4,1,jpn_005,0.001387,0.659031,3.102831,213.051823,0.106865,756.238004,39.155470,0.000444,...,1.014040e-04,0.000464,0.000608,0.000031,0.000178,0.000102,0.000050,0.000125,0.001060,0.000130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,1,jpn_105,0.000534,1.167810,3.763093,641.958500,0.191345,1173.494000,60.286310,0.009183,...,2.912780e-06,0.000036,0.000008,0.000115,0.000050,0.000042,0.000559,0.000036,0.046020,0.001162
65,1,jpn_106,0.000227,0.838623,8.488073,695.211600,1.578177,1593.234000,56.021290,0.006510,...,3.015570e-05,0.000181,0.000185,0.000124,0.000103,0.000003,0.000258,0.000130,0.040487,0.003472
66,1,jpn_109,0.000276,0.816621,8.107320,514.423200,0.193725,1313.915000,44.854700,0.025300,...,1.523280e-04,0.000144,0.000081,0.000003,0.000037,0.000065,0.000564,0.000084,0.010477,0.000657
67,1,jpn_110,0.000486,1.125498,3.873387,597.009000,1.034398,1353.323000,52.432030,0.016496,...,2.644190e-05,0.000102,0.000145,0.000192,0.000152,0.000130,0.000425,0.000076,0.008345,0.002247


In [5]:
# Extract the numeric feature matrix used to fit the PCA model.
#
# The features are selected by dropping the known non-feature columns by name
# rather than slicing by position: the frame displayed above starts with
# 'Unnamed: 0', 'Group' and 'Name', so `df.values[:, 2:]` would begin at the
# string-valued 'Name' column and the conversion to float would fail.
# errors='ignore' keeps this working if the CSV is loaded without the
# leftover 'Unnamed: 0' index column.
NON_FEATURE_COLUMNS = ['Unnamed: 0', 'Group', 'Name']
raw_x = df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore').to_numpy(dtype=float)

# Model the Data with PCA

In [6]:
from chemometrics.classifier.pca import PCA

In [None]:
# Configure a two-component PCA model.
# NOTE(review): alpha and gamma are presumably the significance levels behind
# the extreme/outlier thresholds -- confirm against the PCA class docs.
model = PCA(
    n_components=2,
    alpha=0.05,
    gamma=0.01,
    scale_x=True,
)

In [None]:
# Fit the model on the feature matrix. Assigning the return value to `_`
# keeps it from being echoed as the cell output.
_ = model.fit(raw_x)

In [None]:
# Call the model's built-in visualization on the same data it was fitted on;
# the return value is discarded via `_` so only the figure is shown.
_ = model.visualize(raw_x)

In [None]:
# check_outliers returns two values, unpacked here as (extremes, outliers).
# extremes_mask is used below as a boolean index over the rows of raw_x.
# NOTE(review): presumably both are per-sample boolean arrays -- confirm.
extremes_mask, outliers_mask = model.check_outliers(raw_x)

In [None]:
# NOTE(review): presumably predict() flags the samples the model considers
# regular (neither extreme nor outlier) -- confirm. regular_mask is not used
# in the cells visible below.
regular_mask = model.predict(raw_x)

In [None]:
# True if at least one sample was flagged in extremes_mask.
np.any(extremes_mask)

In [None]:
# True if at least one sample was flagged in outliers_mask.
np.any(outliers_mask)

In [None]:
# Project the samples with the fitted model and plot the first two columns of
# the result, drawing the points flagged in extremes_mask in red and all
# remaining points in green.
t = model.transform(raw_x)

fig, ax = plt.subplots()
for mask, color, label in (
    (~extremes_mask, 'g', 'not extreme'),
    (extremes_mask, 'r', 'extreme'),
):
    # lw=0 suppresses the connecting line so only the markers are drawn.
    ax.plot(t[mask, 0], t[mask, 1], marker='o', lw=0, color=color, label=label)

# Label the figure so it can be read on its own.
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_title('PCA scores')
ax.legend()

# Use equal scaling on both axes; assigning to `_` keeps the return value
# out of the cell output.
_ = ax.axis('equal')