# The Decompose Pipeline Stage

In [1]:
import pandas as pd
import pdpipe as pdp
from sklearn.decomposition import PCA

In [9]:
data = [[3, 1, 1], [7, 2, 4], [8, 3, 1]]
df = pd.DataFrame(data, [1, 2, 3], ["a", "b", "c"])
df

Unnamed: 0,a,b,c
1,3,1,1
2,7,2,4
3,8,3,1


## A basic example

In [10]:
pca_stage = pdp.Decompose(PCA(), n_components=2)
pca_stage(df)

Unnamed: 0,mdc0,mdc1
1,3.313301,-0.148453
2,-1.432127,1.717269
3,-1.881174,-1.568816


Let's check this work with an unequal number of components:

In [11]:
pca_stage = pdp.Decompose(PCA(), n_components=3)
pca_stage(df)

Unnamed: 0,mdc0,mdc1,mdc2
1,3.313301,-0.148453,1.605608e-16
2,-1.432127,1.717269,1.605608e-16
3,-1.881174,-1.568816,1.605608e-16


## Working with column qualifiers

In [12]:
data = [[3, 1, 1, 2], [7, 2, 4, 8], [8, 3, 1, 5]]
df = pd.DataFrame(data, [1, 2, 3], ["a1", "a1", "a3", "d"])
df

Unnamed: 0,a1,a1.1,a3,d
1,3,1,1,2
2,7,2,4,8
3,8,3,1,5


In [13]:
pca_stage = pdp.Decompose(PCA(), columns=pdp.cq.StartWith("a"), n_components=2)
pca_stage(df)

Unnamed: 0,d,mdc0,mdc1
1,2,4.570998,-0.325541
2,8,-1.61927,1.837924
3,5,-2.951728,-1.512383


## Non-numeric data is automatically ignored

In [14]:
data = [[3, 1, 1, "a"], [7, 2, 4, "b"], [8, 3, 1, "c"]]
df = pd.DataFrame(data, [1, 2, 3], ["g", "ph", "ar", "co"])
df

Unnamed: 0,g,ph,ar,co
1,3,1,1,a
2,7,2,4,b
3,8,3,1,c


In [15]:
pca_stage = pdp.Decompose(PCA(), n_components=2)
pca_stage(df)

Unnamed: 0,co,mdc0,mdc1
1,a,3.313301,-0.148453
2,b,-1.432127,1.717269
3,c,-1.881174,-1.568816
