In [1]:
from reduced_ipca import ReducedPCA
from perturb_ipca import PerturbPCA
from utils import dict2numpy, vec2dict

## The code block

In [2]:
import numpy as np
from sklearn.decomposition import PCA

## Tests

### Sanity Check using gaussians 

In [6]:
# Seed NumPy's global generator so the synthetic sample is reproducible.
np.random.seed(42)
# Standard-normal draws with column k multiplied by k (so column 0 is all zeros).
gaussian_rows = np.random.normal(size=(10_000, 10)) * np.arange(10)
# One {feature index: value} dict per row, the format fed to learn_one below.
test_data = [dict(enumerate(row)) for row in gaussian_rows]
# Probe vector {0: 0, 1: 1, ..., 9: 9}.
test_vec = dict(enumerate(np.arange(10)))
ipca = PerturbPCA(5)
for sample in test_data:
    ipca.learn_one(sample)
# Round-trip the probe through transform / inverse_transform and print the result.
reconstruction = ipca.inverse_transform_one(ipca.transform_one(test_vec))
print(reconstruction)

{0: 5.220565644295931e-14, 1: 0.025878167476123367, 2: 0.011633957493537177, 3: 0.12293468327390743, 4: 0.12908795940825354, 5: 5.0955160538929185, 6: 5.9543034507849475, 7: 7.027052494195294, 8: 7.995053166631833, 9: 9.044772715368909}


In [7]:
# Seed NumPy's global generator so the synthetic sample is reproducible.
np.random.seed(42)
# Standard-normal draws with column k multiplied by k (so column 0 is all zeros).
gaussian_rows = np.random.normal(size=(10_000, 10)) * np.arange(10)
# One {feature index: value} dict per row, the format fed to learn_one below.
test_data = [dict(enumerate(row)) for row in gaussian_rows]
# Probe vector {0: 0, 1: 1, ..., 9: 9}.
test_vec = dict(enumerate(np.arange(10)))
ipca = ReducedPCA(5)
for sample in test_data:
    ipca.learn_one(sample)
# Round-trip the probe through transform / inverse_transform and print the result.
reconstruction = ipca.inverse_transform_one(ipca.transform_one(test_vec))
print(reconstruction)


{0: 0.0, 1: 0.025599074602075535, 2: 0.01266575746956531, 3: 0.12288722014933189, 4: 0.2844437990407748, 5: 5.18922495580125, 6: 5.970517303933131, 7: 7.023804221989852, 8: 7.992712204209254, 9: 9.046572082523122}


### Visualising the IRIS dataset 

**We project the Iris dataset onto two components using our ReducedPCA algorithm and plot the result, coloured by class**

In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets
from river.stream import iter_pandas
# Seed NumPy's global generator before fitting.
np.random.seed(5)

iris = datasets.load_iris(as_frame=True)
X = iris.data
y = iris.target
# Materialise the stream as a list so it can be replayed: once to fit, once to project.
stream = list(iter_pandas(X, y))

online_pca = ReducedPCA(2)
# Labels are not used for fitting; `_label` keeps the loop from rebinding `y`.
for x, _label in stream:
    online_pca.learn_one(x)

# Project every sample with the fitted model; the result is indexed below as
# (sample, component).
results = np.array([dict2numpy(online_pca.transform_one(x)) for x, _label in stream])
# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib releases removed.
plt.scatter(results[:, 0], results[:, 1],
            c=iris.target, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('gnuplot', 10))
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar();

**We project the Iris dataset onto two components using our PerturbPCA algorithm and plot the result, coloured by class**

In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets
from river.stream import iter_pandas
# Seed NumPy's global generator before fitting.
np.random.seed(5)

iris = datasets.load_iris(as_frame=True)
X = iris.data
y = iris.target
# Materialise the stream as a list so it can be replayed: once to fit, once to project.
stream = list(iter_pandas(X, y))

online_pca = PerturbPCA(2)
# Labels are not used for fitting; `_label` keeps the loop from rebinding `y`.
for x, _label in stream:
    online_pca.learn_one(x)

# Project every sample with the fitted model; the result is indexed below as
# (sample, component).
results = np.array([dict2numpy(online_pca.transform_one(x)) for x, _label in stream])
# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib releases removed.
plt.scatter(results[:, 0], results[:, 1],
            c=iris.target, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('gnuplot', 10))
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar();

**For comparison, we project the Iris dataset onto two components using scikit-learn's batch PCA and plot the result, coloured by class**

In [None]:

# Seed NumPy's global generator before fitting.
np.random.seed(5)

iris = datasets.load_iris(as_frame=True)
X = iris.data
y = iris.target

# Batch baseline: fit scikit-learn's PCA on the full table, then project that same table.
sk_pca = PCA(2)
sk_pca.fit(X)
# `results` must hold the scikit-learn projection so the plot shows this model's output.
results = sk_pca.transform(X)

# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib releases removed.
plt.scatter(results[:, 0], results[:, 1],
            c=iris.target, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('gnuplot', 10))
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar();

### River integration

In [None]:
from river import linear_model
from river import metrics
import pandas as pd 
from river.stream import iter_pandas

def evaluate(stream, model, n_wait=100, verbose=True):
    """Evaluate ``model`` on ``stream`` by predicting each sample before learning from it.

    For every ``(x, y)`` pair the model first predicts ``x``, the metrics are
    updated with that prediction, and only then does the model learn from
    ``(x, y)``.

    Parameters
    ----------
    stream : iterable of (x, y)
        Samples to process, in order.
    model : object
        Must provide ``predict_one(x)`` and ``learn_one(x, y)``.
    n_wait : int, default 100
        A result row is recorded every ``n_wait`` samples (never at index 0).
        The same value is used as the window size of the rolling metrics.
    verbose : bool, default True
        Whether to print progress at each recorded checkpoint. The final
        summary after the loop is printed regardless of this flag, provided
        at least one sample was processed.

    Returns
    -------
    pandas.DataFrame
        One row per checkpoint with columns
        ``['model', 'id', 'acc', 'acc_roll', 'kappa', 'kappa_roll']``.
        Empty when fewer than ``n_wait + 1`` samples were seen.
    """
    def print_progress(sample_id, acc, kappa):
        # `sample_id` is the zero-based index of the sample just processed.
        print(f'Samples processed: {sample_id}')
        print(acc)
        print(kappa)

    # Cumulative metrics plus rolling counterparts over the last `n_wait` samples.
    acc = metrics.Accuracy()
    acc_rolling = metrics.Rolling(metric=metrics.Accuracy(), window_size=n_wait)
    kappa = metrics.CohenKappa()
    kappa_rolling = metrics.Rolling(metric=metrics.CohenKappa(), window_size=n_wait)
    raw_results = []
    model_name = model.__class__.__name__
    i = None  # stays None when the stream yields nothing
    for i, (x, y) in enumerate(stream):
        # Predict
        y_pred = model.predict_one(x)
        # Update metrics and results
        acc.update(y_true=y, y_pred=y_pred)
        acc_rolling.update(y_true=y, y_pred=y_pred)
        kappa.update(y_true=y, y_pred=y_pred)
        kappa_rolling.update(y_true=y, y_pred=y_pred)
        if i % n_wait == 0 and i > 0:
            if verbose:
                print_progress(i, acc, kappa)
            raw_results.append([model_name, i, acc.get(), acc_rolling.get(), kappa.get(), kappa_rolling.get()])
        # Learn only after the prediction has been scored.
        model.learn_one(x, y)
    # Skip the final summary on an empty stream, where `i` was never assigned.
    if i is not None:
        print_progress(i, acc, kappa)
    return pd.DataFrame(raw_results, columns=['model', 'id', 'acc', 'acc_roll', 'kappa', 'kappa_roll'])


In [None]:
from sklearn.datasets import fetch_covtype
# Load the cover-type data as pandas objects and keep the first 5,000 rows.
data = fetch_covtype(as_frame=True)
X = data.data[:5_000]
Y = data.target[:5_000]
# NOTE: `stream` is not consumed in this cell; evaluate() receives its own iterator below.
stream = iter_pandas(X, Y)

# Compose the PCA step and the softmax classifier with `|`.
pca = PerturbPCA(10)
lin = linear_model.SoftmaxRegression()
model = pca | lin

evaluate(
    stream=iter_pandas(X=X, y=Y),
    model=model,
    n_wait=1000,
)

In [None]:
# Reuse `data` loaded by the previous cell; keep the first 5,000 rows.
X = data.data[:5_000]
Y = data.target[:5_000]
# NOTE: `stream` is not consumed in this cell; evaluate() receives its own iterator below.
stream = iter_pandas(X, Y)

# Compose the PCA step and the softmax classifier with `|`.
pca = ReducedPCA(10)
lin = linear_model.SoftmaxRegression()
model = pca | lin

evaluate(
    stream=iter_pandas(X=X, y=Y),
    model=model,
    n_wait=1000,
)