In [1]:
from dataset.paraphrase import paraphrase_loaders
from transformers import AutoTokenizer, AutoModel
from config import config
from tqdm.notebook import tqdm
import torch
import gc
import os

In [2]:
def create_model(model_name):
    model = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name)
    # freeze the weights
    for param in model.parameters():
        param.requires_grad = False
    return model

In [3]:
model_name = "distilbert-base-uncased"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = create_model(model_name)

In [5]:
loaders = paraphrase_loaders(
    config=config['tasks']['paraphrase'],
    tokenizer=tokenizer,
)

Reusing dataset paws (/home/macab/.cache/huggingface/datasets/paws/labeled_final/1.1.0/09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34)
Reusing dataset glue (/home/macab/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 364/364 [00:08<00:00, 44.80ba/s]
100%|██████████| 432/432 [00:08<00:00, 50.30ba/s]
100%|██████████| 50/50 [00:01<00:00, 47.61ba/s]
100%|██████████| 16/16 [00:00<00:00, 46.69ba/s]
100%|██████████| 364/364 [00:07<00:00, 49.01ba/s]
100%|██████████| 432/432 [00:08<00:00, 49.75ba/s]
100%|██████████| 50/50 [00:01<00:00, 47.89ba/s]
100%|██████████| 16/16 [00:00<00:00, 48.80ba/s]
100%|██████████| 44/44 [00:20<00:00,  2.19ba/s]
100%|██████████| 15/15 [00:06<00:00,  2.38ba/s]
100%|██████████| 44/44 [00:32<00:00,  1.34ba/s]
100%|██████████| 15/15 [00:10<00:00,  1.38ba/s]


In [6]:
for domain in loaders:
    for set in loaders[domain]:
        print(domain, len(loaders[domain][set]))

qqp 342
qqp 111
qqp 111
paws 342
paws 111
paws 111


In [9]:

data = {}

for domain in tqdm(loaders):

    representation = torch.empty((0, 768))
    labels = []

    for batch in tqdm(loaders[domain]['valid']):
        
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        representation = torch.vstack((representation, outputs[0][:, 0]))
        labels += batch['label'].tolist()

        
    data.update({
        domain:{
            "representations":representation,
            "labels":labels
        }
    })

    del representation
    gc.collect()

# del model
# gc.collect()



  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/443 [00:00<?, ?it/s]

  0%|          | 0/443 [00:00<?, ?it/s]

In [18]:
batch = next(iter(loaders[domain][set]))

In [11]:
data['qqp']['representations'].shape

torch.Size([14150, 768])

In [12]:
data['paws']['representations'].shape

torch.Size([14150, 768])

### Save Representation

In [1]:
# for domain in data.keys():

#     path = os.path.join("/home/macab/research/robust-representation-learning/adaptil/representations/pca", domain)
#     os.makedirs(path, exist_ok=True)
    
#     torch.save(os.path.join(path, "representation.pt"), data[domain]['representations'])
#     with open(os.path.join(path, "labels.txt"), "w") as file:
#         file.write("\n".join(data[domain]['labels']))

### Read the Representations and Labels

In [3]:
import numpy as np

In [4]:
qqp = np.load("/home/macab/research/robust-representation-learning/adaptil/representations/pca/qqp.npy")
paws = np.load("/home/macab/research/robust-representation-learning/adaptil/representations/pca/paws.npy")

In [5]:
print(qqp.shape, paws.shape)

(6400, 768) (6400, 768)


In [7]:
qqp_labels = open("./representations/pca/qqp_labels.txt").read().splitlines()
paws_labels = open("./representations/pca/paws_labels.txt").read().splitlines()

### Apply PCA

In [48]:
from sklearn.decomposition import PCA
from contrastive import CPCA
import pandas as pd

In [51]:
pca = PCA(n_components=2, random_state=42)
cpca = CPCA(n_components=2)

In [52]:
cpca.fit_transform?

[0;31mSignature:[0m
[0mcpca[0m[0;34m.[0m[0mfit_transform[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mforeground[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbackground[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mplot[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgui[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malpha_selection[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_alphas[0m[0;34m=[0m[0;36m40[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_log_alpha[0m[0;34m=[0m[0;36m3[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_alphas_to_return[0m[0;34m=[0m[0;36m4[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mactive_labels[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolors[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlegend[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malpha_value[0m[0;34m=[0m

In [38]:
pca_qqp = pca.fit_transform(qqp)
cpca_qqp = cpca.fit_transform(qqp)

In [39]:
print(pca_qqp.shape, cpca_qqp.shape)

(6400, 2) (6400, 2)


In [40]:
qqp_df_pca = pd.DataFrame(
    data={
        "pca_0":pca_qqp[:, 0],
        "pca_1":pca_qqp[:, 1],
        "labels":qqp_labels
    }
)
qqp_df_cpca = pd.DataFrame(
    data={
        "pca_0":cpca_qqp[:, 0],
        "pca_1":cpca_qqp[:, 1],
        "labels":qqp_labels
    }
)

In [41]:
qqp_df_pca.tail(10)

Unnamed: 0,pca_0,pca_1,labels
6390,0.088113,1.362898,1
6391,-1.70867,-0.060611,1
6392,-0.287391,1.518818,0
6393,1.732156,-1.605307,1
6394,-1.681686,-0.928283,0
6395,1.285021,0.086643,0
6396,-0.310482,0.1197,1
6397,1.417964,-1.113369,1
6398,-0.642029,0.758022,0
6399,-1.32645,0.008802,0


In [42]:
qqp_df_cpca.tail(10)

Unnamed: 0,pca_0,pca_1,labels
6390,0.088113,1.362898,1
6391,-1.70867,-0.060611,1
6392,-0.287391,1.518818,0
6393,1.732156,-1.605307,1
6394,-1.681686,-0.928283,0
6395,1.285021,0.086643,0
6396,-0.310482,0.1197,1
6397,1.417964,-1.113369,1
6398,-0.642029,0.758022,0
6399,-1.32645,0.008802,0


In [43]:
cpca.fit_transform?

[0;31mSignature:[0m [0mcpca[0m[0;34m.[0m[0mfit_transform[0m[0;34m([0m[0mX[0m[0;34m,[0m [0my[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Fit the model with X and apply the dimensionality reduction on X.

Parameters
----------
X : array-like of shape (n_samples, n_features)
    Training data, where n_samples is the number of samples
    and n_features is the number of features.

y : Ignored

Returns
-------
X_new : ndarray of shape (n_samples, n_components)
    Transformed values.

Notes
-----
This method returns a Fortran-ordered array. To convert it to a
C-ordered array, use 'np.ascontiguousarray'.
[0;31mFile:[0m      ~/miniconda3/lib/python3.8/site-packages/sklearn/decomposition/_pca.py
[0;31mType:[0m      method


In [45]:
from contrastive import CPCA

In [46]:
c = CPCA()

[0;31mInit signature:[0m [0mCPCA[0m[0;34m([0m[0mn_components[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m [0mstandardize[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Contrastive PCA (cPCA)

Linear dimensionality reduction that uses eigenvalue decomposition
to identify directions that have increased variance in the primary (foreground)
dataset relative to a secondary (background) dataset. Then, those directions
are used to project the data to a lower dimensional space.
[0;31mFile:[0m           ~/miniconda3/lib/python3.8/site-packages/contrastive/__init__.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     Kernel_CPCA
