In [3]:
import pytest
import re, os

import torch
from torch.utils.data import DataLoader
import torchvision

import modlee
from modlee import data_metafeatures as dmf
from modlee.utils import text_loaders, image_loaders

import pandas as pd
import spacy

DATA_ROOT = os.path.expanduser("~/efs/.data")
IMAGE_DATALOADER = modlee.utils.get_imagenette_dataloader()
# TEXT_DATALOADER = modlee.utils.get_wnli_dataloader() 


TEXT_LOADERS = {loader_fn:getattr(text_loaders, loader_fn) for loader_fn in dir(text_loaders) if re.match('get_(.*)_dataloader', loader_fn)}
IMAGE_LOADERS = [getattr(image_loaders, loader_fn) for loader_fn in dir(image_loaders) if re.match('get_(.*)_dataloader', loader_fn)]


In [2]:
import pandas as pd
df = None
features = []
labels = []
for text_loader_name,text_loader_fn in TEXT_LOADERS.items():
    print(text_loader_name)
    # for _ in range(10):
    for _ in range(1):
        text_mf = dmf.TextDataMetafeatures(text_loader_fn(), testing=True)
        features.append({
            **text_mf.embedding,
            **text_mf.mfe,
            **text_mf.properties
        })
        labels.append(text_loader_name)
        # features.append(text_mf.)


get_cola_dataloader


  embds = torch.Tensor(embds)


get_mnli_dataloader
get_qnli_dataloader
get_rte_dataloader
get_sst2_dataloader
get_stsb_dataloader
get_wnli_dataloader


In [3]:
list(TEXT_LOADERS.values())[0].args[0]

'CoLA'

In [4]:
dir(list(TEXT_LOADERS.values())[1])
list(TEXT_LOADERS.values())[1].__name__

'get_mnli_dataloader'

In [5]:
mf_global = None
def get_df_from_loaders(loaders, modality, n_samples=1):
    global mf_global
    if isinstance(loaders, dict):
        loaders = list(loaders.values())
    df = pd.DataFrame()
    print(loaders)
    features = []
    MFClass = getattr(dmf, f"{modality.capitalize()}DataMetafeatures")
    for loader_fn in loaders:
        for _ in range(n_samples):
            metafeatures = MFClass(
                loader_fn(root=DATA_ROOT), testing=True
            )
            if hasattr(loader_fn, 'args'):
                dataset_name = loader_fn.args[0]
            else:
                dataset_name = loader_fn.__name__
            mf_global = metafeatures
            features.append({
                    'dataset_name':dataset_name,
                    **metafeatures.embedding,
                    **metafeatures.mfe,
                    **metafeatures.properties,
            })
            pd.DataFrame(features).to_csv(
                f'./{modality}_features_cache.csv')
    df = pd.DataFrame(features)
    return df



In [6]:
text_df = get_df_from_loaders(TEXT_LOADERS, 'text', n_samples=20)

[functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f02c3f1f5b0>)>, 'CoLA', 527), <function text_loaders.get_mnli_dataloader at 0x7f02c3f1f640>, functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f02c3f1f5b0>)>, 'QNLI', 5463), functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f02c3f1f5b0>)>, 'RTE', 277), functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f02c3f1f5b0>)>, 'SST2', 872), functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f02c3f1f5b0>)>, 'STSB', 1500), functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f02c3f1f5b0>)>, 'WNLI', 71)]


  embds = torch.Tensor(embds)


In [9]:
# cached_df = pd.read_csv('./image_features_cache_0.csv')

class DFTransforms:
    @staticmethod
    def list_cols2item(df):
        object_columns = df.select_dtypes(include=['object']).columns
        df[object_columns] = df[object_columns].apply(
            lambda x : x[0]
        )
        return df
    
    @staticmethod
    def drop_nonnum(df):
        return df.select_dtypes(include=['float','int'])
        
    @staticmethod
    def fillna(df, val=0):
        return df.fillna(val)
    
    @staticmethod
    def dropna(df):
        return df.dropna(axis=1, how='any')

    @staticmethod
    def normalize(df):
        def min_max_normalize(column):
            return (column - column.min()) / (column.max() - column.min())
        return df.apply(min_max_normalize)

    @staticmethod
    def compose(transforms):
        def apply_transforms(df):
            for transform in transforms:
                df = transform(df)
            return df
        return apply_transforms
df_transforms = DFTransforms.compose([
    DFTransforms.list_cols2item,
    DFTransforms.drop_nonnum,
    DFTransforms.normalize,
    DFTransforms.dropna,
])
def save_labels(df, fn):
    with open(fn,'w') as _file:
        _file.write('\n'.join(list(df['dataset_name'])))

def save_tsv(df, fn):
    return df.to_csv(
        fn,
        sep='\t',
        index=False,
        header=False 
    )


In [None]:

cached_df = pd.read_csv('./text_features_cache.csv')
labels = list(cached_df['dataset_name'])
save_labels(cached_df, './text_labels.txt')
cached_df = df_transforms(cached_df)
print(labels)
save_tsv(cached_df, 'cached_text_metafeatures.tsv')
# cached_df = DFTransforms.list_cols2item(cached_df)

In [7]:
image_df = get_df_from_loaders(IMAGE_LOADERS, 'image', n_samples=4)

[functools.partial(<staticmethod(<function image_loaders._get_image_dataloader at 0x7f168ef17490>)>, 'CIFAR10', train=False, download=True), functools.partial(<staticmethod(<function image_loaders._get_image_dataloader at 0x7f168ef17490>)>, 'DTD', split='test', download=True), functools.partial(<staticmethod(<function image_loaders._get_image_dataloader at 0x7f168ef17490>)>, 'EuroSAT', download=True), functools.partial(<staticmethod(<function image_loaders._get_image_dataloader at 0x7f168ef17490>)>, 'FashionMNIST', train=False, download=True), functools.partial(<staticmethod(<function image_loaders._get_image_dataloader at 0x7f168ef17490>)>, 'FGVCAircraft', split='test', download=True), functools.partial(<staticmethod(<function image_loaders._get_image_dataloader at 0x7f168ef17490>)>, 'Flowers102', split='test', download=True), functools.partial(<staticmethod(<function image_loaders._get_image_dataloader at 0x7f168ef17490>)>, 'GTSRB', split='test', download=True), functools.partial(<st

In [42]:
image_df

Unnamed: 0,dataset_name,embd_0_mean_0,embd_0_mean_1,embd_0_mean_2,embd_0_mean_3,embd_0_mean_4,embd_0_mean_5,embd_0_mean_6,embd_0_mean_7,embd_0_mean_8,...,sparsity.sd_1,t_mean.mean_1,t_mean.sd_1,var.mean_1,var.sd_1,dataset_size,elem_0_shape,elem_0_dims,elem_1_shape,elem_1_dims
0,CIFAR10,-0.016777,0.681486,-1.257155,-0.762495,-1.302225,1.646449,-1.796577,-2.000172,-1.442675,...,,4.783333,,8.616162,,10000,"[100, 3, 300, 300]",4,[100],1
1,DTD,-1.058883,0.862001,0.060573,0.420286,0.459545,0.266008,0.72338,-0.35825,-0.699077,...,,21.4,,170.142525,,1888,"[100, 3, 300, 300]",4,[100],1


In [39]:
mf_dict = {
    # **mf_global.embedding,
    # **mf_global.mfe,
    **mf_global.properties
}
pd.DataFrame(mf_dict,)
print(mf_dict)

ValueError: All arrays must be of the same length

In [22]:
text_df['skewness.mean_0']

0    5.664189
1    0.090764
2    0.198000
3   -0.078866
4    9.513864
5    1.288083
6    0.530463
Name: skewness.mean_0, dtype: float64

In [3]:
df = pd.DataFrame(features)
# print(len(TEXT_LOADERS))
df = df.fillna(0)

In [4]:
# print(df.dtypes)
import numpy as np
object_columns = df.select_dtypes(include=['object']).columns
df[object_columns] = df[object_columns].apply(
    lambda x : x[0]
)
df.to_csv('text_metafeatures.tsv', sep='\t', index=False, header=False)

In [5]:
def min_max_normalize(column):
    return (column - column.min()) / (column.max() - column.min())

# Normalize DataFrame by columns
normalized_df = df.apply(min_max_normalize)
normalized_df.to_csv(
    'text_metafeatures_normalized.tsv', 
    sep='\t', 
    index=False,
    header=False
    )
with open("data_labels.txt",'w') as _file:
    _file.write('\n'.join(labels))
    # _file.write('\n'.join(list(TEXT_LOADERS.keys())))

In [13]:
embd_cols = sorted(col for col in normalized_df.columns if 'embd' in col)
print(embd_cols)
normalized_df[embd_cols].to_csv(
    'text_metafeatures_normalized_embd.tsv',
    sep='\t',
    index=False,
    header=False
)
normalized_df.drop(columns=embd_cols).to_csv(
    'text_metafeatures_normalized_mfe.tsv',
    sep='\t',
    index=False,
    header=False
)

['embd_mean_0', 'embd_mean_1', 'embd_mean_10', 'embd_mean_11', 'embd_mean_12', 'embd_mean_13', 'embd_mean_14', 'embd_mean_15', 'embd_mean_16', 'embd_mean_17', 'embd_mean_18', 'embd_mean_19', 'embd_mean_2', 'embd_mean_20', 'embd_mean_21', 'embd_mean_22', 'embd_mean_23', 'embd_mean_24', 'embd_mean_25', 'embd_mean_26', 'embd_mean_27', 'embd_mean_28', 'embd_mean_29', 'embd_mean_3', 'embd_mean_30', 'embd_mean_31', 'embd_mean_32', 'embd_mean_33', 'embd_mean_34', 'embd_mean_35', 'embd_mean_36', 'embd_mean_37', 'embd_mean_38', 'embd_mean_39', 'embd_mean_4', 'embd_mean_40', 'embd_mean_41', 'embd_mean_42', 'embd_mean_43', 'embd_mean_44', 'embd_mean_45', 'embd_mean_46', 'embd_mean_47', 'embd_mean_48', 'embd_mean_49', 'embd_mean_5', 'embd_mean_50', 'embd_mean_51', 'embd_mean_52', 'embd_mean_53', 'embd_mean_54', 'embd_mean_55', 'embd_mean_56', 'embd_mean_57', 'embd_mean_58', 'embd_mean_59', 'embd_mean_6', 'embd_mean_60', 'embd_mean_61', 'embd_mean_62', 'embd_mean_63', 'embd_mean_64', 'embd_mean_65'

In [51]:
!code ./text_metafeatures.tsv

In [52]:
print(list(TEXT_LOADERS.keys()), sep='\n')

['get_cola_dataloader', 'get_mnli_dataloader', 'get_qnli_dataloader', 'get_rte_dataloader', 'get_sst2_dataloader', 'get_stsb_dataloader', 'get_wnli_dataloader']
