In [2]:
import pytest
import re, os

import torch
from torch.utils.data import DataLoader
import torchvision

import modlee
from modlee import data_metafeatures as dmf
from modlee.utils import text_loaders, image_loaders

import pandas as pd
import spacy

DATA_ROOT = os.path.expanduser("~/efs/.data")
IMAGE_DATALOADER = modlee.utils.get_imagenette_dataloader()
# TEXT_DATALOADER = modlee.utils.get_wnli_dataloader() 


TEXT_LOADERS = {loader_fn:getattr(text_loaders, loader_fn) for loader_fn in dir(text_loaders) if re.match('get_(.*)_dataloader', loader_fn)}
IMAGE_LOADERS = [getattr(image_loaders, loader_fn) for loader_fn in dir(image_loaders) if re.match('get_(.*)_dataloader', loader_fn)]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
mf_global = None
def get_df_from_loaders(loaders, modality, n_samples=1):
    global mf_global
    if isinstance(loaders, dict):
        loaders = list(loaders.values())
    df = pd.DataFrame()
    print(loaders)
    features = []
    MFClass = getattr(dmf, f"{modality.capitalize()}DataMetafeatures")
    for loader_fn in loaders:
        for _ in range(n_samples):
            metafeatures = MFClass(
                loader_fn(root=DATA_ROOT), testing=True
            )
            if hasattr(loader_fn, 'args'):
                dataset_name = loader_fn.args[0]
            else:
                dataset_name = loader_fn.__name__
            mf_global = metafeatures
            features.append({
                    'dataset_name':dataset_name,
                    **metafeatures.embedding,
                    **metafeatures.mfe,
                    **metafeatures.properties,
            })
            pd.DataFrame(features[-1]).to_csv(
                f'./{modality}_features_cache.csv',
                mode='a')
    df = pd.DataFrame(features)
    return df



In [6]:
text_df = get_df_from_loaders(TEXT_LOADERS, 'text')

[functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f168ef17520>)>, 'CoLA', 527), <function text_loaders.get_mnli_dataloader at 0x7f168ef175b0>, functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f168ef17520>)>, 'QNLI', 5463), functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f168ef17520>)>, 'RTE', 277), functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f168ef17520>)>, 'SST2', 872), functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f168ef17520>)>, 'STSB', 1500), functools.partial(<staticmethod(<function text_loaders._get_text_dataloader at 0x7f168ef17520>)>, 'WNLI', 71)]


In [None]:
image_df = get_df_from_loaders(IMAGE_LOADERS[17:], 'image', n_samples=4)

In [42]:
image_df

Unnamed: 0,dataset_name,embd_0_mean_0,embd_0_mean_1,embd_0_mean_2,embd_0_mean_3,embd_0_mean_4,embd_0_mean_5,embd_0_mean_6,embd_0_mean_7,embd_0_mean_8,...,sparsity.sd_1,t_mean.mean_1,t_mean.sd_1,var.mean_1,var.sd_1,dataset_size,elem_0_shape,elem_0_dims,elem_1_shape,elem_1_dims
0,CIFAR10,-0.016777,0.681486,-1.257155,-0.762495,-1.302225,1.646449,-1.796577,-2.000172,-1.442675,...,,4.783333,,8.616162,,10000,"[100, 3, 300, 300]",4,[100],1
1,DTD,-1.058883,0.862001,0.060573,0.420286,0.459545,0.266008,0.72338,-0.35825,-0.699077,...,,21.4,,170.142525,,1888,"[100, 3, 300, 300]",4,[100],1


In [3]:
# cached_df = pd.read_csv('./image_features_cache_0.csv')

class DFTransforms:
    @staticmethod
    def list_cols2item(df):
        object_columns = df.select_dtypes(include=['object']).columns
        df[object_columns] = df[object_columns].apply(
            lambda x : x[0]
        )
        return df
    
    @staticmethod
    def drop_nonnum(df):
        return df.select_dtypes(include=['float','int'])
        
    @staticmethod
    def fillna(df, val=0):
        return df.fillna(val)
    
    @staticmethod
    def dropna(df):
        return df.dropna(axis=1, how='any')

    @staticmethod
    def normalize(df):
        def min_max_normalize(column):
            return (column - column.min()) / (column.max() - column.min())
        return df.apply(min_max_normalize)

    @staticmethod
    def compose(transforms):
        def apply_transforms(df):
            for transform in transforms:
                df = transform(df)
            return df
        return apply_transforms
df_transforms = DFTransforms.compose([
    DFTransforms.list_cols2item,
    DFTransforms.drop_nonnum,
    DFTransforms.normalize,
    DFTransforms.dropna,
])
def save_labels(df, fn):
    with open(fn,'w') as _file:
        _file.write('\n'.join(list(df['dataset_name'])))

def save_tsv(df, fn):
    return df.to_csv(
        fn,
        sep='\t',
        index=False,
        header=False 
    )


In [4]:
modality = 'image'
cached_df = pd.read_csv(f'./{modality}_features_cache_0.csv')
save_labels(cached_df, f'./{modality}_labels.txt')
save_tsv(df_transforms(cached_df), f'cached_{modality}_metafeatures.tsv')
# cached_df = DFTransforms.list_cols2item(cached_df)

  df[object_columns] = df[object_columns].apply(


In [29]:
class MFDF(pd.DataFrame):
    @property
    def name(self):
        return f'{self=}'.partition('=')[0]

    def save_labels(self, *args, **kwargs):
        save_labels(self, *args, **kwargs)
        
    def save_tsv(self, *args, **kwargs):
        save_tsv(self, *args, **kwargs)
    

In [None]:
text_df = MFDF(pd.read_csv('./text_features_cache.csv'))
image_df = MFDF(pd.read_csv('./image_features_cache.csv'))
concat_df = MFDF(pd.concat([text_df, image_df], ignore_index=True))
# text_df.save_labels('./labels_test.txt')
print(concat_df)
print(concat_df.name)
concat_df.save_labels()

     Unnamed: 0 dataset_name  embd_mean_0  embd_mean_1  embd_mean_2  \
0           0.0         CoLA    -1.504042    -0.279320    -0.730904   
1           1.0         CoLA    -1.483405    -0.345227    -0.737107   
2           2.0         CoLA    -1.457978    -0.311558    -0.783566   
3           3.0         CoLA    -1.485049    -0.297812    -0.815689   
4           4.0         CoLA    -1.479797    -0.272896    -0.860687   
..          ...          ...          ...          ...          ...   
381        15.0      SEMEION          NaN          NaN          NaN   
382        16.0        STL10          NaN          NaN          NaN   
383        17.0        STL10          NaN          NaN          NaN   
384        18.0        STL10          NaN          NaN          NaN   
385        19.0        STL10          NaN          NaN          NaN   

     embd_mean_3  embd_mean_4  embd_mean_5  embd_mean_6  embd_mean_7  ...  \
0       0.314802    -0.043355     0.500565    -0.265383     1.232647  

In [28]:
dir(concat_df)
f'{concat_df=}'.partition('=')[0]
# print(str(concat_df))

'concat_df'

In [18]:

text_df = MFDF(df_transforms(text_df))
text_df.save_tsv('./test_text.tsv')
print(text_df)

     Unnamed: 0  embd_mean_0  embd_mean_1  embd_mean_2  embd_mean_3  \
0      0.000000     0.029311     0.334684     0.186214     0.748641   
1      0.007194     0.041228     0.128451     0.180140     0.661165   
2      0.014388     0.055911     0.233806     0.134646     0.701144   
3      0.021583     0.040279     0.276820     0.103191     0.702098   
4      0.028777     0.043311     0.354784     0.059127     0.905121   
..          ...          ...          ...          ...          ...   
135    0.971223     0.886459     0.824588     0.909996     0.061311   
136    0.978417     0.887985     0.883850     0.912975     0.080742   
137    0.985612     0.889432     0.874561     0.930662     0.051666   
138    0.992806     0.868581     0.836503     0.925419     0.043878   
139    1.000000     0.890073     1.000000     0.920951     0.079105   

     embd_mean_4  embd_mean_5  embd_mean_6  embd_mean_7  embd_mean_8  ...  \
0       0.383497     0.806467     0.106599     0.928309     0.339325  

In [9]:
print(text_df)

     Unnamed: 0  dataset_name            embd_mean_0           embd_mean_1  \
0           0.0          CoLA    -1.5040420293807983  -0.27932000160217285   
1           1.0          CoLA    -1.4834048748016357   -0.3452269434928894   
2           2.0          CoLA    -1.4579777717590332   -0.3115580379962921   
3           3.0          CoLA     -1.485048532485962   -0.2978118360042572   
4           4.0          CoLA    -1.4797974824905396    -0.272896409034729   
..          ...           ...                    ...                   ...   
163         0.0          SST2    0.13386693596839905  -0.15580838918685913   
164         NaN  dataset_name            embd_mean_0           embd_mean_1   
165         0.0          STSB  -0.029670491814613342  -0.17812450230121613   
166         NaN  dataset_name            embd_mean_0           embd_mean_1   
167         0.0          WNLI  -0.045235227793455124  -0.11147458106279373   

              embd_mean_2           embd_mean_3            embd

In [3]:
df = pd.DataFrame(features)
# print(len(TEXT_LOADERS))
df = df.fillna(0)

In [4]:
# print(df.dtypes)
import numpy as np
object_columns = df.select_dtypes(include=['object']).columns
df[object_columns] = df[object_columns].apply(
    lambda x : x[0]
)
df.to_csv('text_metafeatures.tsv', sep='\t', index=False, header=False)

In [5]:
def min_max_normalize(column):
    return (column - column.min()) / (column.max() - column.min())

# Normalize DataFrame by columns
normalized_df = df.apply(min_max_normalize)
normalized_df.to_csv(
    'text_metafeatures_normalized.tsv', 
    sep='\t', 
    index=False,
    header=False
    )
with open("data_labels.txt",'w') as _file:
    _file.write('\n'.join(labels))
    # _file.write('\n'.join(list(TEXT_LOADERS.keys())))

In [13]:
embd_cols = sorted(col for col in normalized_df.columns if 'embd' in col)
print(embd_cols)
normalized_df[embd_cols].to_csv(
    'text_metafeatures_normalized_embd.tsv',
    sep='\t',
    index=False,
    header=False
)
normalized_df.drop(columns=embd_cols).to_csv(
    'text_metafeatures_normalized_mfe.tsv',
    sep='\t',
    index=False,
    header=False
)

['embd_mean_0', 'embd_mean_1', 'embd_mean_10', 'embd_mean_11', 'embd_mean_12', 'embd_mean_13', 'embd_mean_14', 'embd_mean_15', 'embd_mean_16', 'embd_mean_17', 'embd_mean_18', 'embd_mean_19', 'embd_mean_2', 'embd_mean_20', 'embd_mean_21', 'embd_mean_22', 'embd_mean_23', 'embd_mean_24', 'embd_mean_25', 'embd_mean_26', 'embd_mean_27', 'embd_mean_28', 'embd_mean_29', 'embd_mean_3', 'embd_mean_30', 'embd_mean_31', 'embd_mean_32', 'embd_mean_33', 'embd_mean_34', 'embd_mean_35', 'embd_mean_36', 'embd_mean_37', 'embd_mean_38', 'embd_mean_39', 'embd_mean_4', 'embd_mean_40', 'embd_mean_41', 'embd_mean_42', 'embd_mean_43', 'embd_mean_44', 'embd_mean_45', 'embd_mean_46', 'embd_mean_47', 'embd_mean_48', 'embd_mean_49', 'embd_mean_5', 'embd_mean_50', 'embd_mean_51', 'embd_mean_52', 'embd_mean_53', 'embd_mean_54', 'embd_mean_55', 'embd_mean_56', 'embd_mean_57', 'embd_mean_58', 'embd_mean_59', 'embd_mean_6', 'embd_mean_60', 'embd_mean_61', 'embd_mean_62', 'embd_mean_63', 'embd_mean_64', 'embd_mean_65'

In [51]:
!code ./text_metafeatures.tsv

In [52]:
print(list(TEXT_LOADERS.keys()), sep='\n')

['get_cola_dataloader', 'get_mnli_dataloader', 'get_qnli_dataloader', 'get_rte_dataloader', 'get_sst2_dataloader', 'get_stsb_dataloader', 'get_wnli_dataloader']


In [5]:
import torchvision
from pymfe.mfe import MFE
rn18 = torchvision.models.resnet18()
extractor = MFE()
rn18_features = extractor.extract_from_model(
    rn18
)

TypeError: 'model' from type '<class 'torchvision.models.resnet.ResNet'>' not supported. Currently only supporting classes: [<class 'sklearn.tree._classes.DecisionTreeClassifier'>].

In [3]:
import test_model_metafeatures

ImportError: attempted relative import with no known parent package