In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torchvision
import torch.nn.functional as F
from torch import nn
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# manage ray's relative imports
# import ray
# runtime_env = {"working_dir": ".." }
# ray.init(runtime_env=runtime_env, dashboard_port=13065, include_dashboard=True)

from ray import tune
from ray.tune.suggest.optuna import OptunaSearch
from ray.tune import JupyterNotebookReporter

# make the local beam package importable as `src.beam` by adding '..' to sys.path
import sys
sys.path.append('..')

from src.beam import beam_arguments, Experiment, Study
from src.beam import UniversalDataset, UniversalBatchSampler, PackedFolds
from src.beam import Algorithm
from src.beam import LinearNet, check_type, slice_to_index
from torchvision import transforms
import matplotlib.pyplot as plt

from src.beam import DataTensor
from src.beam.utils import is_notebook

from sklearn.datasets import fetch_covtype
import pandas as pd

In [2]:
# Load the Covertype data through scikit-learn. The returned object is read by
# string key ('data', 'feature_names', 'target') in the next cell.
# NOTE(review): the name `dataset` is rebound to a UniversalDataset further down,
# so the unpacking cell cannot be re-run after that point without re-running this one.
dataset = fetch_covtype()

In [3]:
# Pull the feature matrix, the feature names and the labels out of the fetched
# dataset in a single unpacking step (same keys, same order as before).
data, columns, y = (dataset[key] for key in ('data', 'feature_names', 'target'))

In [11]:
def _onehot_to_index(frame, group_columns):
    """Collapse a group of one-hot indicator columns into one integer code per row.

    Parameters
    ----------
    frame : pd.DataFrame
        Table holding the indicator columns.
    group_columns : list of str
        Names of the indicator columns that together encode one categorical variable.

    Returns
    -------
    np.ndarray
        For every row of ``frame``, the position (within ``group_columns``) of its
        single non-zero indicator.

    Raises
    ------
    ValueError
        If ``group_columns`` is empty, or if any row does not have exactly one
        non-zero indicator.
    """
    if not group_columns:
        raise ValueError('no indicator columns to decode')

    # Same truthiness rule np.where applies: anything non-zero counts as "set".
    active = frame[group_columns].values != 0
    hits_per_row = active.sum(axis=1)

    # np.where(...)[1] emits one entry per non-zero cell, not per row, so a row with
    # zero or several set indicators would silently shift or resize the codes.
    # Validate first, then take the row-wise argmax, which always yields one code per row.
    if not (hits_per_row == 1).all():
        n_bad = int((hits_per_row != 1).sum())
        raise ValueError(
            f'{n_bad} row(s) are not exactly one-hot over {len(group_columns)} columns'
        )
    return active.argmax(axis=1)


# Raw feature table, one column per feature name.
df = pd.DataFrame(data=data, columns=columns, index=np.arange(len(data)))

# Indicator column groups are identified by substring match on the column name.
soils_columns = [c for c in df.columns if 'Soil' in c]
wilderness_columns = [c for c in df.columns if 'Wilderness' in c]

# One integer code per row for each categorical variable.
soil = _onehot_to_index(df, soils_columns)
wilderness = _onehot_to_index(df, wilderness_columns)

# Build the categorical frame on df's own index so the concat below aligns row-for-row.
df_cat = pd.DataFrame({'Soil': soil, 'Wilderness': wilderness}, index=df.index)

# Everything that is not an indicator column is kept as a numeric feature.
df_num = df.drop(columns=(soils_columns+wilderness_columns))

# Final table: numeric features first, then the two categorical codes.
covtype = pd.concat([df_num, df_cat], axis=1)

In [12]:
# Wrap the combined numeric + categorical feature matrix and the labels in beam's
# dataset container. This rebinds `dataset`, which previously held the fetched raw data.
# NOTE(review): labels are passed unchanged; the value_counts output further down shows
# classes 1-7 (not zero-based) — confirm the downstream consumer expects that.
dataset = UniversalDataset(x=covtype.values, y=y)

In [13]:
# Carve out validation and test subsets with a fixed seed, stratified on the labels.
# NOTE(review): .2 is presumably the fraction of samples assigned to each subset —
# confirm against UniversalDataset.split.
dataset.split(validation=.2, test=.2, seed=5782, stratify=True, labels=dataset.data['y'])

In [37]:
# Build the samplers with oversampling switched on.
# NOTE(review): the two positional 128s are presumably the train / eval batch sizes and
# weight_factor presumably tempers the class re-weighting — confirm against
# UniversalDataset.build_samplers. The value_counts outputs below show a flatter class
# mix for the validation sampler than for the raw validation split, so oversampling
# seems to affect evaluation sampling too; check that this is intended.
dataset.build_samplers(128, 128, oversample=True, weight_factor=.5)

In [38]:
# Create the data loaders from the dataset.
# NOTE(review): `dataloaders` is not used anywhere in the visible part of the notebook.
dataloaders = dataset.build_dataloaders()

In [39]:
# Relative class frequencies of the labels selected by the raw validation split.
all_labels = dataset.data['y']
validation_rows = dataset.indices_split['validation']
pd.Series(all_labels[validation_rows]).value_counts(normalize=True)

2    0.487595
1    0.364603
3    0.061539
7    0.035300
6    0.029896
5    0.016342
4    0.004724
dtype: float64

In [40]:
# NOTE(review): scratch arithmetic with unexplained constants; nothing else in the visible
# notebook uses the result. Name the operands (and say where they come from) or drop the cell.
1428498 / 549

2602.0

In [41]:
# Relative class frequencies of the labels addressed by the validation sampler's
# `indices`, for comparison with the raw validation split.
label_store = dataset.data['y']
sampler_rows = dataset.samplers['validation'].indices
pd.Series(label_store[sampler_rows]).value_counts(normalize=True)

2    0.328986
1    0.288417
3    0.118120
7    0.089110
6    0.082076
5    0.060644
4    0.032646
dtype: float64

In [25]:
# Number of entries in the validation sampler's `indices`.
# NOTE(review): this cell carries execution count 25 while the samplers were last built
# at count 37, so the displayed length may predate the current sampler settings —
# re-run top to bottom to confirm.
len(dataset.samplers['validation'].indices)

9999068

In [16]:
# NOTE(review): late import — consider moving it into the import cell at the top of the
# notebook so a fresh-kernel run surfaces missing dependencies immediately.
from src.beam.model import BetterEmbedding

In [19]:
# Instantiate the embedding module for the column layout of `covtype`.
# NOTE(review): all arguments are positional and their meaning is not visible here.
# Presumably, in order: indices of the numeric columns (0-9), indices of the categorical
# columns (10-11), 20 = unknown (a bin/quantile count?), the number of distinct values of
# each categorical column, and 128 = embedding width (it matches the last dimension of the
# output shape displayed by the next cell). Confirm against BetterEmbedding's signature
# and prefer keyword arguments.
be = BetterEmbedding(torch.arange(10), torch.arange(10,12), 20, torch.tensor(df_cat.nunique().values), 128)

In [21]:
# Smoke test: push the 'x' features of two samples (slice 1:3) through the embedding
# and display the shape of the result.
be(dataset[1:3]['x']).shape

torch.Size([2, 12, 128])