In [2]:
import numpy as np
import pandas as pd
from copy import deepcopy
from collections import defaultdict
import matplotlib.pyplot as plt
from auggam import analyze_helper
import pickle as pkl
# df = pd.read_csv('../results/datasets_ovw.csv', index_col=0)

In [6]:
def prep_for_printing(df):
    """Format the dataset-overview table for display / LaTeX export.

    The input frame is not modified: ``sort_values`` returns a new frame and
    all later edits are applied to that new frame.

    Params
    ------
    df : pd.DataFrame
        Must contain the columns 'n_train', 'num_classes' and 'imbalance'.
        Every column whose name does not contain 'imbalance' must be
        castable to int (a NaN in such a column makes ``astype(int)`` raise).

    Returns
    -------
    pd.DataFrame
        Frame whose non-imbalance columns are ints and whose imbalance
        column(s) are strings of the value rounded to 2 decimals, with
        columns / index renamed through ``analyze_helper.COLUMNS_RENAME_DICT``
        and ``analyze_helper.DSETS_RENAME_DICT`` and rows sorted by the
        renamed index.
    """
    # NOTE(review): the trailing sort_index() re-sorts the rows by (renamed)
    # index label, so this ordering does not survive in the returned frame
    # when index labels are unique. Kept so a missing 'n_train' still raises.
    df = df.sort_values('n_train')

    # pop + re-assign moves a column to the last position
    df['num_classes'] = df.pop('num_classes')  # move num_classes to end
    df['imbalance'] = df.pop('imbalance')  # move imbalance to end (after num_classes)
    df = df.infer_objects()

    for col_name in df.columns:
        if 'imbalance' in col_name:
            # fractions: keep two decimals and freeze the formatting as text
            df[col_name] = df[col_name].round(2).astype(str)
        else:
            # everything else is cast to a whole number
            df[col_name] = df[col_name].astype(int)

    df = df.rename(
        columns=analyze_helper.COLUMNS_RENAME_DICT,
        index=analyze_helper.DSETS_RENAME_DICT,
    ).sort_index()

    return df

# Format floats with thousands separators when pandas renders them.
pd.set_option('display.float_format', '{:,}'.format)

# Dataset display names; not used within this cell.
col_order = [
    'Financial phrasebank',
    'Rotten tomatoes',
    'SST2',
    'Emotion',
    'Tweet (Hate)',
]

# NOTE(review): `df` is not assigned in any visible cell (the read_csv in the
# import cell is commented out), so this relies on existing kernel state.
print(prep_for_printing(df).T.to_latex())

\begin{tabular}{llllll}
\toprule
{} & Emotion & Financial phrasebank & Rotten tomatoes &    SST2 & Tweet (Hate) \\
\midrule
Samples (train)         &   16000 &                 2313 &            8530 &   67349 &         9000 \\
Samples (val)           &    2000 &                 1140 &            1066 &     872 &         1000 \\
Unigrams                &   15165 &                 7169 &           16631 &   13887 &        18477 \\
Bigrams                 &  106201 &                28481 &           93921 &   72501 &       106277 \\
Trigrams                &  201404 &                39597 &          147426 &  108800 &       171768 \\
Classes                 &       6 &                    3 &               2 &       2 &            2 \\
Majority class fraction &    0.34 &                 0.62 &             0.5 &    0.56 &         0.58 \\
\bottomrule
\end{tabular}



  print(prep_for_printing(df).transpose().to_latex())


In [10]:
# Display the transposed table with its columns selected in `col_order` order
# (an empty subscript `[]` is a syntax error).
prep_for_printing(df).transpose()[col_order]

Unnamed: 0,Financial phrasebank,Rotten tomatoes,SST2,Emotion,Tweet (Hate)
Samples (train),2313.0,8530.0,67349.0,16000.0,9000.0
Samples (val),1140.0,1066.0,872.0,2000.0,1000.0
Unigrams,7169.0,16631.0,13887.0,15165.0,18477.0
Bigrams,28481.0,93921.0,72501.0,106201.0,106277.0
Trigrams,39597.0,147426.0,108800.0,201404.0,171768.0
Classes,3.0,2.0,2.0,6.0,2.0
Majority class fraction,0.62,0.5,0.56,0.34,0.58


**Print info about counts (manually copied this into the table)**

In [None]:
# Load the pickled counts; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('results/datasets_ovw.pkl', 'rb') as f:
    counts = pkl.load(f)
def plot_counts(counts):
    """Draw a log-scaled histogram of per-trigram occurrence counts.

    Draws onto the current matplotlib figure/axes via the pyplot interface;
    nothing is returned.

    Params
    ------
    counts
        Indexable object whose first element (``counts[0]``) exposes
        ``.tolist()``; that element is converted to an array and squeezed
        before plotting.
        NOTE(review): presumably a 1 x n_trigrams count matrix -- confirm.
    """
    x = np.array(counts[0].tolist()).squeeze()
    plt.hist(x, bins=100)
    plt.yscale('log')  # log y-axis for the bin heights
    plt.xlabel('Count of occurrences of trigram in training dataset')
    plt.ylabel('Count of trigrams')
# plot_counts(counts['emotion_trigram'])
# For each dataset in df's index, print the fraction of its trigram counts
# that equal exactly 1, separated by ' & ' (LaTeX table cell delimiter).
for dset_name in df.index.values:
    trigram_key = dset_name + '_trigram'
    counts_dset = np.array(counts[trigram_key]).squeeze()
    num_seen_once = (counts_dset == 1).sum()
    frac_seen_once = num_seen_once / len(counts_dset)
    print(f'{frac_seen_once:0.2f}', end=' & ')

In [39]:
import datasets
# Load the 'cb' configuration of 'super_glue' via datasets.load_dataset;
# per the saved log below, the first call downloads into the local cache.
d = datasets.load_dataset('super_glue', 'cb')

Downloading and preparing dataset super_glue/cb to /home/chansingh/.cache/huggingface/datasets/super_glue/cb/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/75.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/56 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /home/chansingh/.cache/huggingface/datasets/super_glue/cb/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [40]:
# Display the loaded object (its splits, features and row counts).
d

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 56
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 250
    })
})

In [36]:
# Keep a handle on the 'train' split of `d`.
# NOTE(review): this cell's execution count (36) predates the cell that
# loads `d` above (39), so `d` may have held a different dataset -- confirm.
dt = d['train']

In [37]:
# Distinct values of the 'coarse_label' column of the train split.
# NOTE(review): 'coarse_label' is not among the features displayed for
# super_glue/cb (premise, hypothesis, idx, label); the saved output
# presumably came from a different dataset held in `d` earlier -- confirm.
np.unique(dt['coarse_label'])

array([0, 1, 2, 3, 4, 5])

In [15]:
# Display `dts`.
# NOTE(review): `dts` is not assigned in any visible cell; this relies on
# earlier kernel state and will fail on a fresh Restart & Run All -- confirm.
dts

{'text': ['Measured Fed Rate Hike Sends Stocks Up  NEW YORK (Reuters) - U.S. stocks rose on Wednesday after  the Federal Reserve reassured by indicating that further  interest-rate rises were likely to remain at a "measured" pace,  as it increased rates by a quarter point.',
  'Judge in London gives okay to let premature baby die LONDON : Doctors caring for a critically-ill premature baby, Charlotte Wyatt, were given permission by a British judge to allow her to die if her condition seriously deteriorates and her breathing stops.',
  'Astros Beat Cards, Take NLCS Series Lead HOUSTON - On a night when pitching suddenly took over the NL playoffs, someone was bound to get a hit. Fortunately for the Houston Astros, Jeff Kent stepped up...',
  'NVIDIA Is Vindicated The graphics chip company recaptures the high-end segment and quadruples profits.',
  'Apple launches 60GB, 40GB iPod Photo At a special music event featuring Bono and The Edge from U2, Apple yesterday unveiled the iPod Photo wit