In [1]:
# Load the "autoreload" extension so that code can change
%load_ext autoreload

# Always reload modules so that as you change code in src, it gets loaded
%autoreload 2


# Load the data for Spacy's TextCategorizer

### Import statements and logging initiation

In [2]:
import logging
import pandas as pd
import random
import thinc.extra.datasets
from src.data import make_dataset

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.info('making final data set from raw data')

INFO:root:making final data set from raw data


### Extract the Yahoo! dataset

In [3]:
%%time
# load the data
train,test,classes=make_dataset.untar("../data/raw/yahoo_answers_csv.tar.gz")
data_train = make_dataset.straighten(train)
data_test  = make_dataset.straighten(test )
# print statistics
logging.info(f"Number of training samples: {len(data_train):10}")
logging.info(f"Number of test     samples: {len(data_test ):10}")

INFO:root:Number of training samples:    3543746
INFO:root:Number of test     samples:     151860
Wall time: 36 s


### Class balancing

The dataset is almost already balanced, with the exception of the missing entries.

In [4]:
train.groupby(['classid']).count()

Unnamed: 0_level_0,title,qtext,answer,class
classid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,140000,86080,138700,140000
2,140000,67706,139991,140000
3,140000,79640,136996,140000
4,140000,69812,137633,140000
5,140000,78944,134149,140000
6,140000,65922,139890,140000
7,140000,64038,137916,140000
8,140000,81536,137577,140000
9,140000,92409,133902,140000
10,140000,82238,138667,140000


### Compare figures with the thinc dataset

In [5]:
thinc_data, _ = thinc.extra.datasets.imdb()           # TODO: Replace with text extraction: list of tuples (text,class) <-- not needed
logging.info(f"Number of training samples: {len(thinc_data):10}")

INFO:root:Number of training samples:      25000


### Restrict the training set to a random subset

Seed the random call with `SEED_TRAINING_SET_SAMPLER` to allow repetitive training.

Also set the training set size `limit` to enforce the same dataset size as in spacy's example.

In [6]:
limit=25000

In [7]:
# Partition off part of the train data for evaluation
random.shuffle(thinc_data)                            # TODO: Add seed control
thinc_data = thinc_data[-limit:]

In [8]:
SEED_TRAINING_SET_SAMPLER = 101
# Create a seedable random shuffler
random_shuffler = random.Random(SEED_TRAINING_SET_SAMPLER)
# Combine texts with their categories to shuffle them together, then sumsample a random training set
# TODO: replace this subsampling step with a balanced sampling from each class
random_shuffler.shuffle( data_train )
data_train = data_train[-limit:]

### Create the categories dictionary for each sample. 

In [25]:
# Transform the 1D arrays X into lists and the Y array into categories, using the classes list [{"CLASS1":1,"CLASS2":2,...}]
texts, labels = zip(*thinc_data)
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]      # TODO: Replace with classes: list of dict with true/false value for each class

In [10]:
# Transform the 1D arrays X into lists and the Y array into categories, using the classes list [{"CLASS1":1,"CLASS2":2,...}]
texts, labels = zip(*data_train)
cats=[ pd.DataFrame(classes['classid']==y,index=classes['class']).to_dict()['classid'] for y in labels]

Verify the categories list for text 32.

In [11]:
cats[32], labels[32]

({'Society & Culture': nan,
  'Science & Mathematics': nan,
  'Health': nan,
  'Education & Reference': nan,
  'Computers & Internet': nan,
  'Sports': nan,
  'Business & Finance': nan,
  'Entertainment & Music': nan,
  'Family & Relationships': nan,
  'Politics & Government': nan},
 8)

### Split training set into training and validation sets.

In [15]:
split=0.8

In [16]:
split = int(len(thinc_data) * split)
# return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [30]:
split=0.8

In [31]:
split = int(len(data_train) * split)
# return (texts[:split], cats[:split]), (texts[split:], cats[split:])

### Put it all together

In [14]:
# load_data()