In [1]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_from_disk
from argparse import Namespace

In [2]:
# Run configuration: number of rows to keep from each split and the shuffle seed.
config = dict(
    train_subset=3_000_000,
    valid_subset=800_000,
    test_subset=400_000,
    seed=42,
)

# Attribute-style access (args.seed, args.train_subset, ...) for the cells below.
args = Namespace(**config)

### Read the dataset

In [3]:
# Load the dataset previously saved on disk; the cells below index it by the
# 'train', 'validation' and 'test' splits.
# NOTE(review): the path suggests pre-computed LaBSE hidden states — confirm.
violence_hidden = load_from_disk("../../Violence_data/geo_corpus.0.0.1_datasets_hidden_labse")

In [4]:
# Remove unnecessary columns: keep only the hidden state and the six label columns.
keep_cols = [
    'hidden_state',
    'pre7geo10', 'pre7geo30', 'pre7geo50',
    'post7geo10', 'post7geo30', 'post7geo50',
]
# Everything present in the train split that is not explicitly kept gets dropped.
remove_columns = [
    name
    for name in violence_hidden['train'].column_names
    if name not in keep_cols
]

In [5]:
# Drop the columns collected in `remove_columns` (computed from the train split).
# NOTE(review): assumes every split has the same columns as 'train' — confirm.
violence_hidden = violence_hidden.remove_columns(remove_columns)

In [6]:
# Extract a reproducible random subset of each split.
def _take_subset(split, n_rows):
    """Shuffle `split` with the configured seed and keep at most `n_rows` rows.

    The requested size is capped at the split's length, so a `*_subset` value
    larger than the split returns the whole shuffled split instead of asking
    `select` for row indices that do not exist.
    """
    n_keep = min(n_rows, len(split))
    return split.shuffle(seed=args.seed).select(range(n_keep))


train_clf_ds = _take_subset(violence_hidden["train"], args.train_subset)
validation_clf_ds = _take_subset(violence_hidden["validation"], args.valid_subset)
test_clf_ds = _take_subset(violence_hidden["test"], args.test_subset)

### Preprocess the labels

In [9]:
# Binary Cross Entropy loss needs float targets, so the integer label columns
# are cast to float32 before training.
from datasets import Value

# Start from the train split's schema and override only the label columns.
new_features = train_clf_ds.features.copy()
for label_col in (
    'post7geo10', 'post7geo30', 'post7geo50',
    'pre7geo10', 'pre7geo30', 'pre7geo50',
):
    new_features[label_col] = Value(dtype='float32')

# Apply the same schema to all three splits.
train_clf_ds = train_clf_ds.cast(new_features)
validation_clf_ds = validation_clf_ds.cast(new_features)
test_clf_ds = test_clf_ds.cast(new_features)

Casting the dataset:   0%|          | 0/300 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/80 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/40 [00:00<?, ?ba/s]

In [10]:
# Create a 'labels' column for each split.
# The label vector holds every column except 'hidden_state', in the column
# order of the train split.
cols = train_clf_ds.column_names


def _to_label_vector(example):
    """Gather all non-'hidden_state' values of one example into a single list."""
    return {"labels": [example[c] for c in cols if c != "hidden_state"]}


# Train split
train_clf_ds = train_clf_ds.map(_to_label_vector)
# Validation split
validation_clf_ds = validation_clf_ds.map(_to_label_vector)
# Test split
test_clf_ds = test_clf_ds.map(_to_label_vector)

  0%|          | 0/3000000 [00:00<?, ?ex/s]

  0%|          | 0/800000 [00:00<?, ?ex/s]

  0%|          | 0/400000 [00:00<?, ?ex/s]

In [11]:
# List every column other than 'labels' and 'hidden_state'; these are the
# columns that get dropped from each split in the next cell.
col_names = train_clf_ds.column_names
for retained in ("labels", "hidden_state"):
    # list.remove raises ValueError if a retained column is unexpectedly missing.
    col_names.remove(retained)

In [12]:
# Drop the individual label columns from all three splits, leaving only
# 'hidden_state' and 'labels'.
train_clf_ds, validation_clf_ds, test_clf_ds = (
    split.remove_columns(col_names)
    for split in (train_clf_ds, validation_clf_ds, test_clf_ds)
)

In [19]:
# Inspect the train split: only 'hidden_state' and 'labels' should remain.
train_clf_ds

Dataset({
    features: ['hidden_state', 'labels'],
    num_rows: 3000000
})

In [21]:
from collections import defaultdict
from datasets import DatasetDict

In [32]:
# Empty container that will hold the processed train/validation/test splits.
ds_complete = DatasetDict()

In [33]:
# Display the container before any split has been added.
ds_complete

DatasetDict({
    
})

In [34]:
# Register each processed split under its conventional split name.
for split_name, split_ds in (
    ("train", train_clf_ds),
    ("validation", validation_clf_ds),
    ("test", test_clf_ds),
):
    ds_complete[split_name] = split_ds

In [35]:
# Display the assembled splits with their features and row counts.
ds_complete

DatasetDict({
    train: Dataset({
        features: ['hidden_state', 'labels'],
        num_rows: 3000000
    })
    validation: Dataset({
        features: ['hidden_state', 'labels'],
        num_rows: 800000
    })
    test: Dataset({
        features: ['hidden_state', 'labels'],
        num_rows: 400000
    })
})

In [38]:
# Display the assembled splits once more as a final sanity check.
ds_complete

DatasetDict({
    train: Dataset({
        features: ['hidden_state', 'labels'],
        num_rows: 3000000
    })
    validation: Dataset({
        features: ['hidden_state', 'labels'],
        num_rows: 800000
    })
    test: Dataset({
        features: ['hidden_state', 'labels'],
        num_rows: 400000
    })
})

In [37]:
# Persist the subsetted splits (hidden states + label vectors) to disk so they
# can be reloaded with `load_from_disk`.
ds_complete.save_to_disk("../../Violence_data/geo_corpus.0.0.1_datasets_hidden_labse_subset")