In [None]:
# Sanity check: ask PyTorch how many CUDA devices it can see in this environment
import torch
torch.cuda.device_count()

In [1]:
from datasets import load_from_disk
import torch

### Preprocess the dataset

In [2]:
# Load the previously saved dataset from disk; later cells index it by the
# "train", "validation" and "test" splits
ds = load_from_disk("/data3/mmendieta/Violence_data/geo_corpus.0.0.1_datasets")

In [None]:
# Peek at one sample: the first record of the train split
ds["train"][0]

This is a multi-label classification problem with 6 labels: ('pre7geo10', 'pre7geo30', 'pre7geo50', 'post7geo10', 'post7geo30', 'post7geo50').

In [3]:
# Identify the unnecessary columns: anything in the train split that is not
# listed in keep_cols is collected in remove_columns
keep_cols = [
    'tweetid', 'text', 'geo_x', 'geo_y', 'lang',
    'pre7geo10', 'pre7geo30', 'pre7geo50',
    'post7geo10', 'post7geo30', 'post7geo50',
]
remove_columns = list(
    filter(lambda name: name not in keep_cols, ds['train'].column_names)
)

In [4]:
# Drop the columns listed in remove_columns and rebind ds to the result
ds = ds.remove_columns(remove_columns)

In [5]:
# Inspect the train split's schema (column names and dtypes) after dropping columns
ds["train"].features

{'tweetid': Value(dtype='int64', id=None),
 'geo_x': Value(dtype='float64', id=None),
 'geo_y': Value(dtype='float64', id=None),
 'lang': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'post7geo10': Value(dtype='int64', id=None),
 'post7geo30': Value(dtype='int64', id=None),
 'post7geo50': Value(dtype='int64', id=None),
 'pre7geo10': Value(dtype='int64', id=None),
 'pre7geo30': Value(dtype='int64', id=None),
 'pre7geo50': Value(dtype='int64', id=None)}

In [6]:
# We need to cast the integer labels to float in order to calculate the Binary
# Cross Entropy loss during training
from datasets import Value

# Label columns whose dtype is changed from int64 to float32
label_columns = ['post7geo10', 'post7geo30', 'post7geo50',
                 'pre7geo10', 'pre7geo30', 'pre7geo50']

# Start from the current train schema and override only the dtypes that change
new_features = ds["train"].features.copy()
# tweetid is cast from int64 to string (not to another integer type);
# the original note says this is done to avoid errors
new_features['tweetid'] = Value(dtype='string')
for column in label_columns:
    new_features[column] = Value(dtype='float32')

# Apply the same schema to every split
for split in ("train", "validation", "test"):
    ds[split] = ds[split].cast(new_features)

Loading cached processed dataset at /data3/mmendieta/Violence_data/geo_corpus.0.0.1_datasets/train/cache-8b2a0d7a6205e985.arrow
Loading cached processed dataset at /data3/mmendieta/Violence_data/geo_corpus.0.0.1_datasets/validation/cache-85273bc4be4fa313.arrow
Loading cached processed dataset at /data3/mmendieta/Violence_data/geo_corpus.0.0.1_datasets/test/cache-36890463b6b802a7.arrow


In [7]:
# Inspect the train split's schema again to confirm the dtypes after the cast
ds["train"].features

{'tweetid': Value(dtype='string', id=None),
 'geo_x': Value(dtype='float64', id=None),
 'geo_y': Value(dtype='float64', id=None),
 'lang': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'post7geo10': Value(dtype='float32', id=None),
 'post7geo30': Value(dtype='float32', id=None),
 'post7geo50': Value(dtype='float32', id=None),
 'pre7geo10': Value(dtype='float32', id=None),
 'pre7geo30': Value(dtype='float32', id=None),
 'pre7geo50': Value(dtype='float32', id=None)}

In [8]:
# This cell takes approximately 4 min to run
# It is important that the labels are float in order to calculate Binary Cross Entropy loss
# Create the 'labels' column: one list per row holding the values of the label columns

# Define columns to ignore (everything else is treated as a label column)
ignore_columns = ["tweetid", "geo_x", "geo_y", "lang", "text"]

# Only the test set is processed here.
# NOTE(review): the label vector follows the dataset's column order, which per the
# features listing is post7geo* before pre7geo* -- confirm this matches the order
# the model expects.
cols = [col for col in ds["test"].column_names if col not in ignore_columns]

# Guard against re-running the cell: once 'labels' exists the source columns are
# gone, and mapping again would wrap the existing list inside another list.
if "labels" not in cols:
    # Map function to create labels; the individual label columns are dropped
    ds["test"] = ds["test"].map(lambda x: {"labels": [x[c] for c in cols]}, remove_columns=cols)

ds['test']

  0%|          | 0/2329158 [00:00<?, ?ex/s]

Dataset({
    features: ['tweetid', 'geo_x', 'geo_y', 'lang', 'text', 'labels'],
    num_rows: 2329158
})

In [9]:
# Check one converted record: the first row of the test split, now with a 'labels' list
ds["test"][0]

{'tweetid': '388328898662268928',
 'geo_x': 35.49442,
 'geo_y': 33.888940000000005,
 'lang': 'en',
 'text': 'talking abt my case ☺️',
 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}

In [10]:
# Save the processed test split to disk (the target path names it as the dataset for inference)
ds['test'].save_to_disk("/data3/mmendieta/Violence_data/geo_corpus.0.0.1_dataset_for_inference")