In [1]:
import zipfile
import re
import html
from datasets import load_dataset
from argparse import Namespace

In [2]:
# Candidate values for 'dataset_name':
# "/data3/mmendieta/Violence_data/case_studies/Russia_Ukraine_combined_with_labels.csv"
# "/data3/mmendieta/Violence_data/case_studies/Israel_Oct7_23_combined_with_labels.csv"
# "/data3/mmendieta/Violence_data/case_studies/Trump_Capitol_Hill_combined_with_labels.csv"

# Notebook-wide settings, kept in one place so a reader can tune them here.
config = dict(
    cuda_device=15,
    path_to_model_on_disk="/data3/mmendieta/models/xlmt_finetuned_twitter/",
    model_ckpt="m2im/XLM-T_finetuned_violence_twitter",
    max_length=32,
    dataset_name="/data3/mmendieta/Violence_data/case_studies/Russia_Ukraine_combined_with_labels.csv",
    batch_size=1,
)

# Expose the same settings as attributes (args.dataset_name, args.max_length, ...).
args = Namespace(**config)

In [7]:
# Load the dataset from disk (516ms)
%time ds = load_dataset('csv', data_files= args.dataset_name, sep=",")

Using custom data configuration default-b5a71faa7ad35e3f
Found cached dataset csv (/home/mmendieta/.cache/huggingface/datasets/csv/default-b5a71faa7ad35e3f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 43 ms, sys: 7.39 ms, total: 50.4 ms
Wall time: 335 ms


In [8]:
# Display the loaded DatasetDict to check its splits, column names and row count.
ds

DatasetDict({
    train: Dataset({
        features: ['ID', 'AuthorAlias', 'AuthorID', 'CreatedDate', 'Text', 'post7geo10', 'post7geo30', 'post7geo50', 'pre7geo10', 'pre7geo30', 'pre7geo50'],
        num_rows: 1020
    })
})

In [9]:
# Work with a lower-case 'text' column from here on; the result is bound to a
# new name, so `ds` keeps pointing at the dataset as loaded.
dataset = ds.rename_column(
    original_column_name="Text",
    new_column_name="text",
)

In [10]:
# Display the renamed dataset to confirm the column is now listed as 'text'.
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'AuthorAlias', 'AuthorID', 'CreatedDate', 'text', 'post7geo10', 'post7geo30', 'post7geo50', 'pre7geo10', 'pre7geo30', 'pre7geo50'],
        num_rows: 1020
    })
})

### Define a pre-processing function to use with datasets.map()

In [11]:
def clean_tweet(example):
    """Clean the tweet stored under ``example['text']``.

    The steps run in this order: newlines become spaces, HTML entities are
    decoded, @-mentions are removed, URLs are removed, the standalone retweet
    marker "RT " is removed, and leading/trailing whitespace is stripped.

    Args:
        example: Mapping with a string under the key ``'text'``. The value
            must not be ``None``; string methods are called on it directly.

    Returns:
        dict: ``{'text': cleaned_tweet}``.
    """
    tweet = example['text']
    tweet = tweet.replace("\n", " ")  # replace newlines with spaces
    tweet = html.unescape(tweet)  # decode HTML entities such as &amp;
    # The character class includes ':' so the colon in "RT @user:" goes with the mention.
    tweet = re.sub("@[A-Za-z0-9_:]+", "", tweet)  # remove mentions
    tweet = re.sub(r'http\S+', '', tweet)  # remove urls
    # Remove the retweet marker. The leading \b keeps words that merely end in
    # "RT" (e.g. "ALERT ", "SMART ") intact; the unanchored pattern 'RT '
    # used to cut them short.
    tweet = re.sub(r'\bRT ', '', tweet)
    return {'text': tweet.strip()}  # strip surrounding whitespace

In [12]:
# Filter rows with blank tweets for pre-processing
%time dataset = dataset.filter(lambda x: x["text"] is not None)

  0%|          | 0/2 [00:00<?, ?ba/s]

CPU times: user 59.7 ms, sys: 3.56 ms, total: 63.3 ms
Wall time: 59 ms
