### VM setup

In [1]:
# Mount Google Drive inside the Colab VM so the thesis data under /content/gdrive is reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Work from the dataset folder on the mounted Drive; later cells use paths relative to it.
%cd "/content/gdrive/MyDrive/Master Thesis/Data/Fuetal2017/"

/content/gdrive/MyDrive/Master Thesis/Data/Fuetal2017


In [3]:
# Pinned for reproducibility. %pip (instead of !pip) installs into the environment
# of the running kernel rather than whatever `pip` the subshell resolves to.
%pip install transformers==4.17

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.17
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 41.4 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 31.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.4 MB/s 
Building wheels for collected packages: s

In [4]:
# Pin the version this notebook was originally run with (see the recorded install log)
# so a fresh runtime does not silently pick up a newer, possibly incompatible release.
# %pip installs into the environment of the running kernel.
%pip install datasets==2.3.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 4.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 11.6 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 46.6 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 53.1 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |█████████████

## Download the Git repo from GitHub

In [10]:
# Helper script stored one folder above the dataset directory; according to the
# recorded output it clones the experiments repo into /content/experiments_chd.
!bash ../clone_repo.sh

Cloning into '/content/experiments_chd'...
remote: Enumerating objects: 160, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 160 (delta 1), reused 0 (delta 0), pack-reused 151[K
Receiving objects: 100% (160/160), 24.59 KiB | 94.00 KiB/s, done.
Resolving deltas: 100% (78/78), done.


In [11]:
# Bring the cloned repo up to date, then make sure we are (still) in the dataset folder.
# NOTE(review): a `!` command runs in its own subshell, so this %cd is presumably just a safeguard — confirm.
!bash ../pull_repo.sh
%cd "/content/gdrive/MyDrive/Master Thesis/Data/Fuetal2017/"

Already up to date.
/content/gdrive/MyDrive/Master Thesis/Data/Fuetal2017


### Imports

In [16]:
import sys

# Make the modules in the cloned repo's data_preparation folder importable.
# Any existing entry is removed first, so re-running this cell keeps the folder
# at the front of sys.path without stacking up duplicate entries.
repo_module_dir = "/content/experiments_chd/data_preparation/"
while repo_module_dir in sys.path:
  sys.path.remove(repo_module_dir)
sys.path.insert(0, repo_module_dir)

In [23]:
import glob
from data_loading import ChatHighlightData

ModuleNotFoundError: ignored

In [8]:
import datasets

## Data prep

### Create dataset

#### Load data from published dataset

In [None]:
# Sanity check of the working directory: the cells below expect final_data/, gt/
# and the three nalcs_*.txt split files to be listed here.
!ls

final_data  gt	nalcs_test.txt	nalcs_train.txt  nalcs_val.txt


In [None]:
# One text file per split (train / validation / test), each holding the
# filenames that belong to that split.
ds_info_files = dict(
  train="nalcs_train.txt",
  validation="nalcs_val.txt",
  test="nalcs_test.txt",
)

In [None]:
# For every split, read its info file and keep the last two space-separated
# fields of each line: the first goes to "chat", the second to "highlights".
# Result layout: ds_split_files["chat" | "highlights"][split] -> list of str.
ds_split_files = {"chat": dict(), "highlights": dict()}
for split, file in ds_info_files.items():
  chat, highlights = [], []
  with open(file, "r") as in_file:
    # Stream the file line by line instead of materialising it with readlines().
    for line_no, line in enumerate(in_file, start=1):
      if not line.strip():
        # A blank (e.g. trailing) line carries no entry; the zip(*...) transpose
        # used previously crashed on it with an opaque unpacking error.
        continue
      fields = line.rstrip("\n").split(" ")[-2:]
      if len(fields) != 2:
        raise ValueError(f"{file}:{line_no}: expected at least two space-separated fields, got {line!r}")
      chat.append(fields[0])
      highlights.append(fields[1])
  # An empty info file now simply yields two empty lists for the split.
  ds_split_files["chat"][split] = chat
  ds_split_files["highlights"][split] = highlights

In [None]:
# Input folders, relative to the working directory: ch_dir is handed to
# ChatHighlightData as chat_dir, hl_dir as highlight_dir.
ch_dir, hl_dir = "./final_data/", "./gt/"

In [None]:
# Training split. The identifier is presumably a glob over match names, selecting
# names that end in _g1 or _g3 — TODO confirm against data_loading.
chd_train = ChatHighlightData(
  chat_dir=ch_dir,
  highlight_dir=hl_dir,
  emote_dir=None,
  frame_rate=30,
)
chd_train.load_data(file_identifier="nalcs_w*_g[13]")

In [None]:
# Validation split. The identifier is presumably a glob over match names, selecting
# _g2 names whose week part starts with 1-4 — TODO confirm against data_loading.
chd_val = ChatHighlightData(
  chat_dir=ch_dir,
  highlight_dir=hl_dir,
  emote_dir=None,
  frame_rate=30,
)
chd_val.load_data(file_identifier="nalcs_w[1-4]*_g2")

In [None]:
# Test split. The identifier is presumably a glob over match names, selecting
# _g2 names whose week part starts with 5-9 — TODO confirm against data_loading.
chd_test = ChatHighlightData(
  chat_dir=ch_dir,
  highlight_dir=hl_dir,
  emote_dir=None,
  frame_rate=30,
)
chd_test.load_data(file_identifier="nalcs_w[5-9]*_g2")

In [None]:
# Number of matches per split (train, val, test).
# fu et al. 2017 report: 128/40/50
# The validation object is `chd_val`; the former `chd_eval` is not defined anywhere
# in this notebook and only worked through leftover kernel state.
len(chd_train.chat.keys()), len(chd_val.chat.keys()), len(chd_test.chat.keys())

(128, 40, 50)

In [None]:
# Consistency check, one printed line per split (train, val, test): True means the
# chat data and the highlight data are keyed by exactly the same match names.
print(
  *(
    sorted(split_data.chat.keys()) == sorted(split_data.highlights.keys())
    for split_data in (chd_train, chd_val, chd_test)
  ),
  sep="\n",
)

True
True
True


#### Load data into Hugging Face datasets and combine them into a DatasetDict

In [None]:
def chat_highlight_data_to_huggingface_dataset(chd):
  """Flatten a loaded chat/highlight container into a single Hugging Face Dataset.

  Args:
    chd: object exposing `chat` and `highlights`, two mappings keyed by match
      name whose values are sequences of the same length for a given match.

  Returns:
    A `datasets.Dataset` with the columns "messages", "highlights" and
    "match_name". Each row is one element of a match's chat sequence together
    with the highlight element at the same index and the name of the match.

  Raises:
    KeyError: if a match in `chd.chat` has no entry in `chd.highlights`.
    ValueError: if the chat and highlight sequences of any match differ in
      length. Previously this was only printed and the unequal sequences were
      still appended, which left the columns with different lengths or
      silently shifted against each other.
  """
  chat = list()
  highlights = list()
  match_name = list()
  mismatched = list()

  for m, ch in chd.chat.items():
    hl = chd.highlights[m]
    if len(ch) != len(hl):
      # Keep scanning so that every offending match is reported in one go.
      print("not matching lengths in:", m)
      mismatched.append(m)
      continue

    chat.extend(ch)
    highlights.extend(hl)
    # Repeat the match name once per row so every row can be traced back to its match.
    match_name.extend([m] * len(ch))

  if mismatched:
    raise ValueError(f"chat and highlights differ in length for matches: {mismatched}")

  return datasets.Dataset.from_dict({"messages": chat,
                                      "highlights": highlights,
                                      "match_name": match_name
                                    })

In [None]:
# Flatten the training matches into one Dataset (one row per chat element).
ds_train = chat_highlight_data_to_huggingface_dataset(chd_train)

In [None]:
# Flatten the validation matches into one Dataset (one row per chat element).
ds_val = chat_highlight_data_to_huggingface_dataset(chd_val)

In [None]:
# Flatten the test matches into one Dataset (one row per chat element).
ds_test = chat_highlight_data_to_huggingface_dataset(chd_test)

In [None]:
# Bundle the three splits into one DatasetDict under the keys "train", "val" and "test".
ds = datasets.DatasetDict(
  {
    "train": ds_train,
    "val": ds_val,
    "test": ds_test,
  }
)

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['messages', 'highlights', 'match_name'],
        num_rows: 10580401
    })
    val: Dataset({
        features: ['messages', 'highlights', 'match_name'],
        num_rows: 3305229
    })
    test: Dataset({
        features: ['messages', 'highlights', 'match_name'],
        num_rows: 3984081
    })
})

In [None]:
# List the working directory before the dataset folder is written into it.
!ls

final_data  gt	nalcs_test.txt	nalcs_train.txt  nalcs_val.txt


In [None]:
# Persist all splits. The path is relative, so the folder is created inside the
# current working directory (the dataset folder selected with %cd earlier).
ds.save_to_disk("fu2017_highlight_detection_dataset")

### Reload saved dataset from disk

In [None]:
# Reload the saved DatasetDict; the relative path is resolved against the current
# working directory, so the expensive preparation cells need not be re-run.
ds = datasets.load_from_disk("fu2017_highlight_detection_dataset")

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['messages', 'highlights', 'match_name'],
        num_rows: 10580401
    })
    val: Dataset({
        features: ['messages', 'highlights', 'match_name'],
        num_rows: 3305229
    })
    test: Dataset({
        features: ['messages', 'highlights', 'match_name'],
        num_rows: 3984081
    })
})