# Exploration of the SVQTD dataset

In [1]:
import itertools
import pandas as pd

## Read data from provided csv files
Set the feature column data types and fix the misspelled column name (`Openess` → `Open`) in the test dataset.

In [2]:
# Annotation columns shared by the dev/test/train label files.
feature_cols = ["Chest", "Head", "Open", "Breathy", "Vibrato", "Front", "Back"]
# Parse every annotation column as an integer.
dtypes_ = {col: int for col in feature_cols}

dev = pd.read_csv("../SVQTD/dev.csv", dtype=dtypes_)
# seg_time_lists.csv holds only segment timings (no annotation columns),
# so the feature dtype map does not apply to it.
seg_time_lists = pd.read_csv("../SVQTD/seg_time_lists.csv")
# test.csv spells the "Open" column as "Openess": type it under that name and
# rename it as part of loading, so `test` is never mutated in place.
test = pd.read_csv("../SVQTD/test.csv", dtype=dtypes_ | {"Openess": int}).rename(
    columns={"Openess": "Open"}
)
train = pd.read_csv("../SVQTD/train.csv", dtype=dtypes_)

In [3]:
# Show the segment timing table: one row per (name, num) with its time_start / time_end.
seg_time_lists

Unnamed: 0,name,num,time_start,time_end
0,test,0,0.000,0.000
1,0HVjTPqe5I0,0,8.893,17.323
2,0HVjTPqe5I0,1,18.053,28.832
3,0HVjTPqe5I0,2,39.120,53.974
4,0HVjTPqe5I0,3,56.265,64.498
...,...,...,...,...
7351,-kbi1EMcD3E,9,2.554,17.679
7352,-kbi1EMcD3E,10,18.072,30.581
7353,-kbi1EMcD3E,11,30.849,34.994
7354,-kbi1EMcD3E,12,35.098,156.263


In [4]:
# Bundle the three label splits under their names; later cells iterate over this mapping.
datasets = dict(dev=dev, test=test, train=train)

# Print each split's name followed by a rich display of its frame.
for key in datasets:
    data_ = datasets[key]
    print(key)
    display(data_)

dev


Unnamed: 0,Link,seg_num,Chest,Head,Open,Breathy,Vibrato,Front,Back,song
0,yp3Vi1CspgE,6,2,2,0,0,1,1,1,Vesti_la_giubba
1,QGrQgrHBzRw,3,2,1,0,1,0,0,0,Una_furtiva_lagrima
2,7K6YTjuqgBQ,2,3,2,0,0,1,0,0,Una_furtiva_lagrima
3,mONTSRzdRZw,9,3,3,0,0,1,0,0,Vesti_la_giubba
4,h8_AfrqbT5k,0,2,2,0,0,1,0,0,Una_furtiva_lagrima
...,...,...,...,...,...,...,...,...,...,...
874,7K6YTjuqgBQ,7,3,2,0,0,1,0,0,Una_furtiva_lagrima
875,1dfyre45UPE,15,3,2,0,0,1,0,0,Una_furtiva_lagrima
876,iyQyaIl2rfI,7,2,1,0,0,1,0,0,Una_furtiva_lagrima
877,7L6eCz54_d8,17,2,2,0,0,1,0,0,Una_furtiva_lagrima


test


Unnamed: 0,Link,seg_num,Head,Chest,Open,Breathy,Vibrato,Front,Back,song
0,tj--7L9MJTo,0,0,0,2,1,2,1,1,Una_furtiva_lagrima
1,zb7tCie8sFE,14,3,3,0,0,0,0,0,Una_furtiva_lagrima
2,230L9mf9BRY,0,2,2,0,0,1,0,0,Una_furtiva_lagrima
3,1VCAZQ9M22M,6,2,2,0,0,1,0,0,Una_furtiva_lagrima
4,1MFnSZ1GPMU,9,3,2,0,0,1,0,0,Una_furtiva_lagrima
...,...,...,...,...,...,...,...,...,...,...
874,vtdBBl0RkQo,1,1,1,1,0,0,0,0,Una_furtiva_lagrima
875,qOVjWb8xcYc,6,1,1,1,0,0,0,1,Una_furtiva_lagrima
876,pLaxGoUByTE,9,2,1,0,0,1,0,0,Vesti_la_giubba
877,qNPPPJu09lg,11,2,0,1,0,0,0,0,Una_furtiva_lagrima


train


Unnamed: 0,Link,seg_num,Head,Chest,Open,Breathy,Vibrato,Front,Back,song
0,wTyNiJ4bpbk,0,1,1,2,1,2,2,1,Che_gelida_manina
1,o3lWEBQevNo,0,1,2,0,1,1,0,0,Che_gelida_manina
2,y5eddiim6BQ,0,2,2,1,0,1,1,1,Che_gelida_manina
3,0QzJ86rMHzQ,0,2,1,0,0,1,0,0,Che_gelida_manina
4,xVilszbi-mU,0,1,2,3,0,1,0,0,Che_gelida_manina
...,...,...,...,...,...,...,...,...,...,...
2269,RFA_ilOtGCM&list=PLpEpmwpNk-6dBCxujxeIo9fZGZfx...,29,2,2,0,0,0,0,0,Nessun_Dorma
2270,RFA_ilOtGCM&list=PLpEpmwpNk-6dBCxujxeIo9fZGZfx...,30,1,1,0,0,0,0,0,Nessun_Dorma
2271,RFA_ilOtGCM&list=PLpEpmwpNk-6dBCxujxeIo9fZGZfx...,31,2,1,1,1,0,0,0,Nessun_Dorma
2272,RFA_ilOtGCM&list=PLpEpmwpNk-6dBCxujxeIo9fZGZfx...,32,2,2,0,0,0,0,0,Nessun_Dorma


## Check that the same links do not appear across different datasets

In [5]:
# Compare every unordered pair of splits and report whether they share any video.
for keys in itertools.combinations(datasets.keys(), 2):
    # Some links carry a URL query suffix (e.g. "<id>&list=..." in train); keep only
    # the part before the first "&" so the same video is recognised in both forms.
    links = [
        set(datasets[key]["Link"].str.split("&").str[0].tolist()) for key in keys
    ]
    sets_are_disjoint = links[0].isdisjoint(links[1])
    print(f"{keys[0]} and {keys[1]} links are disjoint: {sets_are_disjoint}")
    if not sets_are_disjoint:
        # Report the size of the overlap next to the number of distinct videos per split.
        common = links[0] & links[1]
        print(
            f"\t{len(common)} common links; {len(links[0])} distinct links in {keys[0]} and {len(links[1])} in {keys[1]}"
        )

dev and test links are disjoint: False
	100 common links; 100 distinct links in dev and 100 in test
dev and train links are disjoint: True
test and train links are disjoint: True


It seems that `dev` and `test` are very similar: they cover exactly the same 100 links. That is not a problem per se, because the `dev` set will not be used in training. `dev` could probably be used as a _test set in the development phase_.

However, let's investigate further and see whether the actual items (link–segment pairs) are the same.

In [6]:
# Inner-join dev and test on the shared (Link, seg_num) key to list the segments present in both.
dev.merge(test, how="inner", on=["Link", "seg_num"])

Unnamed: 0,Link,seg_num,Chest_x,Head_x,Open_x,Breathy_x,Vibrato_x,Front_x,Back_x,song_x,Head_y,Chest_y,Open_y,Breathy_y,Vibrato_y,Front_y,Back_y,song_y
0,wKF-6qtJqnY,12,2,2,0,0,2,2,1,Vesti_la_giubba,2,2,0,0,2,2,1,Vesti_la_giubba
1,wKF-6qtJqnY,6,2,2,0,0,1,2,1,Vesti_la_giubba,2,2,0,0,1,2,1,Vesti_la_giubba
2,wKF-6qtJqnY,11,2,2,2,0,1,1,1,Vesti_la_giubba,2,2,2,0,1,1,1,Vesti_la_giubba
3,wKF-6qtJqnY,7,2,3,2,0,1,2,2,Vesti_la_giubba,2,3,2,0,1,2,2,Vesti_la_giubba


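It seems that there are only four shared items. In most cases, the segments of a link are simply divided between the `dev` and `test` sets. Note that in the last shared row the `Chest`/`Head` values agree by position but not by column name (`dev` lists `Chest` before `Head`, `test` lists `Head` before `Chest`), so the column labels in one of the files may be swapped — this should be verified.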

## Check segments

Check whether `seg_time_lists` contains all link–segment combinations in `train`, `test` and `dev`.

In [7]:
# Stack the (Link, seg_num) keys of every split, tagging each row with the split it came from.
data_segs = pd.concat(
    [
        frame[["Link", "seg_num"]].assign(origin=split_name)
        for split_name, frame in datasets.items()
    ]
)

# Outer join keeps unmatched rows from both sides; the other side's columns are NaN there.
seg_joined = data_segs.merge(
    seg_time_lists,
    how="outer",
    left_on=["Link", "seg_num"],
    right_on=["name", "num"],
)

# A missing "name" means no timing row matched; a missing "Link" means no dataset row matched.
without_timing = seg_joined["name"].isna()
without_dataset = seg_joined["Link"].isna()

print("Items in datasets but not in seg_time_lists:")
display(seg_joined.loc[without_timing])

print("Items in seg_time_lists but not in datasets:")
display(seg_joined.loc[without_dataset])

Items in datasets but not in seg_time_lists:


Unnamed: 0,Link,seg_num,origin,name,num,time_start,time_end
98,1VCAZQ9M22M,10.0,dev,,,,
287,1VCAZQ9M22M,1.0,dev,,,,
640,1VCAZQ9M22M,3.0,dev,,,,
783,1VCAZQ9M22M,5.0,dev,,,,
825,1VCAZQ9M22M,15.0,dev,,,,
904,1VCAZQ9M22M,6.0,test,,,,
910,1VCAZQ9M22M,13.0,test,,,,
1014,1VCAZQ9M22M,2.0,test,,,,
1197,1VCAZQ9M22M,0.0,test,,,,
1329,1VCAZQ9M22M,8.0,test,,,,


Items in seg_time_lists but not in datasets:


Unnamed: 0,Link,seg_num,origin,name,num,time_start,time_end
4067,,,,test,0.0,0.000,0.000
4068,,,,0HVjTPqe5I0,0.0,8.893,17.323
4069,,,,0HVjTPqe5I0,1.0,18.053,28.832
4070,,,,0HVjTPqe5I0,2.0,39.120,53.974
4071,,,,0HVjTPqe5I0,3.0,56.265,64.498
...,...,...,...,...,...,...,...
7395,,,,z8tvfpd11Us,14.0,156.365,166.817
7396,,,,z8tvfpd11Us,15.0,167.570,177.402
7397,,,,z8tvfpd11Us,16.0,177.872,185.142
7398,,,,z8tvfpd11Us,17.0,185.664,192.482


Apparently some clean-up of the `test` and `dev` datasets is required: no segment data is defined for one link (`1VCAZQ9M22M`).