In [1]:
# IMPORTS
import pandas as pd

##### Other functions

In [2]:
def to_seconds(t):
    return (t.hour * 60 * 60) + (t.minute * 60) + t.second + (t.microsecond / 1000000)

##### Main function

In [3]:
def load_annotations_file(path: str) -> pd.DataFrame:
    """
    Load annotations from a file.

    :param path: Path to the file containing the annotations.
    :return: A pandas DataFrame containing the annotations.
    """
    # Read the annotations file
    annotations = pd.read_csv(path, sep='\t', header=None)

    # Add column names
    annotations.columns = ["level", "", "start", "end", "duration", "label"]
    del annotations[""]

    # Convert to seconds
    annotations["start"] = pd.to_datetime(annotations["start"])
    annotations["end"] = pd.to_datetime(annotations["end"])
    annotations["start"] = annotations["start"].apply(to_seconds)
    annotations["end"] = annotations["end"].apply(to_seconds)

    annotations.reset_index(inplace=True)

    return annotations

##### Test main function

In [4]:
annotations_kj = load_annotations_file("../../../data/raw/annotations_koti_janmani.txt")
annotations_kj

  annotations["start"] = pd.to_datetime(annotations["start"])
  annotations["end"] = pd.to_datetime(annotations["end"])


Unnamed: 0,index,level,start,end,duration,label
0,0,sancara,1.985,5.238,00:00:03.253,nnsnndm
1,1,sancara,5.238,8.300,00:00:03.062,mgmpmmgrg
2,2,sancara,12.492,17.059,00:00:04.567,gssnnndpdndm
3,3,sancara,17.059,18.776,00:00:01.717,mgmpmpgr
4,4,sancara,21.417,24.871,00:00:03.454,gmpdpdndmpgr
...,...,...,...,...,...,...
431,431,underlying_full_phrase,452.423,456.867,00:00:04.444,ssnsrsndmmgmpmggrs
432,432,underlying_full_phrase,457.812,462.967,00:00:05.155,grsssnrsnndmgmn
433,433,underlying_full_phrase,463.194,467.639,00:00:04.445,ssnsrsndmmgmpmggrs
434,434,underlying_full_phrase,473.418,479.629,00:00:06.211,nrsnsndmgmpmgrgmgmn


In [5]:
annotations_vnk = load_annotations_file("../../../data/raw/annotations_vanajaksha_ninni_kore.txt")
annotations_vnk

  annotations["start"] = pd.to_datetime(annotations["start"])
  annotations["end"] = pd.to_datetime(annotations["end"])


Unnamed: 0,index,level,start,end,duration,label
0,0,sancara,40.357,43.641,00:00:03.284,nsnsgrgm
1,1,sancara,43.685,48.104,00:00:04.419,pmgrs
2,2,sancara,48.358,51.596,00:00:03.238,nsgrssnn
3,3,sancara,51.596,54.355,00:00:02.759,pnnsn
4,4,sancara,54.355,56.377,00:00:02.022,sggm
...,...,...,...,...,...,...
677,677,root_full_phrase,483.029,485.349,00:00:02.320,sndnndm
678,678,root_full_phrase,485.372,489.424,00:00:04.052,mgmdpmgrgmpmgrggrsn
679,679,root_full_phrase,489.424,492.393,00:00:02.969,sgrgmgmdpmgm
680,680,root_full_phrase,492.477,494.537,00:00:02.060,gmdpmgrsn


##### Extra material

In [None]:
# Put into dataframes all the annotations for each level
""" annotations_sancara = annotations[annotations["level"]=="sancara"].reset_index()
annotations_underlying_sancara = annotations[annotations["level"]=="underlying_sancara"].reset_index()
annotations_full_phrase = annotations[annotations["level"]=="full_phrase"].reset_index()
annotations_underlying_full_phrase = annotations[annotations["level"]=="underlying_full_phrase"].reset_index()
"""