In [1]:
# Input: SELECTED_TURN_EMOTION.csv

# Goal: save the selected emotion labels and transcriptions of each session

In [2]:
import pandas as pd

In [3]:
# Label table used by the merge further down; the displayed output shows
# TURN_NAME and EMOTION columns.
labels_csv = 'SELECTED_TURN_EMOTION.csv'
df = pd.read_csv(labels_csv)
df

Unnamed: 0,TURN_NAME,EMOTION
0,Ses01F_impro01_F000,neu
1,Ses01F_impro01_F001,neu
2,Ses01F_impro01_F002,neu
3,Ses01F_impro01_F005,neu
4,Ses01F_impro01_F014,neu
...,...,...
5526,Ses05M_script03_2_M041,ang
5527,Ses05M_script03_2_M042,ang
5528,Ses05M_script03_2_M043,ang
5529,Ses05M_script03_2_M044,ang


In [4]:
def open_text_files_in_dir(paths):
    """Read every entry found directly inside each directory and stack the rows.

    Each file is parsed with ``pd.read_csv(f, sep=':', header=None)``, so every
    line is split into integer-labelled columns at each ':' character.

    Parameters
    ----------
    paths : iterable of str
        Directories to scan; every entry matching ``<dir>/*`` is read.

    Returns
    -------
    pandas.DataFrame
        The rows of all files concatenated with a fresh 0..n-1 index, or an
        empty DataFrame when no files were found.
    """
    frames = []

    for path in paths:
        # glob returns entries in arbitrary order; sorting keeps the row order
        # of the result reproducible between runs and machines.
        for f in sorted(glob.glob(os.path.join(path, '*'))):
            frames.append(pd.read_csv(f, sep=':', header=None))

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)

In [5]:
import glob
import os

In [6]:
def csv_transcription_labels_from_session(paths, session, labels=None):
    """Join a session's transcriptions with their emotion labels and save a CSV.

    The transcription rows come from ``open_text_files_in_dir(paths)``. They are
    inner-joined with the label table on ``TURN_NAME``, and the ``EMOTION`` and
    ``TRANSCRIPTION`` columns are written to
    ``Session{session}_selected_transcriptions_Emotions.csv``. The first two
    rows of ``describe()`` for the merged frame are printed as a summary.

    Parameters
    ----------
    paths : iterable of str
        Directories holding the session's transcription files.
    session : int or str
        Session identifier, used only to build the output file name and the
        printed message.
    labels : pandas.DataFrame, optional
        Label table with ``TURN_NAME`` and ``EMOTION`` columns. When omitted,
        the notebook-level ``df`` is used, as before.

    Returns
    -------
    None
    """
    if labels is None:
        # Backward-compatible default: the notebook-level label table.
        labels = df

    # open_text_files_in_dir already returns a fresh frame, so the former
    # `pd.DataFrame().append(...)` copy (removed in pandas 2.0) is not needed.
    text_session = open_text_files_in_dir(paths)

    # Column 0 is the text before the ':' separator; only its first
    # whitespace-delimited token is kept as the turn name.
    text_session[0] = text_session[0].apply(lambda x: x.split()[0])
    text_session.columns = ['TURN_NAME', 'TRANSCRIPTION']

    # Inner join: only turns present in the label table survive.
    selected_labeled_text = pd.merge(text_session, labels, on=["TURN_NAME"])
    print(selected_labeled_text.describe()[:2])
    selected_labeled_text[['EMOTION', 'TRANSCRIPTION']].to_csv(
        'Session{}_selected_transcriptions_Emotions.csv'.format(session), index=False)
    print('Session{} transcriptions were saved as csv'.format(session))
    print()

In [7]:
# Write one labelled-transcription CSV per IEMOCAP session.
sessions = [1, 2, 3, 4, 5]
for session in sessions:
    transcription_dirs = [
        'IEMOCAP_full_release/Session{}/dialog/transcriptions'.format(session)
    ]
    csv_transcription_labels_from_session(transcription_dirs, session)

       TURN_NAME TRANSCRIPTION EMOTION
count       1085          1085    1085
unique      1085           973       4
Session1 transcriptions were saved as csv

       TURN_NAME TRANSCRIPTION EMOTION
count       1023          1023    1023
unique      1023           947       4
Session2 transcriptions were saved as csv

       TURN_NAME TRANSCRIPTION EMOTION
count       1151          1151    1151
unique      1151          1082       4
Session3 transcriptions were saved as csv

       TURN_NAME TRANSCRIPTION EMOTION
count       1031          1031    1031
unique      1031           940       4
Session4 transcriptions were saved as csv

       TURN_NAME TRANSCRIPTION EMOTION
count       1241          1241    1241
unique      1241          1132       4
Session5 transcriptions were saved as csv



# Create the dataset and upload it to the Hugging Face Hub

In [8]:
from datasets import Dataset
from datasets import Value, ClassLabel

In [9]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub interactively so the later push_to_hub call
# can write to the dataset repo; no token is hard-coded in the notebook.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\mini\.cache\huggingface\token
Login successful


In [11]:
# Class names for the EMOTION column; the list position is the integer id
# stored in the dataset (the outputs below show 'ang' as 0 and 'neu' as 2).
names = ['ang', 'hap','neu', 'sad']
for session in sessions:
    # Use a loop-local name here. Rebinding the notebook-level `df` would
    # replace the label table that csv_transcription_labels_from_session
    # merges against, so re-running the export cell would break.
    session_df = pd.read_csv('Session{}_selected_transcriptions_Emotions.csv'.format(session))
    # The split name set here appears to be what the Hub stores: the reload
    # further down lists Session1..Session5.
    dataset = Dataset.from_pandas(session_df, split='Session{}'.format(session))
    dataset = dataset.cast_column("TRANSCRIPTION", Value('string'))
    dataset = dataset.cast_column("EMOTION", ClassLabel(num_classes=4,names=names))
    dataset = dataset.rename_column("EMOTION", "emotion")
    dataset = dataset.rename_column("TRANSCRIPTION", "transcription")
    dataset.push_to_hub("minoosh/IEMOCAP_Text")

Casting the dataset:   0%|          | 0/1085 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1085 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1023 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1023 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Updating downloaded metadata with the new split.


Casting the dataset:   0%|          | 0/1151 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1151 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/563 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


Casting the dataset:   0%|          | 0/1031 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1031 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/627 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


Casting the dataset:   0%|          | 0/1241 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1241 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/690 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


In [12]:
# After the loop, `dataset` holds only the last session that was processed.
dataset

Dataset({
    features: ['emotion', 'transcription'],
    num_rows: 1241
})

In [13]:
# Spot-check the final row; `emotion` is shown as its integer class id.
dataset[-1]

{'emotion': 0, 'transcription': ' pig'}

# Load the dataset back from the Hub

In [15]:
from datasets import load_dataset, load_from_disk

# Reload the pushed repo from the Hub to check the upload. This rebinds
# `dataset` from the last session's Dataset to a DatasetDict with one split
# per session (see the output of the next cell).
dataset = load_dataset("minoosh/IEMOCAP_Text")
# Unused alternative, kept for reference. NOTE(review): load_from_disk expects
# a local directory written by save_to_disk, not a Hub repo id; confirm before
# re-enabling.
#dataset = load_from_disk("minoosh/IEMOCAP_Text")

Downloading and preparing dataset None/None to C:/Users/mini/.cache/huggingface/datasets/minoosh___parquet/minoosh--IEMOCAP_Text-8b3962ea9075930e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


HF google storage unreachable. Downloading and preparing it from source


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/44.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/41.5k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating Session2 split:   0%|          | 0/1023 [00:00<?, ? examples/s]

Generating Session5 split:   0%|          | 0/1241 [00:00<?, ? examples/s]

Generating Session4 split:   0%|          | 0/1031 [00:00<?, ? examples/s]

Generating Session3 split:   0%|          | 0/1151 [00:00<?, ? examples/s]

Generating Session1 split:   0%|          | 0/1085 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to C:/Users/mini/.cache/huggingface/datasets/minoosh___parquet/minoosh--IEMOCAP_Text-8b3962ea9075930e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
# Show the reloaded splits and their row counts.
dataset

DatasetDict({
    Session2: Dataset({
        features: ['emotion', 'transcription'],
        num_rows: 1023
    })
    Session5: Dataset({
        features: ['emotion', 'transcription'],
        num_rows: 1241
    })
    Session4: Dataset({
        features: ['emotion', 'transcription'],
        num_rows: 1031
    })
    Session3: Dataset({
        features: ['emotion', 'transcription'],
        num_rows: 1151
    })
    Session1: Dataset({
        features: ['emotion', 'transcription'],
        num_rows: 1085
    })
})

In [17]:
# Spot-check the first Session1 example after the round trip through the Hub.
dataset['Session1'][0]

{'emotion': 2, 'transcription': ' Excuse me.'}