In [1]:
import os
import pandas as pd
import numpy as np
from os.path import join, exists

In [7]:
def parse_speaker_list(file_path):
    """Parse a speaker-list file into a dict of key -> list of speaker IDs.

    Every non-blank line is split on whitespace. The first token is used as
    the dictionary key (the callers in this notebook index the result by
    language code) and the remaining tokens become that key's list of IDs.
    If the same key appears on several lines, the last line wins.

    Args:
        file_path: Path of the speaker-list file to read.

    Returns:
        A dict mapping the first token of each line to the list of the
        remaining tokens (all strings), or None when ``file_path`` does not
        exist (a message is printed in that case).
    """
    if not exists(file_path):
        print("File not found at {}".format(file_path))
        return None

    info = {}
    with open(file_path, "r") as f:
        lines = f.read().splitlines()
    for line in lines:
        entry = line.split()
        # A blank or whitespace-only line has no first token to use as a key;
        # skip it instead of failing on entry[0].
        if not entry:
            continue
        # str.split() without a separator already drops surrounding
        # whitespace, so the tokens need no further stripping.
        info[entry[0]] = entry[1:]
    return info
    
def get_stats(lang):
    """Load the per-language CSV from the gp_info directory.

    Args:
        lang: Language name; the file read is ``<lang>.csv``.

    Returns:
        The DataFrame returned by ``pd.read_csv``, or None (after printing a
        message) when the CSV file does not exist.
    """
    stats_file = join("/home", "paul", "language-ident-from-speech", "gp_info", lang + ".csv")
    if not exists(stats_file):
        print("CSV file not found at: {}".format(stats_file))
        return None
    return pd.read_csv(stats_file)
        
def get_speaker_lists():
    """Parse the four speaker-list files found in the recipe's conf directory.

    Returns:
        A 4-tuple ``(train, enroll, eval, test)`` holding the results of
        ``parse_speaker_list`` for the corresponding ``*_spk.list`` files, in
        that order.
    """
    conf_dir = join("/home", "paul", "language-ident-from-speech", "gp-xvectors-recipe", "conf")
    # Order matters: it fixes the positions in the returned tuple.
    list_files = ("train_spk.list", "enroll_spk.list", "eval_spk.list", "test_spk.list")
    return tuple(parse_speaker_list(join(conf_dir, list_file)) for list_file in list_files)

def get_lang_code(lang, codes_path=None):
    """Look up the code associated with a language name.

    The codes file is read line by line; each line is split on whitespace,
    the first token being the code and the second the language name.

    Args:
        lang: Language name, compared for equality with the second token of
            each line.
        codes_path: Optional path of the codes file. When omitted, the
            recipe's ``conf/lang_codes.txt`` is used.

    Returns:
        The first token of the first line whose second token equals ``lang``,
        or None if no line matches.
    """
    if codes_path is None:
        codes_path = join("/home", "paul", "language-ident-from-speech", "gp-xvectors-recipe", "conf", "lang_codes.txt")
    with open(codes_path, "r") as f:
        lines = f.read().splitlines()
    for line in lines:
        entry = line.split()
        # Lines with fewer than two tokens (blank lines, a lone code) cannot
        # name a language; skip them instead of failing on entry[1].
        if len(entry) < 2:
            continue
        if entry[1] == lang:
            return entry[0]
    return None
        
def filter_df(df, speakers):
    """Return the rows of ``df`` whose ``spk_id`` value is contained in ``speakers``."""
    keep_rows = df.spk_id.isin(speakers)
    return df[keep_rows]

def print_stats(lang, dataset):
    """Print statistics for one language, optionally restricted to one split.

    Args:
        lang: Language name, passed to ``get_stats`` and ``get_lang_code``.
        dataset: One of ``"all"``, ``"train"``, ``"enroll"``, ``"eval"`` or
            ``"test"``. ``"all"`` prints statistics for the whole CSV; the
            other options first keep only the rows whose ``spk_id`` is in the
            matching speaker list.

    Returns:
        None. The statistics (or a message explaining why nothing could be
        printed) are written to standard output.
    """
    # Heading label shown by print_df_stats for each supported option.
    split_labels = {
        "all": "All",
        "train": "Training",
        "enroll": "Enrollment",
        "eval": "Evaluation",
        "test": "Testing",
    }
    # Validate before touching the filesystem so a typo is reported at once.
    if dataset not in split_labels:
        print("Unknown dataset option {}".format(dataset))
        return

    lang_df = get_stats(lang)
    if lang_df is None:
        # get_stats has already printed which CSV file is missing.
        return

    if dataset != "all":
        # The speaker lists are only needed when restricting to one split.
        lang_code = get_lang_code(lang)
        train_speakers, enroll_speakers, eval_speakers, test_speakers = get_speaker_lists()
        speakers_by_split = {
            "train": train_speakers,
            "enroll": enroll_speakers,
            "eval": eval_speakers,
            "test": test_speakers,
        }
        split_speakers = speakers_by_split[dataset]
        # parse_speaker_list returns None for a missing file, and the
        # language may be absent from the list (or have no known code).
        if split_speakers is None or lang_code not in split_speakers:
            print("No {} speaker list available for {}".format(dataset, lang))
            return
        lang_df = filter_df(lang_df, split_speakers[lang_code])

    print_df_stats(lang_df, lang, split_labels[dataset])
        
def print_df_stats(df, lang, name):
    """Print a plain-text summary of a speaker DataFrame.

    The summary contains the row count, the value counts of the ``gender``
    and ``dialect`` columns, the number of non-null ``age`` values, and the
    mean and standard deviation of ``age`` rounded to three decimals.

    Args:
        df: DataFrame with ``gender``, ``dialect`` and ``age`` columns.
            NOTE(review): the "Total speakers" figure is the row count, so it
            presumably assumes one row per speaker - confirm.
        lang: Language name shown in the heading.
        name: Name of the data split shown in the heading (e.g. "Training").

    Returns:
        None. Everything is written to standard output.
    """
    print("*" * 30)
    print(name + " data for " + lang + " stats:")
    print("Total speakers")
    print(len(df))
    print("Genders")
    print(df["gender"].value_counts())
    print("Dialects:")
    print(df["dialect"].value_counts())
    print("Age")
    print("Have speaker data for:")
    # count() ignores missing values, so this is the number of known ages.
    print(df["age"].count())
    age_mean = round(df["age"].mean(), 3)
    # ddof=0 gives the population standard deviation (divide by N, not N - 1).
    age_std = round(df["age"].std(ddof=0), 3)
    print(str(age_mean) + "+/-" + str(age_std))
    
    

In [39]:
# Display floats in pandas output with 3 digits of precision.
pd.set_option("display.precision", 3)
    
# Load the four speaker lists, then build the Arabic frame and its subset
# restricted to the Arabic training speakers (ar_tr is displayed in a later cell).
# NOTE(review): this cell's execution count (39) is higher than the counts of
# the cells that follow it (10, 29), so the saved outputs were probably
# produced out of order - confirm with Restart Kernel -> Run All.
train_speakers, enroll_speakers, eval_speakers, test_speakers = get_speaker_lists()
ar_df = get_stats("Arabic")
lang_code = get_lang_code("Arabic")
ar_tr = filter_df(ar_df, train_speakers[lang_code])


In [10]:
# (language name, integer) pairs; only the names are used in this cell.
# NOTE(review): the integers look like per-language label indices - confirm.
ref = [("Arabic", 0), ("Bulgarian", 1), ("Chinese-Shanghai", 2),
       ("Croatian", 3), ("Czech", 4), ("French", 5), ("German", 6),
       ("Japanese", 7), ("Korean", 8), ("Mandarin", 9), ("Polish", 10),
       ("Portuguese", 11), ("Russian", 12), ("Spanish", 13),
       ("Swedish", 14), ("Thai", 15), ("Turkish", 16),
       ("Vietnamese", 17)]

# Iterate over the list itself so the loop always covers every entry, even if
# languages are added to or removed from ref.
for lang_name, _lang_index in ref:
    print_stats(lang_name, "train")

******************************
Training data for Arabic stats:
Total speakers
57
Genders
female    30
male      27
Name: gender, dtype: int64
Dialects:
Tunisian       45
Palestinian    11
Jordanian       1
Name: dialect, dtype: int64
Age
Have speaker data for:
57
27.193+\-12.528
******************************
Training data for Bulgarian stats:
Total speakers
56
Genders
female    29
male      27
Name: gender, dtype: int64
Dialects:
no    53
Name: dialect, dtype: int64
Age
Have speaker data for:
56
33.321+\-14.185
******************************
Training data for Chinese-Shanghai stats:
Total speakers
6
Genders
male      4
female    2
Name: gender, dtype: int64
Dialects:
SH    1
Name: dialect, dtype: int64
Age
Have speaker data for:
6
41.333+\-6.774
******************************
Training data for Croatian stats:
Total speakers
63
Genders
female    41
male      22
Name: gender, dtype: int64
Dialects:
Bosnian                    36
Zagreb dialect             12
Standard Croatian           8

In [29]:
# Show the last 10 rows of ar_tr, the Arabic frame filtered to its training
# speakers; ar_tr is assigned in an earlier cell, which must be run first.
ar_tr.tail(10)

Unnamed: 0,language,spk_id,gender,age,native_language,dialect
64,Arabic,106,female,20,Arabic,Palestinian
65,Arabic,107,female,23,Arabic,Palestinian
66,Arabic,108,male,23,Arabic,Palestinian
68,Arabic,110,female,19,Arabic,Palestinian
69,Arabic,111,female,25,Arabic,Palestinian
70,Arabic,112,female,41,Arabic,Palestinian
73,Arabic,134,female,24,Arabic,Palestinian
75,Arabic,136,male,66,Arabic,Palestinian
77,Arabic,164,male,36,Arabic,Palestinian
78,Arabic,165,male,23,Arabic,Tunisian
