In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os, re, sys
sys.path.insert(0, '/home/paul/language-ident-from-speech/misc')
from get_results import calculate_f_score
from os.path import join, exists

In [15]:
def add_f1_data_normal(df):
    f1_df = pd.DataFrame(columns=["expname", "f1_3s", "f1_10s", "f1_30s"])
    n = len(df)
    langs = []
    for i in range(n):
        result_row = [df["expname"].iloc[i]]
        target_lang = df["excluded_languages"].iloc[i]
        langs.append(target_lang)
        if target_lang == "ALL":
            continue
        for secs in ["3s", "10s", "30s"]:
            col = "classification_" + secs
            class_path = df[col][i]
            if class_path != "blank":
                prec, recall, f1 = calculate_f_score(class_path, target_lang, beta=1)
                result_row.append(f1)
            else:
                result_row.append("none")
        f1_df.loc[i] = result_row
    df = df.merge(f1_df, on="expname", how="outer")
    return df, langs

def add_f1_baseline(df, langs):
    n = len(df)
    rows_list = []
    for i in range(n):
        expname = [df["expname"].iloc[i]][0]
        for target_lang in langs:
            result_row = [expname, target_lang]
            for secs in ["3s", "10s", "30s"]:
                col = "classification_" + secs
                class_path = df[col][i]
                if class_path != "blank":
                    prec, recall, f1 = calculate_f_score(class_path, target_lang, beta=1)
                    result_row.append(f1)
                else:
                    result_row.append("none")
            rows_list.append(result_row)
    f1_df = pd.DataFrame(rows_list, columns=["expname", "excluded_languages", "f1_3s", "f1_10s", "f1_30s"])
    #df = df.merge(f1_df, on="expname", how="outer")
    return f1_df

def generate_col_names():
    names = []
    for stat in ["mean", "std"]:
        for i in ["3", "10", "30"]:
            names.append(stat + "_f1_" + i + "s")
    return names

def average_baseline_df(df):
    df.sort_values("excluded_languages", inplace=True)
    df.reset_index(drop=True, inplace=True)
    numeric_cols = ["f1_3s", "f1_10s", "f1_30s"]
    cols = ["excluded_language"] + generate_col_names()
    rows = []
    n = len(df)
    for i in np.arange(0, n, step=3):
        lang = df["excluded_languages"][i]
        if (lang != df["excluded_languages"][i+1] and 
            lang != df["excluded_languages"][i+2]):
            print("Warning: the three languages are not the same")
            continue
        mean = df.loc[[i, i+1, i+2]][numeric_cols].mean()
        std = df.loc[[i, i+1, i+2]][numeric_cols].std()
        rows.append([lang] + list(mean) + list(std))
    avg_df = pd.DataFrame(rows, columns=cols)
    return avg_df

def average_df(df):
    exp_list = get_expname_and_number(list(df["expname"]))
    cols = ["excluded_language"] + generate_col_names()
    rows = []
    for i, exps in enumerate(exp_list):
        excluded_lang = get_lang(expname)
         for exp in exps:
            
            df_results["mean_" + category].loc[i] = mean
            df_results["std_" + category].loc[i] = std
    df_results = df_results.sort_values("enroll_length")
    df_results = df_results.sort_values("train_length")
    df_results = df_results.reset_index(drop=True)
    return df_results

def get_lang(expname):
    

def get_expname_and_number(expnames):
    exp_list = []
    found_names = []
    counts = []
    for exp in expnames:
        if exp[-2] == "_":
            name = exp[:-2]
        else:
            name = exp
        if name not in found_names:
            found_names.append(name)
            counts.append(1)
        else:
            idx = found_names.index(name)
            counts[idx] += 1
    for idx, name in enumerate(found_names):
        count = counts[idx]
        name_list = [name]
        if count > 1:
            for i in range(2, count + 1):
                new_name = name + "_" + str(i)
                name_list.append(new_name)
        exp_list.append(name_list)
    return exp_list
        

In [3]:
ad_summary = pd.read_csv(join(os.getcwd(), "results", "ad_all_summary.csv"))
baseline = ad_summary[ad_summary.excluded_languages == "ALL"]
baseline.reset_index(drop=True, inplace=True)
baseline.drop("excluded_languages", axis=1, inplace=True)
ad_normal = ad_summary[ad_summary.excluded_languages != "ALL"]
ad_normal.reset_index(drop=True, inplace=True)

ad_normal, langs = add_f1_data_normal(ad_normal)
baseline_new = add_f1_baseline(baseline, langs)
avg_baseline = average_ad_df(baseline_new,3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [16]:
exp_list = get_expname_and_number(ad_normal["expname"])

In [17]:
exp_list

[['ad_all_tr_no_ar'],
 ['ad_all_tr_no_bg'],
 ['ad_all_tr_no_ch'],
 ['ad_all_tr_no_cr'],
 ['ad_all_tr_no_cz'],
 ['ad_all_tr_no_fr'],
 ['ad_all_tr_no_ge'],
 ['ad_all_tr_no_ja'],
 ['ad_all_tr_no_ko'],
 ['ad_all_tr_no_pl'],
 ['ad_all_tr_no_po'],
 ['ad_all_tr_no_ru'],
 ['ad_all_tr_no_sp'],
 ['ad_all_tr_no_sw'],
 ['ad_all_tr_no_th'],
 ['ad_all_tr_no_tu'],
 ['ad_all_tr_no_vn'],
 ['ad_all_tr_no_wu']]