In [1]:
import pandas as pd
import time
from tqdm import tqdm

In [2]:
def read_csv_pgbar(csv_path, chunksize, dtype=object):
 
    rows = sum(1 for _ in open(csv_path, 'r')) - 1 # minus the header
    chunk_list = []
 
    with tqdm(total=rows, desc='Rows read: ') as bar:
        for chunk in pd.read_csv(csv_path, chunksize=chunksize,
                                 dtype=dtype):
            chunk_list.append(chunk)
            bar.update(len(chunk))
 
    df = pd.concat((f for f in chunk_list), axis=0)
    print('Done!')
 
    return df

In [3]:
def calc_metrics(file, levels, assay):
    
    df = read_csv_pgbar(file, chunksize=10**6)
    df['degree'] = df['degree'].astype(str).astype(int)
    
    final_df = pd.DataFrame()
    for level in levels:

        print(f"Computing precision/recall for relationships of degree {level} ...")
        tp_count = 0
        fp_count = 0
        
        p_list = []
        r_list = []
        f_list = []
        
        df_lev = df[df['degree'].isin([level, -1])]
        df_lev = df_lev.sort_values(by=f'{assay}_rel',
                        ascending=False).reset_index(drop=True)
        all_pos = len(df_lev[df_lev['degree'] == level])
        print(f"Total relationships of degree {level} = {all_pos}")
        
        # make sure the label exists
        if all_pos < 1:
            print(f"No relationships of degree {level} detected.")
            continue

        for i in range(len(df_lev)):

            deg = df_lev['degree'][i]

            if deg == level:
                tp_count += 1
            else:
                fp_count += 1

            tps = tp_count
            fps = fp_count
            fns = all_pos - tps

            precision = tps/(tps+fps)
            recall = tps/(tps+fns)
            fdr = 1 - precision

            p_list.append(float(precision))
            r_list.append(float(recall))
            f_list.append(float(fdr))

        pr_df = pd.DataFrame()
        pr_df['precision'] = p_list
        pr_df['recall'] = r_list
        pr_df['fdr'] = f_list
        pr_df['degree'] = level
        
        final_df = final_df.append(pr_df)
    
    return final_df

In [4]:
assay_list = ['ay49', 'ay52', 'ay72', 'all']
level_list = [0, 1, 2, 3]

In [None]:
for a in assay_list[0:2]:
    t0 = time.time()
    file = f'../assays/{a}/results/plink_fmt/{a}_fmat_labeled_updated.csv'
    out = f'../assays/{a}/results/{a}_pr.csv'
    print(f"Reading results from {file} ...")
    df_pr = calc_metrics(file = file, 
                         levels = level_list, 
                         assay = a)
    print(df_pr.head())
    print(len(df_pr))
    print(f"Writing PR results to {out} ...")
    df_pr.to_csv(out, chunksize=10**6, index=False)
    print(f"Done! Total calculation time: {time.time() - t0}")

Reading results from ../assays/ay49/results/plink_fmt/ay49_fmat_labeled_updated.csv ...


Rows read: 100%|███████████████████████████████| 212664376/212664376 [15:25<00:00, 229761.92it/s]


Done!
Computing precision/recall for relationships of degree 0 ...
Total relationships of degree 0 = 7


  final_df = final_df.append(pr_df)


Computing precision/recall for relationships of degree 1 ...
Total relationships of degree 1 = 3136


  final_df = final_df.append(pr_df)


Computing precision/recall for relationships of degree 2 ...
Total relationships of degree 2 = 528


  final_df = final_df.append(pr_df)


Computing precision/recall for relationships of degree 3 ...
Total relationships of degree 3 = 179


  final_df = final_df.append(pr_df)


   precision  recall  fdr  degree
0        0.0     0.0  1.0       0
1        0.0     0.0  1.0       0
2        0.0     0.0  1.0       0
3        0.0     0.0  1.0       0
4        0.0     0.0  1.0       0
850645954
Writing PR results to ../assays/ay49/results/ay49_pr.csv ...


In [None]:
# t0 = time.time()
# file = f'../assays/ay49/results/pilot/RQ2415302_fmat_updated.csv'
# out = f'../assays/ay49/results/pilot/RQ2415302_pr.csv'
# print(f"Reading results from {file} ...")
# df_pr = calc_metrics(file = file, 
#                      levels = level_list, 
#                      assay = 'ay49')
# print(df_pr.head())
# print(len(df_pr))
# print(f"Writing PR results to {out} ...")
# df_pr.to_csv(out, chunksize=10**6, index=False)
# print(f"Done! Total calculation time: {time.time() - t0}")