In [1]:
import pandas
import os
import glob
from collections import defaultdict


In [32]:

def parse_lines(lines, i, y):
    line_x = lines[i+1].strip()
    log_l_x = line_x.split(',')[0].split('=')[1]
    aicc_x = line_x.split(',')[1].split('=')[1].split("(")[0]
    line_y = lines[i+y].strip()
    dnds_y = line_y.split('=')[1]
    return log_l_x.strip(), aicc_x.strip(), dnds_y.strip()

def process_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    gene = os.path.basename(file_path).split('.')[0]
    new_line = defaultdict()
    new_line["gene"] = gene
    a_gtr = None
    b_gtr = None
    c_gtr = None
    a_cod = None
    b_cod = None
    c_cod = None
    p_value_line = None
    gtr_string = 'Obtaining the global omega estimate based on relative GTR branch lengths and nucleotide substitution biases'
    dn_ds_string = 'Improving branch lengths, nucleotide substitution biases, and global dN/dS ratios under a full codon model'
    p_value_string = 'Branch-site unrestricted statistical test of episodic diversification'
    dnds_string = 'Diversifying selection'

    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if line.startswith('#'):
            if gtr_string in line:
                a_gtr,b_gtr,c_gtr = parse_lines(lines, i, 3)
                #print(a_gtr,b_gtr,c_gtr)
                new_line["gtr_log_l"] = a_gtr
                new_line["gtr_aicc"] = b_gtr
                new_line["gtr_dnds"] = c_gtr
            elif dn_ds_string in line:
                a_cod, b_cod, c_cod = parse_lines(lines, i, 2)
                #print(a_cod,b_cod,c_cod)
                new_line["codon_log_l"] = a_cod
                new_line["codon_aicc"] = b_cod
                new_line["codon_dnds"] = c_cod
            elif p_value_string in line:
                p_value_line = lines[i+1].strip()
                p_val = p_value_line.split('=')[1].split('*')[0].strip()
                #print(p_val)
                new_line["p_val"] = p_val
        elif dnds_string in line:
            print(line)
            dnds_val = line.split('|')[2].strip()
            dnds_proportion = line.split('|')[3].strip()
            dnds_note = str(line.split('|')[4].strip())
            print(dnds_val, dnds_proportion, dnds_note)
            new_line['dNdS'] = dnds_val
            new_line['dNdS_proportion'] = dnds_proportion
            new_line['dNdS_note'] = dnds_note
        i += 1
    return new_line


In [33]:

dir_of_results = 'results'
results_files = glob.glob(f'{dir_of_results}/*.txt')


In [38]:

col_names = ["gene","gtr_log_l","gtr_aicc","gtr_dnds","codon_log_l","codon_aicc","codon_dnds","p_val", "dNdS", "dNdS_proportion", "dNdS_note"]
rows = []
for infile in results_files:
    print(infile)
    new_line = process_file(infile)
    rows.append(new_line)

results_df = pandas.DataFrame(rows, columns=col_names)
results_df.reset_index(drop=True)


results/GC_00000978-busted.txt
|      Diversifying selection       |    297.825    |    6.495    |                                   |
297.825 6.495 
|      Diversifying selection       |    16.010     |   35.858    |                                   |
16.010 35.858 
results/GC_00000294-busted.txt
|      Diversifying selection       |     4.701     |    0.000    |       Not supported by data       |
4.701 0.000 Not supported by data
results/GC_00000594-busted.txt
|      Diversifying selection       |    170.614    |   19.191    |                                   |
170.614 19.191 
results/GC_00000480-busted.txt
results/GC_00000839-busted.txt
|      Diversifying selection       |  150880.454   |    7.666    |                                   |
150880.454 7.666 
|      Diversifying selection       |     1.168     |    0.000    |       Not supported by data       |
1.168 0.000 Not supported by data
results/GC_00001136-busted.txt
results/GC_00000157-busted.txt
|      Diversifying selecti

Unnamed: 0,gene,gtr_log_l,gtr_aicc,gtr_dnds,codon_log_l,codon_aicc,codon_dnds,p_val,dNdS,dNdS_proportion,dNdS_note
0,GC_00000978-busted,-2346.93,4830.55,0.1721,-1243.69,2624.06,0.0426,0.2867,16.010,35.858,
1,GC_00000294-busted,-2249.55,4642.11,0.0855,-854.63,1852.25,0.0955,0.5000,4.701,0.000,Not supported by data
2,GC_00000594-busted,-770.73,1678.79,18.8942,-770.73,1678.79,18.8942,0.1216,170.614,19.191,
3,GC_00000480-busted,,,,,,,,,,
4,GC_00000839-busted,-2381.19,4909.45,0.6832,-2084.15,4315.37,0.4432,0.5000,1.168,0.000,Not supported by data
...,...,...,...,...,...,...,...,...,...,...,...
922,GC_00000446-busted,,,,,,,,,,
923,GC_00000718-busted,,,,,,,,,,
924,GC_00000207-busted,-285.21,700.56,0.0092,-285.21,700.56,0.0088,0.5000,4.093,0.191,
925,GC_00000565-busted,-1617.59,3388.83,0.0715,-775.32,1704.29,0.1604,0.5000,1.614,0.000,Not supported by data


In [40]:
results_df.to_csv('results_dNdS_busted_amine_v2.csv', index=False)