In [29]:
import os
from itertools import groupby
from Bio import AlignIO
from Bio.AlignIO.PhylipIO import RelaxedPhylipWriter
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment

# Column codes of the alignment, one per tab-separated field of the file.
# The context manager closes the handle as soon as the text has been read.
with open("data/util/features.csv", "r") as features_file:
    features = features_file.read().split("\t")

# feat_borders maps a feature name to the inclusive, 0-based (first, last)
# column indices of the consecutive codes that belong to it.  The feature name
# is made of the first two runs obtained when a code is cut at every
# letter/non-letter boundary (a code such as "20A1" yields "20A").
feat_borders = {}
cur_feature = ""
lower = 0
for (i, code) in enumerate(features):
    # strip() keeps a trailing newline or stray blanks out of the feature name.
    feature = ''.join([''.join(g) for _, g in groupby(code.strip(), str.isalpha)][0:2])
    if feature != cur_feature:
        # A new run starts at column i, so the previous one ended at i - 1.
        if cur_feature != "":
            feat_borders[cur_feature] = (lower, i-1)
        lower = i
        cur_feature = feature
# Close the run that is still open when the codes are exhausted.  Its last
# column is the last index of the list (not that index minus one), and an
# empty name (e.g. from a trailing tab) is not a feature and is not recorded.
if cur_feature != "":
    feat_borders[cur_feature] = (lower, len(features) - 1)

In [34]:
states = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+",
",", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^", "_", "{", "|", "}", "~"]


def _multistate_symbol(bits):
    """Collapse the binary columns of one feature into a single character.

    Args:
        bits: string holding one taxon's columns for one feature.

    Returns:
        "?" if every column is "?", "-" if every column is "-", ``states[0]``
        if no column is "1", otherwise ``states[k + 1]`` where ``k`` is the
        position of the single "1".

    Raises:
        AssertionError: if "?" or "-" is mixed with other characters, or if
            more than one column is "1".
        IndexError: if the "1" sits at a position for which ``states`` has no
            character left.
    """
    unknown = bits.count("?")
    if unknown > 0:
        assert unknown == len(bits), "'?' mixed with other characters in one feature: %r" % (bits,)
        return "?"
    gaps = bits.count("-")
    if gaps > 0:
        assert gaps == len(bits), "'-' mixed with other characters in one feature: %r" % (bits,)
        return "-"
    ones = bits.count("1")
    assert ones <= 1, "more than one '1' in one feature: %r" % (bits,)
    if ones == 0:
        return states[0]
    ind = bits.index("1")
    if ind + 1 >= len(states):
        # Same exception type as the plain list lookup, but with an explanation.
        raise IndexError(
            "feature needs state %d but only %d state characters are available"
            % (ind + 1, len(states))
        )
    return states[ind + 1]


def convert_to_multistate(alignment_file_name, feat_borders):
    """Recode an alignment with one character per feature and write it out.

    The alignment is read in relaxed PHYLIP format.  For every entry of
    ``feat_borders`` the columns ``lower..upper`` (inclusive, 0-based) of each
    record are collapsed into one character (see ``_multistate_symbol``).  The
    result is written in relaxed PHYLIP format to the input path with its
    extension replaced by ``.MULTI.phy``, and the id -> sequence mapping is
    printed.

    Args:
        alignment_file_name: path of the relaxed PHYLIP alignment to convert.
        feat_borders: dict mapping feature name -> (lower, upper) column
            indices; features are emitted in the dict's iteration order.

    Returns:
        None.
    """
    # NOTE(review): this prints the largest (upper - lower), i.e. the widest
    # feature's column count minus one, not its number of states -- confirm
    # which figure is wanted.  The value is only printed, never used.
    max_states = max(upper - lower for (lower, upper) in feat_borders.values())
    print(max_states)
    align = AlignIO.read(alignment_file_name, "phylip-relaxed")

    # Collect the characters per record id, feature by feature.
    symbols = {}
    for rec in align:
        symbols[rec.id] = []
    for (lower, upper) in feat_borders.values():
        for rec in align:
            symbols[rec.id].append(_multistate_symbol(str(rec.seq[lower:upper+1])))
    matrix = {rec_id: "".join(chars) for (rec_id, chars) in symbols.items()}

    records = [SeqRecord(s, id=l) for (l, s) in matrix.items()]
    align = MultipleSeqAlignment(records, annotations={}, column_annotations={})
    # splitext only removes the extension of the last path component, so paths
    # without an extension or with dots in a directory name keep their stem.
    file_name = os.path.splitext(alignment_file_name)[0] + ".MULTI.phy"
    with open(file_name, "w") as f:
        writer = RelaxedPhylipWriter(f)
        writer.write_alignment(align)
    print(matrix)
    
# Recode morpho.phy with the feature column ranges held in feat_borders; the
# function writes its result to data/language_alignments/morpho.MULTI.phy.
convert_to_multistate("data/language_alignments/morpho.phy", feat_borders)

8
{'It': '13423132223221213211322113413121112611121112321112123222211116114341112111211123531211213511', 'Sp': '11323132223221213311122113413121112611111111321112123222211116114342112331211123531111213511', 'Cat': '11221132223221213311222113413121112611111112321122123222211111114341112111211121131211211470', 'Fr': '13423132223221213125222111413121112611111112121112121222211111114341111111211123131211211370', 'Ptg-E': '13423132223221213315222113413121112611112312321112121222211111114341112111211123131211213511', 'Rm': '11323132323223213211232123112121112611112312321112121222231116114342112331211121531212213511', 'Lat': '?2221112323225413412212155133121112411111221421112143123111323112212112111111122131211113556', 'ClG': '?2233112323221313315412145133121112411111221211132143323111321114332112111211123131211113556', 'Grk': '12332132323221213314222135113121112311112111421112143222111115114332112331211121131212113556', 'Got': '?222111?323222313135212155113121112411112?223111211413231113??11

In [37]:
def part_analysis_input(feat_borders, model):
    """Build the partition text: one ``<model>, <feature>=<first>-<last>`` line per feature.

    Args:
        feat_borders: dict mapping feature name -> (lower, upper) inclusive,
            0-based column indices.
        model: model label placed at the start of every line.

    Returns:
        The lines concatenated in the dict's iteration order, each ending
        with a newline and with both bounds shifted to 1-based numbering;
        an empty string when ``feat_borders`` is empty.
    """
    return "".join(
        model + ", " + name + '=' + str(first + 1) + "-" + str(last + 1) + "\n"
        for (name, (first, last)) in feat_borders.items()
    )
        
# Show the partition lines for all features, each labelled with the "BIN" model.
print(part_analysis_input(feat_borders, "BIN"))

BIN, 20A=1-7
BIN, 21A=8-12
BIN, 23A=13-17
BIN, 24A=18-22
BIN, 25A=23-27
BIN, 26A=28-33
BIN, 27A=34-36
BIN, 28A=37-40
BIN, 30A=41-45
BIN, 31A=46-48
BIN, 32A=49-51
BIN, 33A=52-60
BIN, 34A=61-66
BIN, 37A=67-71
BIN, 38A=72-76
BIN, 39A=77-81
BIN, 40A=82-86
BIN, 41A=87-91
BIN, 42A=92-94
BIN, 43A=95-100
BIN, 44A=101-106
BIN, 45A=107-110
BIN, 47A=111-112
BIN, 48A=113-116
BIN, 49A=117-125
BIN, 50A=126-131
BIN, 51A=132-140
BIN, 52A=141-143
BIN, 53A=144-151
BIN, 55A=152-154
BIN, 57A=155-158
BIN, 58A=159-160
BIN, 58B=161-164
BIN, 59A=165-168
BIN, 60A=169-174
BIN, 62A=175-182
BIN, 63A=183-184
BIN, 64A=185-187
BIN, 65A=188-189
BIN, 66A=190-193
BIN, 67A=194-195
BIN, 68A=196-199
BIN, 69A=200-204
BIN, 70A=205-209
BIN, 71A=210-213
BIN, 73A=214-215
BIN, 74A=216-218
BIN, 75A=219-221
BIN, 76A=222-224
BIN, 79A=225-228
BIN, 80A=229-233
BIN, 81A=234-240
BIN, 82A=241-243
BIN, 83A=244-246
BIN, 85A=247-251
BIN, 86A=252-254
BIN, 87A=255-258
BIN, 88A=259-264
BIN, 89A=265-268
BIN, 90A=269-275
BIN, 91A=276-278
BIN, 

In [None]:
GTR+G+FO, cob=1-1248