In [1]:
from itertools import combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Names of the four CATH hierarchy columns, in the order they appear in the file.
levels = ['class', 'architecture', 'topology', 'superfamily']
# Column names for the fields that follow the domain ID on each line.
columns = levels + ['S35', 'S60', 'S95', 'S100', 'count', 'length', 'resolution']
# The file is whitespace-delimited; the first field (domain ID, e.g. 107lA00)
# becomes the row index. The separator is a raw string because '\s' in a plain
# literal is an invalid escape sequence and triggers a warning on current Python.
df = pd.read_csv('cath/cath-domain-list-S100.txt', sep=r'\s+', names=columns, index_col=0)

In [16]:
# Treat the hierarchy codes as strings so they can be joined with dots.
df = df.astype({level: str for level in levels})

# Prefix every level with its (already prefixed) parent label, so that e.g.
# class "1" + architecture "10" becomes "1.10", then topology "1.10.530", etc.
# The pairs are processed in order because each step reads the previous result.
for parent, child in zip(levels, levels[1:]):
    df[child] = df[parent].str.cat(df[child], sep='.')

# Keep only the domains listed (one ID per line) in domain.lst, in that order.
with open('domain.lst', 'r') as fh:
    domains = fh.read().splitlines()
df = df.loc[domains]

In [17]:
# Display the domain table after restricting it to the IDs from domain.lst.
df

Unnamed: 0,class,architecture,topology,superfamily,S35,S60,S95,S100,count,length,resolution
107lA00,1,1.10,1.10.530,1.10.530.40,2,1,4,10,1,162,1.80
108lA00,1,1.10,1.10.530,1.10.530.40,2,1,4,11,1,162,1.80
109lA00,1,1.10,1.10.530,1.10.530.40,2,1,4,21,1,162,1.85
110lA00,1,1.10,1.10.530,1.10.530.40,2,1,4,5,1,162,1.70
111lA00,1,1.10,1.10.530,1.10.530.40,2,1,4,12,1,162,1.80
...,...,...,...,...,...,...,...,...,...,...,...
1b7iA00,3,3.90,3.90.1210,3.90.1210.10,1,3,1,5,1,66,1.65
1b7jA00,3,3.90,3.90.1210,3.90.1210.10,1,3,1,6,1,66,1.65
1b7kA00,3,3.90,3.90.1210,3.90.1210.10,1,3,1,18,1,66,2.50
1b7lA00,1,1.10,1.10.530,1.10.530.10,1,2,1,14,1,130,1.80


In [19]:
# Print how many distinct labels exist at each hierarchy level.
for level, n_unique in df[levels].nunique().items():
    print(level, n_unique)

class 4
architecture 26
topology 223
superfamily 328


In [22]:
# Number of domains per unit at each hierarchy level: {level: {unit: size}}.
sizes = {}
for level in levels:
    unit_counts = df[level].value_counts()
    sizes[level] = unit_counts.to_dict()

In [24]:
# Write one "level<TAB>unit<TAB>size" line per unit, with levels in their
# defined order and units sorted by label within each level.
with open('size.tsv', 'w') as f:
    for level in levels:
        level_sizes = sizes[level]
        for unit in sorted(level_sizes):
            f.write(f'{level}\t{unit}\t{level_sizes[unit]}\n')

In [None]:
# Write the pairwise ground truth: for every unordered pair of domains, one
# 0/1 flag per hierarchy level saying whether the two domains share the same
# label at that level. Requires `combinations` from itertools (imported in the
# notebook's import cell).

# Fetch each domain's labels once. Doing a df.loc scalar lookup for every level
# of every pair is far slower than a dict lookup of a pre-built tuple.
labels = dict(zip(df.index, df[levels].itertuples(index=False, name=None)))

with open('truth.tsv', 'w') as fh:
    print('a', 'b', *levels, sep='\t', file=fh)
    for a, b in combinations(domains, 2):
        # Tuples follow the order of `levels`, matching the header columns.
        flags = (str(int(x == y)) for x, y in zip(labels[a], labels[b]))
        print(a, b, *flags, sep='\t', file=fh)

In [2]:
# CATH hierarchy level names (same list as defined at the start of the notebook).
levels = ['class', 'architecture', 'topology', 'superfamily']

In [3]:
# Scoring methods to compare. Each name is used both as the file stem under
# data/ and as the name of that method's score column.
methods = ['TM-align', 'TM-Vec', 'TM-Vec 2', 'TM-Vec Student']

In [4]:
# Load one score table per method and rename its generic 'tm_score' column to
# the method name so the columns stay distinguishable after merging.
dfs = []
for method in methods:
    scores = pd.read_csv(f'data/{method}.csv.gz')
    dfs.append(scores.rename(columns={'tm_score': method}))

In [5]:
# The TM-Vec tables carry a suffix such as "/1-150" after each sequence ID.
# Keep only the part before the first '/'. The first table in `dfs` is skipped.
for df in dfs[1:]:
    for col in ('seq1_id', 'seq2_id'):
        parts = df[col].str.split('/')
        df[col] = parts.str[0]

In [6]:
# Drop the "cath|4_4_0|" prefix by keeping the third '|'-separated field of
# each sequence ID, in every table.
for df in dfs:
    for col in ('seq1_id', 'seq2_id'):
        fields = df[col].str.split('|')
        df[col] = fields.str[2]

In [7]:
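# NOTE: disabled leftover. FoldSeek is not in `methods`, so dfs[3] is now the
# 'TM-Vec Student' table and this cast no longer applies as written.
# FoldSeek tm_score is object because some are NaN. Cast it to float.
# df = dfs[3]
# df['foldseek'] = df['foldseek'].astype(float)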

In [8]:
# Each unordered pair appears once, but which member is seq1 and which is seq2
# is not fixed across tables. Build an order-independent key
# "<smaller_id>,<larger_id>" and use it as the index of every table.
for frame in dfs:
    # Sort the two IDs within each row in a single vectorised call. A row-wise
    # DataFrame.apply(axis=1) does the same but is very slow at this row count.
    # The object dtype keeps plain Python string comparison.
    ids = np.sort(frame[['seq1_id', 'seq2_id']].to_numpy(dtype=object), axis=1)
    frame['seq_ids'] = ids[:, 0] + ',' + ids[:, 1]
    # The tables are modified in place because `dfs` holds the references that
    # the later concat uses.
    frame.drop(['seq1_id', 'seq2_id'], axis=1, inplace=True)
    frame.set_index('seq_ids', inplace=True)

In [9]:
# Combine data into one dataframe: the per-method tables are placed side by
# side and aligned on their shared 'seq_ids' index, giving one score column per
# method. Pairs missing from a table get NaN in that method's column.
df = pd.concat(dfs, axis=1)

In [10]:
# Keep only the pairs that have a score from every method.
df = df.dropna(how='any')

In [11]:
# Number of pairs remaining after dropping rows with any missing score.
df.shape[0]

491536

In [12]:
# Preview the combined score table (index: 'seq_ids', one column per method).
df.head()

Unnamed: 0_level_0,TM-align,TM-Vec,TM-Vec 2,TM-Vec Student
seq_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"107lA00,108lA00",0.99929,0.9999,0.985609,0.999887
"107lA00,109lA00",0.99945,0.999929,0.990814,0.999898
"107lA00,110lA00",0.99947,0.99992,0.987689,0.999887
"107lA00,111lA00",0.99929,0.99996,0.996974,0.999885
"107lA00,112lA00",0.99892,0.999958,0.992665,0.999874


In [13]:
# Save the combined score table. Note that the file is tab-separated even
# though its name ends in .csv.
df.to_csv('tmvecs.csv', sep='\t')

In [14]:
# Load the pairwise ground-truth table (tab-separated, header row included)
# and display it.
cath = pd.read_csv('truth.tsv', sep='\t')
cath

Unnamed: 0,a,b,class,architecture,topology,superfamily
0,107lA00,108lA00,1,1,1,1
1,107lA00,109lA00,1,1,1,1
2,107lA00,110lA00,1,1,1,1
3,107lA00,111lA00,1,1,1,1
4,107lA00,112lA00,1,1,1,1
...,...,...,...,...,...,...
499495,1b7jA00,1b7lA00,0,0,0,0
499496,1b7jA00,1b7mA00,0,0,0,0
499497,1b7kA00,1b7lA00,0,0,0,0
499498,1b7kA00,1b7mA00,0,0,0,0


In [152]:
# Build the join key for each truth row. The score tables are keyed by the two
# IDs in sorted order ("<smaller_id>,<larger_id>"), so sort each (a, b) pair the
# same way here. Without this, any pair stored as (larger, smaller) in
# truth.tsv would silently fail to match and be dropped by the later join.
# When the pairs are already in sorted order, the result is unchanged.
pair_ids = np.sort(cath[['a', 'b']].to_numpy(dtype=object), axis=1)
cath['pair'] = pair_ids[:, 0] + ',' + pair_ids[:, 1]

In [153]:
# Index the truth table by the pair key so it can be aligned with the scores.
cath = cath.set_index('pair')

In [156]:
# The individual domain IDs are now encoded in the index, so drop both columns.
cath = cath.drop(columns=['a', 'b'])

In [162]:
# Put the truth labels and the method scores side by side, aligned on the pair
# key. dropna() then removes every row with a missing value, so only pairs that
# have both labels and a full set of scores remain.
conc = pd.concat([cath, df], axis=1).dropna()
conc

Unnamed: 0,class,architecture,topology,superfamily,TM-align,TM-Vec,TM-Vec 2,TM-Vec Student
"107lA00,108lA00",1,1,1,1,0.99929,0.999900,0.985609,0.999887
"107lA00,109lA00",1,1,1,1,0.99945,0.999929,0.990814,0.999898
"107lA00,110lA00",1,1,1,1,0.99947,0.999920,0.987689,0.999887
"107lA00,111lA00",1,1,1,1,0.99929,0.999960,0.996974,0.999885
"107lA00,112lA00",1,1,1,1,0.99892,0.999958,0.992665,0.999874
...,...,...,...,...,...,...,...,...
"1b7jA00,1b7lA00",0,0,0,0,0.26842,0.480856,0.424563,0.326075
"1b7jA00,1b7mA00",0,0,0,0,0.26723,0.490731,0.435040,0.325220
"1b7kA00,1b7lA00",0,0,0,0,0.27057,0.478964,0.352078,0.320446
"1b7kA00,1b7mA00",0,0,0,0,0.26516,0.489424,0.350214,0.319531


In [164]:
# Save the joined labels-and-scores table as a tab-separated file.
conc.to_csv('tmvecs.tsv', sep='\t')

In [163]:
# Shape of the score-only table (rows = pairs, columns = methods).
df.shape

(491536, 4)