-
Notifications
You must be signed in to change notification settings - Fork 1
/
stcr_pairs.py
89 lines (73 loc) · 2.62 KB
/
stcr_pairs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# By Neerja Thakkar for "Balancing sensitivity and specificity in
# distinguishing TCR groups by CDR sequence similarity"
# See README for license information
__author__ = 'neerjathakkar'
import sys
import csv
from sw_scoring import getDistanceSW
from parse_sequences import txttoseqlist
# inputs (from command line): list of cluster files, directory, title
if len(sys.argv) < 3:
print("need the following arguments: output directory, a3 or b3, repertoire files")
sys.exit()
else:
output_directory = sys.argv[1]
cdr_type = sys.argv[2]
repertoire_file_list = sys.argv[3].split(',')
all_seqs = []
# read in the clusters
seq_to_rep = {}
rep_to_seq = {}
i = 0
for file in repertoire_file_list:
i += 1
content = txttoseqlist(file, trim_first=False)
print(file)
print(len(content))
rep_to_seq[i] = content
all_seqs += content
for cdr in content:
if cdr in seq_to_rep:
found = False
for nums in seq_to_rep[cdr]:
if nums == i:
found = True
if not found:
seq_to_rep[cdr].append(i)
else:
seq_to_rep[cdr] = [i]
# closest distances
# [cdr1, cdr1 cluster, cdr2, cdr2 cluster, distance]
closest_distances = []
for curr_i in rep_to_seq:
# extract cluster
cluster = rep_to_seq[curr_i]
# find smallest within-cluster distance
for cdr1 in cluster:
min_distance = 2.0
min_distance_cdr = None
for cdr2 in cluster:
if cdr1 != cdr2:
dist = getDistanceSW(cdr1, cdr2)
if dist < min_distance:
min_distance = dist
min_distance_cdr = cdr2
closest_distances.append([cdr1, curr_i, min_distance_cdr, curr_i, min_distance])
# find smallest between-cluster distance
for cdr1 in cluster:
min_distance = 2.0
min_distance_cdr = None
min_distance_cluster = None
for structural_cluster_i in rep_to_seq:
if structural_cluster_i != curr_i:
for cdr2 in rep_to_seq[structural_cluster_i]:
dist = getDistanceSW(cdr1, cdr2)
if dist < min_distance:
min_distance = dist
min_distance_cdr = cdr2
min_distance_cluster = structural_cluster_i
closest_distances.append([cdr1, curr_i, min_distance_cdr, min_distance_cluster, min_distance])
file = output_directory + str(cdr_type) + "_stcr_pairs.csv"
with open(file, "w") as output_full:
writer = csv.writer(output_full, lineterminator='\n')
writer.writerows(closest_distances)