In [5]:
import pandas as pd

def extract_sequences(input_file, output_file='fasta_ab_1031.fasta'):
    """Export the antibody sequences of an Excel workbook as a FASTA file.

    The sequences are read from the third column of the first sheet (heavy and
    light chains are not distinguished there). Each non-empty cell becomes one
    record with the header ``>Sequence<row>``, where ``<row>`` is the 1-based
    position of the cell within the column, so headers keep pointing at the
    original rows even when empty cells are skipped.

    Args:
        input_file: Path of the Excel workbook to read.
        output_file: Path of the FASTA file to write. Defaults to
            ``'fasta_ab_1031.fasta'`` in the current working directory.
    """
    first_sheet = pd.read_excel(input_file, sheet_name=0)
    sequence_cells = first_sheet.iloc[:, 2].tolist()

    with open(output_file, 'w') as ab_file:
        for i, sequence in enumerate(sequence_cells):
            # An empty Excel cell is read as NaN; writing it would create a
            # bogus record whose sequence is the literal text "nan".
            if pd.isna(sequence):
                continue
            # Stray spaces or line breaks inside a cell would break the
            # one-line-per-sequence layout of the output.
            sequence = str(sequence).strip()
            if not sequence:
                continue
            ab_file.write(f'>Sequence{i + 1}\n{sequence}\n')

# Convert the Ab_sequences_1031 workbook into a FASTA file.
# NOTE(review): the path is absolute and machine-specific; the call fails on
# any machine where this file does not exist at this exact location.
extract_sequences('/Users/yscao/mmeng/deeplearning/ANARCI/CDR_annotation/optimizeScript/Ab_sequences_1031.xlsx')

In [33]:
# seq = 'DSVTQTEGLVTVTEGLPVKLNCTYQTTYLTIAFFWYVQYLNEAPQVLLKSSTDNKRTEHQGFHATLHKSSSSFHLQKSSAQLSDSALYYCALSEGGNYKYVFGAGTRLKVIAHIQNPEPAVYQLKDPRSQDSTLCLFTDFDSQINVPKTMESGTFITDKTVLDMKAMDSKSNGAIAWSNQTSFTCQDIFKETNATYPSSDVPC'

# seq = 'MSENSCTHFPGNLPNMLRDLRDAFSRVKTFFQMKDQLDNLLLKESLLEDFKGYLGCQALSEMIQFYLEEVMPQAENQDPDIKAHVNSLGENLKTLRLRLRRCHRFLPCENGGGSGGKSKAVEQVKNAFNKLQEKGIYKAMSEFDIFINYIEAYMTMKIRN'

# C_positions = [i+1 for i in range(len(seq)) if seq[i]=='C']
# print(f'number of Cys: {len(C_positions)} \n Cys positions: {C_positions}\n')

# W_positions = [i+1 for i in range(len(seq)) if seq[i]=='W']
# print(f'number of W: {len(W_positions)} \n W positions: {W_positions}\n')

# Sequence counter. NOTE(review): no code visible in this cell reads or updates
# it; the only reference (inside get_CW_info) is commented out.
idx_sequence = 0

# Count the C/W residues and record their positions; sequences that miss the
# required counts are reported on stdout, the others get their positions back.
def get_CW_info(seq, bad_num=0):
    """Locate every 'C' and 'W' character of ``seq``.

    Args:
        seq: Sequence to scan; characters are compared against upper-case
            'C' and 'W' only.
        bad_num: Incremented locally when the count check fails. The new
            value is not returned, so for an int argument the caller never
            sees the change.

    Returns:
        ``(C_positions, W_positions)`` as lists of 1-based positions when
        ``seq`` holds at least two 'C' and at least one 'W'. Otherwise a
        report is printed and ``(None, None)`` is returned.
    """
    cys_sites, trp_sites = [], []
    for position, residue in enumerate(seq, start=1):
        if residue == 'C':
            cys_sites.append(position)
        elif residue == 'W':
            trp_sites.append(position)

    cys_total = len(cys_sites)
    trp_total = len(trp_sites)

    # Enough residues of both kinds: hand the positions back untouched.
    if cys_total >= 2 and trp_total >= 1:
        return cys_sites, trp_sites

    bad_num += 1
    print('the number of C/W is not qualified')
    print(f'number of Cys: {cys_total}\npositions: {cys_sites}')
    print(f'number of W: {trp_total}\npositions: {trp_sites}\n')
    return None, None
    
def cal_distance(C_list, W_list):
    """Compute the spacings between neighbouring C positions and each W position.

    Args:
        C_list: List of C positions.
        W_list: List of W positions.

    Returns:
        A tuple ``(C1C2_distance, C1W_distance, WC2_distance)`` of lists:

        * ``C1C2_distance`` - for each pair of neighbouring entries of
          ``C_list``, the later entry minus the earlier one.
        * ``C1W_distance`` - for each neighbouring pair and each W position,
          the W position minus the first entry of the pair.
        * ``WC2_distance`` - for each neighbouring pair and each W position,
          the second entry of the pair minus the W position.

        The last two lists are index-aligned: pairs vary slowest and W
        positions fastest. Values may be negative.
    """
    neighbour_pairs = list(zip(C_list, C_list[1:]))

    gaps_c1_c2 = [right - left for left, right in neighbour_pairs]
    gaps_c1_w = [w_pos - left for left, _ in neighbour_pairs for w_pos in W_list]
    gaps_w_c2 = [right - w_pos for _, right in neighbour_pairs for w_pos in W_list]
    return gaps_c1_c2, gaps_c1_w, gaps_w_c2
        

def in_range(c1c2, distance_range):
    """Tell whether any distance lies inside an inclusive window.

    Args:
        c1c2: Iterable of distances to test (despite the name, any distance
            list can be passed).
        distance_range: Window whose items 0 and 1 are the inclusive lower
            and upper bounds.

    Returns:
        True when at least one distance falls within the window, else False
        (also for an empty iterable).
    """
    return any(
        distance_range[0] <= dis <= distance_range[1]
        for dis in c1c2
    )


def _has_conserved_triplet(C_positions, W_positions, distance_range):
    """Return True when one C1-W-C2 triplet satisfies all three windows at once."""
    c1c2_low, c1c2_high = distance_range['c1c2']
    c1w_low, c1w_high = distance_range['c1w']
    wc2_low, wc2_high = distance_range['wc2']

    # Only neighbouring C positions are paired, mirroring cal_distance.
    for c1, c2 in zip(C_positions, C_positions[1:]):
        if not c1c2_low <= c2 - c1 <= c1c2_high:
            continue
        for w in W_positions:
            if c1w_low <= w - c1 <= c1w_high and wc2_low <= c2 - w <= wc2_high:
                return True
    return False


def main(input_fasta_file):
    """Screen every sequence of a FASTA file for the C1-W-C2 spacing pattern.

    Each sequence line is echoed, its C/W positions are collected with
    ``get_CW_info`` and the distances from ``cal_distance`` are printed. A
    sequence is reported as qualified only when a single triplet (a pair of
    neighbouring C positions plus one W position) meets the C1-C2, C1-W and
    W-C2 windows simultaneously. Testing the three distance lists
    independently would also accept sequences in which different C/W
    combinations each satisfy just one window.

    Header lines (anything starting with '>') and blank lines are skipped, and
    surrounding whitespace, including the trailing newline, is removed before
    a sequence is analysed. Each sequence is expected on a single line.

    Args:
        input_fasta_file: Path of the FASTA file to screen.
    """
    # Inclusive (min, max) windows for each distance.
    distance_range = {
        'c1c2': (60, 82),
        'c1w':  (10, 19),
        'wc2':  (50, 64)
    }

    with open(input_fasta_file, 'r') as in_file:
        for raw_line in in_file:
            line = raw_line.strip()
            if not line or line.startswith('>'):
                continue

            print(f'*****Sequence: {line}******')
            C_positions, W_positions = get_CW_info(line)
            if C_positions is None or W_positions is None:
                # get_CW_info has already printed why the sequence was rejected.
                continue

            c1c2_dis, c1w_dis, wc2_dis = cal_distance(C_positions, W_positions)
            if _has_conserved_triplet(C_positions, W_positions, distance_range):
                print(f'qualify the number of C/W and the distance between them')
                print(f'C positions: {C_positions}\nW positions: {W_positions}\n')
                print(f'c1c2_dis: {c1c2_dis}\nc1w_dis: {c1w_dis}\nwc2_dis: {wc2_dis}\n')
            else:
                print(f'C_positions: {C_positions}\nW_positions: {W_positions}')
                print(f'c1c2_distance: {c1c2_dis}\n c1w_distance: {c1w_dis} \n wc2_distance: {wc2_dis}\n\n')


# Run the C/W screen only when this code is executed as the main module.
if __name__ == '__main__':

    # NOTE(review): absolute, machine-specific path. Presumably this is the
    # file written by extract_sequences, which uses a path relative to the
    # working directory - confirm both refer to the same file.
    input_fasta_file = '/Users/yscao/mmeng/deeplearning/ANARCI/CDR_annotation/optimizeScript/fasta_ab_1031.fasta'

    main(input_fasta_file)

The number of C/W is not qualified
In Sequence: AKQNVSSLDEKNSVSVDLPGEMKVLVSKEKNKDGKYDLIATVDKLELKGTSDKNNGSGVLEGVKADKCKVKLTISDDLGQTTLEVFKEDGKTLVSKKVTSKDKSSTEEKFNEKGEVSEKIITRADGTRLEYTGIKSDGSGKAKEVLKGYVLEGTLTAEKTTLVVKEGTVTLSKNISKSGEVSVELNDTDSSAATKKTAAWNSGTSTLTITVNSKKTKDLVFTKENTITVQQYDSNGTKLEGSAVEITKLDEIKNALK
	 length: 258
number of Cys: 1 
  positions: [68]
number of W: 1 
  positions: [200]

The number of C/W is not qualified
In Sequence: MSENSCTHFPGNLPNMLRDLRDAFSRVKTFFQMKDQLDNLLLKESLLEDFKGYLGCQALSEMIQFYLEEVMPQAENQDPDIKAHVNSLGENLKTLRLRLRRCHRFLPCENGGGSGGKSKAVEQVKNAFNKLQEKGIYKAMSEFDIFINYIEAYMTMKIRN
	 length: 161
number of Cys: 4 
  positions: [6, 56, 102, 108]
number of W: 0 
  positions: []

Line: DSVTQTEGLVTVTEGLPVKLNCTYQTTYLTIAFFWYVQYLNEAPQVLLKSSTDNKRTEHQGFHATLHKSSSSFHLQKSSAQLSDSALYYCALSEGGNYKYVFGAGTRLKVIAHIQNPEPAVYQLKDPRSQDSTLCLFTDFDSQINVPKTMESGTFITDKTVLDMKAMDSKSNGAIAWSNQTSFTCQDIFKETNATYPSSDVPC
 qualify the number of C/W and the distance between them

The number of C/W is not qualified
