In [1]:
import os
import glob
from tqdm import tqdm
import pandas as pd
import numpy as np
import numba

In [2]:
# 文件夹路径
folder_path = '/xiongjun/test/MIL/share/MixResult_UID_All'
# 获取所有文件名
file_names = glob.glob(os.path.join(folder_path, '*.txt'))


file_names = file_names
print(len(file_names))

1076


In [3]:
# 读取txt文件
merged_df = pd.read_csv("/xiongjun/test/MIL/result.txt", sep="\s+", header=None, names=["code", "number", "label"])

# 删除"number"列
merged_df = merged_df.drop(columns=["number"])
merged_df.loc[merged_df['label'] == 'prostate(lung)', 'label'] = 'lung'
merged_df = merged_df[merged_df['label'] != 'breast（wrong）']
merged_df.head()

Unnamed: 0,code,label
0,H2001A008,breast
1,H2001A010,breast
2,H2001A011,breast
3,H2001A016,breast
4,H2001A017,breast


In [4]:
# 定义数据
data = "A,-0.591,-1.302,-0.733,1.570,-0.146;C,-1.343,0.465,-0.862,-1.020,-0.255;D,1.050,0.302,-3.656,-0.259,-3.242;E,1.357,-1.453,1.477,0.113,-0.837;F,-1.006,-0.590,1.891,-0.397,0.412;G,-0.384,1.652,1.330,1.045,2.064;H,0.336,-0.417,-1.673,-1.474,-0.078;I,-1.239,-0.547,2.131,0.393,0.816;K,1.831,-0.561,0.533,-0.277,1.648;L,-1.019,-0.987,-1.505,1.266,-0.912;M,-0.663,-1.524,2.219,-1.005,1.212;N,0.945,0.828,1.299,-0.169,0.933;P,0.189,2.081,-1.628,0.421,-1.392;Q,0.931,-0.179,-3.005,-0.503,-1.853;R,1.538,-0.055,1.502,0.440,2.897;S,-0.228,1.399,-4.760,0.670,-2.647;T,-0.032,0.326,2.213,0.908,1.313;V,-1.337,-0.279,-0.544,1.242,-1.262;W,-0.595,0.009,0.672,-2.128,-0.184;Y,0.260,0.830,3.097,-0.838,1.512"

# 将数据分割成列表
data_list = [item.split(',') for item in data.split(';')]

# 创建DataFrame
atchley = pd.DataFrame(data_list, columns=["amino.acid", "f1", "f2", "f3", "f4", "f5"])

# 将f1-f5列转换为数值类型
atchley[["f1", "f2", "f3", "f4", "f5"]] = atchley[["f1", "f2", "f3", "f4", "f5"]].apply(pd.to_numeric)

# 将amino.acid列设置为索引
atchley.set_index("amino.acid", inplace=True)
atchley

Unnamed: 0_level_0,f1,f2,f3,f4,f5
amino.acid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,-0.591,-1.302,-0.733,1.57,-0.146
C,-1.343,0.465,-0.862,-1.02,-0.255
D,1.05,0.302,-3.656,-0.259,-3.242
E,1.357,-1.453,1.477,0.113,-0.837
F,-1.006,-0.59,1.891,-0.397,0.412
G,-0.384,1.652,1.33,1.045,2.064
H,0.336,-0.417,-1.673,-1.474,-0.078
I,-1.239,-0.547,2.131,0.393,0.816
K,1.831,-0.561,0.533,-0.277,1.648
L,-1.019,-0.987,-1.505,1.266,-0.912


In [5]:
atchley_dict = {row[0]: list(row[1:]) for row in atchley.itertuples()}

In [6]:
values = merged_df['label'].unique().tolist()
print(values)

['breast', 'prostate', 'lung', 'liver', 'pancreas', 'colorectal', 'health']


In [7]:
# 定义一个函数，将氨基酸序列转换为数值列表
def sequence_to_values(sequence):
    return [atchley_dict[amino_acid] for amino_acid in sequence]

In [8]:
def calRA(raw_data, sample_name, keep, types, RA_save_dir ,amino_acids=set('ACDEFGHIKLMNPQRSTVWY')):
    if len(raw_data) <= 1000:
        return
    raw_data['aaSeqCDR3_length'] = raw_data['aaSeqCDR3'].str.len()

    # 计算所有aaSeqCDR3长度相同的行中的cloneCount的和
    grouped = raw_data.groupby('aaSeqCDR3_length')['cloneFraction'].sum()

    # 对grouped进行排序
    sorted_grouped = grouped.sort_values(ascending=False)

    # 计算累积和
    cumsum = sorted_grouped.cumsum()

    # 找到保持原来cloneCount总数100*keep%以上的那些行
    mask = cumsum <= cumsum.iloc[-1] * keep
    if not mask.any():  # 如果 mask 全为 False
        mask.iloc[0] = True  # 将第一行的值设为 True
    filtered_grouped = sorted_grouped[mask]
    data_filtered = raw_data[raw_data['aaSeqCDR3_length'].isin(filtered_grouped.index)]
    num_rows_filtered = len(data_filtered)
    max_length = data_filtered['aaSeqCDR3_length'].max()
    del raw_data
    # 去掉第一个和最后三个值
    data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
    # 找到aaSeqCDR3中元素的最大长度
    max_length = data_filtered['aaSeqCDR3'].str.len().max()
    # 使用'-'在末尾进行填充到aaSeqCDR3中元素的最大长度
    data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
    length = len(data_filtered['aaSeqCDR3'].iloc[0])

    # 对于每个4-mer序列
    for i in range(length - 3):
        # 创建新的列
        data_filtered[f'4-mer-{i+1}'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[i:i+4] if set(x[i:i+4]).issubset(amino_acids) else np.nan)

    
    df_backup = data_filtered.copy()

    # 找到所有的 '4-mer' 列
    four_mer_columns = data_filtered.filter(regex='4-mer')

    # 将 '4-mer' 列和 'ratio' 列合并
    df_melted = pd.melt(data_filtered, id_vars='cloneFraction', value_vars=four_mer_columns.columns, var_name='4-mer_col', value_name='4-mer')
    # 删除4-mer列中的NaN值
    df_melted = df_melted.dropna(subset=['4-mer'])
    # 计算每种4-mer的最大ratio值
    TCR_RA_stats = df_melted.groupby('4-mer')['cloneFraction'].max().reset_index()
    TCR_RA_stats.columns = ['4-mer', 'RA']
    # if sample_counts % save_interval == 0:
    #     sns.set_theme(style="whitegrid")
    #     # 创建图形和轴
    #     plt.figure(figsize=(10, 6))
    #     # 绘制直方图
    #     sns.histplot(TCR_RA_stats['RA'], kde=True, color="skyblue")
    #     # 添加一些标题和标签
    #     plt.title('TCR RA Distribution')
    #     plt.xlabel('RA')
    #     plt.ylabel('Frequency')
    #     figsave_dir =os.path.join('share', 'RA', 'figs' , types)
    #     if not os.path.exists(figsave_dir):
    #         os.makedirs(figsave_dir)
    #     plt.savefig(os.path.join(figsave_dir, sample_name + '_TCR_RA_dist.png'))


    data_filtered = df_backup
    # 初始化一个字典来存储每种4-mer序列的相对丰度值
    relative_abundance = {}

    # 遍历每一行数据
    for index, row in data_filtered.iterrows():
        # 遍历除了最后一列（cloneFraction）以外的所有列
        for col in four_mer_columns.columns:
            # 如果该列的值不是NaN，则将其相对丰度值累加到相应的键中
            if not pd.isna(row[col]):
                # 使用setdefault方法来初始化字典中键的默认值为0，然后累加cloneFraction值
                relative_abundance.setdefault(row[col], 0)
                relative_abundance[row[col]] += row['cloneFraction']



    _4mer_RA_stats = pd.DataFrame(list(relative_abundance.items()), columns=['4-mer', 'RA'])
    sumra = _4mer_RA_stats['RA'].sum()
    _4mer_RA_stats['RA'] = _4mer_RA_stats['RA'] / sumra
    # flattened_df = flatten_data(data_filtered, length)
    # flattened_data = []
    # for index, row in data_filtered.iterrows():
    #     # 获取当前行的氨基酸序列，并去除重复项和NaN
    #     unique_aaseqs = set(row[f'4-mer-{i}'] for i in range(1, length - 2) if pd.notnull(row[f'4-mer-{i}']))
    #     for aaseq in unique_aaseqs:
    #         flattened_data.append({'4-mer': aaseq, 'cloneFraction': row['cloneFraction']})
    # flattened_df = pd.DataFrame(flattened_data)
    # C_values = flattened_df.groupby('4-mer')['cloneFraction'].sum().reset_index()\
    #     .set_index('4-mer')['cloneFraction'].squeeze()
    # sum_value = C_values.sum()
    # RA_values = C_values / sum_value
    # # 创建结果DataFrame
    # _4mer_RA_stats = pd.DataFrame({'4-mer': RA_values.index, 'RA': RA_values.values})

    # if sample_counts % save_interval == 0:
    #     sns.set_theme(style="whitegrid")
    #     # 创建图形和轴
    #     plt.figure(figsize=(10, 6))

    #     # 绘制直方图
    #     sns.histplot(_4mer_RA_stats['RA'], kde=True, color="skyblue")


    #     # 添加一些标题和标签
    #     plt.title('4-mer RA Distribution')
    #     plt.xlabel('RA')
    #     plt.ylabel('Frequency')
    #     figsave_dir =os.path.join('share', 'RA', 'figs' , types)
    #     if not os.path.exists(figsave_dir):
    #         os.makedirs(figsave_dir)
    #     plt.savefig(os.path.join(figsave_dir, sample_name + '_4mer_RA_dist.png'))
    
    RA_stats = _4mer_RA_stats.merge(TCR_RA_stats, on='4-mer', suffixes=('_4mer', '_TCR'))

    # 使用 '4-mer' 列的值创建新的列
    RA_stats['data'] = RA_stats['4-mer'].apply(sequence_to_values)
    RA_stats['sample'] = sample_name
    label = values.index(types)
    RA_stats['label'] = label
    saved_folder = os.path.join('data', RA_save_dir, str(label))
    if not os.path.exists(saved_folder):
        os.makedirs(saved_folder)
    RA_stats.set_index('4-mer', inplace=True)
    RA_stats.to_csv(os.path.join(saved_folder, sample_name + '_RA_stats.csv'))

In [9]:
def readRA(filename, keep, missingfiles):
    # 读取文件
    path_parts = os.path.split(filename)
    name = path_parts[1]
    raw_data = pd.read_csv(filename, sep='\t')
    raw_data['sample'] = name.split('.')[0] 
    if not merged_df['code'].isin([name.split('.')[0]]).any():
        print(f'{name} is not in the name2code file')
        missingfiles.append(name)
        return
    types = merged_df[merged_df['code'] == name.split('.')[0]]['label'].values[0]
    # 定义20个氨基酸字符
    amino_acids = set('ACDEFGHIKLMNPQRSTVWY')
    # 创建一个集合，用于收集所有不满足条件的字符
    invalid_chars = set()

    # 检查aaSeqCDR3列中的所有值，如果一个值包含不满足条件的字符，那么将这些字符添加到invalid_chars集合中
    raw_data['aaSeqCDR3'].apply(lambda seq: invalid_chars.update(set(seq) - amino_acids))
    # 创建一个新的列来存储aaSeqCDR3列中元素的长度
    df_group = raw_data.groupby('sample')
    RA_save = str(keep) + '_RA'
    for  sample_id, (sample_name, df) in enumerate(df_group):
        calRA(df, sample_name, keep, types, RA_save)

In [10]:
""" def readRA(filename, keep):
    # 读取文件
    raw_data = pd.read_csv(filename, sep=',', header=None)
    raw_data.columns = ['aaSeqCDR3', 'cloneCount', 'id', 'sample', 'sth']
    # 定义20个氨基酸字符
    amino_acids = set('ACDEFGHIKLMNPQRSTVWY')

    # 创建一个集合，用于收集所有不满足条件的字符
    invalid_chars = set()

    # 检查aaSeqCDR3列中的所有值，如果一个值包含不满足条件的字符，那么将这些字符添加到invalid_chars集合中
    raw_data['aaSeqCDR3'].apply(lambda seq: invalid_chars.update(set(seq) - amino_acids))
    # 创建一个新的列来存储aaSeqCDR3列中元素的长度
    
    df_group = raw_data.groupby('sample')
    
    raw_data['aaSeqCDR3_length'] = raw_data['aaSeqCDR3'].str.len()

    # 计算所有aaSeqCDR3长度相同的行中的cloneCount的和
    grouped = raw_data.groupby('aaSeqCDR3_length')['cloneCount'].sum()
    grouped.plot(kind='bar')
    plt.xlabel('Length of aaSeqCDR3')
    plt.ylabel('Sum of cloneCount')
    plt.title('abundance for each Length of aaSeqCDR3')
    path_parts = os.path.split(filename)
    sample_name = path_parts[1].split('.')[0]
    types = path_parts[0].split('/')[-2]
    plt.savefig(os.path.join('Data', 'RA',  'figs', path_parts[1].replace('.clonotypes.TRB.txt', 'TCR_length_dist.png')))

    # 对grouped进行排序
    sorted_grouped = grouped.sort_values(ascending=False)

    # 计算累积和
    cumsum = sorted_grouped.cumsum()

    # 找到保持原来cloneCount总数100*keep%以上的那些行
    mask = cumsum <= cumsum.iloc[-1] * keep
    filtered_grouped = sorted_grouped[mask]
    data_filtered = raw_data[raw_data['aaSeqCDR3_length'].isin(filtered_grouped.index)]
    num_rows_filtered = len(data_filtered)
    max_length = data_filtered['aaSeqCDR3_length'].max()
    del raw_data
    # 去掉第一个和最后三个值
    data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
    # 找到aaSeqCDR3中元素的最大长度
    max_length = data_filtered['aaSeqCDR3'].str.len().max()
    # 使用'-'在末尾进行填充到aaSeqCDR3中元素的最大长度
    data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
    length = len(data_filtered['aaSeqCDR3'].iloc[0])

    # 对于每个4-mer序列
    for i in range(length // 4):
        # 创建新的列
        data_filtered[f'4-mer-{i+1}'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[i*4:(i+1)*4] if set(x[i*4:(i+1)*4]).issubset(amino_acids) else np.nan)

    df_backup = data_filtered.copy()
    # 选择所有的 '4-mer' 列
    four_mer_columns = data_filtered.filter(regex='4-mer')
    # 选择 'aaSeqCDR3' 列和 'cloneCount' 列
    other_columns = data_filtered[['aaSeqCDR3', 'cloneCount']]
    # 合并所有的列
    data_filtered = pd.concat([four_mer_columns, other_columns], axis=1)

    cloneCount_sum = data_filtered.groupby('aaSeqCDR3')['cloneCount'].sum()
    data_filtered['ratio'] = data_filtered['cloneCount'] / data_filtered['aaSeqCDR3'].map(cloneCount_sum)
    flattened_data = []
    for index, row in data_filtered.iterrows():
        # 获取当前行的氨基酸序列，并去除重复项和NaN
        unique_aaseqs = set(row[f'4-mer-{i}'] for i in range(1, length//4 + 1) if pd.notnull(row[f'4-mer-{i}']))
        for aaseq in unique_aaseqs:
            flattened_data.append({'4-mer': aaseq, 'RA': row['ratio']})
    flattened_df = pd.DataFrame(flattened_data)
    TCR_RA_stats = flattened_df.groupby('4-mer')['RA'].max().reset_index()\
        .set_index('4-mer')['RA'].squeeze()
    TCR_RA_stats = TCR_RA_stats.to_frame().reset_index()
    TCR_RA_stats.columns = ['4-mer', 'RA']
    sns.set_theme(style="whitegrid")
    # 创建图形和轴
    plt.figure(figsize=(10, 6))

    # 绘制直方图
    sns.histplot(TCR_RA_stats['RA'], kde=True, color="skyblue")


    # 添加一些标题和标签
    plt.title('TCR RA Distribution')
    plt.xlabel('RA')
    plt.ylabel('Frequency')
    plt.savefig(os.path.join('Data', 'RA', 'figs' ,path_parts[1].replace('.clonotypes.TRB.txt', '_TCR_RA_dist.png')))

    ############################################################################################################
    data_filtered = df_backup
    # 选择所有的 '4-mer' 列
    four_mer_columns = data_filtered.filter(regex='4-mer')
    # 选择 'aaSeqCDR3' 列和 'cloneCount' 列
    other_columns = data_filtered[['cloneCount']]
    # 合并所有的列
    data_filtered = pd.concat([four_mer_columns, other_columns], axis=1)
    long_df = pd.melt(data_filtered, id_vars='cloneCount', value_vars=four_mer_columns.columns, var_name='4-mer_col', value_name='4-mer').dropna()
    # 计算每个4-mer的T值
    T_values = long_df.groupby('4-mer')['cloneCount'].sum()
    flattened_data = []
    for index, row in data_filtered.iterrows():
        # 获取当前行的氨基酸序列，并去除重复项和NaN
        unique_aaseqs = set(row[f'4-mer-{i}'] for i in range(1, length//4 + 1) if pd.notnull(row[f'4-mer-{i}']))
        for aaseq in unique_aaseqs:
            flattened_data.append({'4-mer': aaseq, 'cloneCount': row['cloneCount']})
    flattened_df = pd.DataFrame(flattened_data)
    C_values = flattened_df.groupby('4-mer')['cloneCount'].sum().reset_index()\
        .set_index('4-mer')['cloneCount'].squeeze()
    # 计算RA值
    RA_values = C_values / T_values
    # 创建结果DataFrame
    _4mer_RA_stats = pd.DataFrame({'4-mer': RA_values.index, 'RA': RA_values.values})
    # 绘制直方图
    sns.histplot(_4mer_RA_stats['RA'], kde=True, color="skyblue")


    # 添加一些标题和标签
    plt.title('4-mer RA Distribution')
    plt.xlabel('RA')
    plt.ylabel('Frequency')
    plt.savefig(os.path.join('Data', 'RA', 'figs', path_parts[1].replace('.clonotypes.TRB.txt', '_4mer_RA_dist.png')))
    
    RA_stats = _4mer_RA_stats.merge(TCR_RA_stats, on='4-mer', suffixes=('_4mer', '_TCR'))

    # 使用 '4-mer' 列的值创建新的列
    RA_stats['data'] = RA_stats['4-mer'].apply(sequence_to_values)
    RA_stats['sample'] = sample_name
    if types == 'cancer':
        label = 1
    else:
        label = 0
    RA_stats['label'] = label
    saved_folder = os.path.join('Data', 'RA', str(label))
    if not os.path.exists(saved_folder):
        os.makedirs(saved_folder)
    RA_stats.set_index('4-mer', inplace=True)
    RA_stats.to_csv(os.path.join(saved_folder, path_parts[1].replace('.clonotypes.TRB.txt', '_RA_stats.csv')))
    
 """

' def readRA(filename, keep):\n    # 读取文件\n    raw_data = pd.read_csv(filename, sep=\',\', header=None)\n    raw_data.columns = [\'aaSeqCDR3\', \'cloneCount\', \'id\', \'sample\', \'sth\']\n    # 定义20个氨基酸字符\n    amino_acids = set(\'ACDEFGHIKLMNPQRSTVWY\')\n\n    # 创建一个集合，用于收集所有不满足条件的字符\n    invalid_chars = set()\n\n    # 检查aaSeqCDR3列中的所有值，如果一个值包含不满足条件的字符，那么将这些字符添加到invalid_chars集合中\n    raw_data[\'aaSeqCDR3\'].apply(lambda seq: invalid_chars.update(set(seq) - amino_acids))\n    # 创建一个新的列来存储aaSeqCDR3列中元素的长度\n    \n    df_group = raw_data.groupby(\'sample\')\n    \n    raw_data[\'aaSeqCDR3_length\'] = raw_data[\'aaSeqCDR3\'].str.len()\n\n    # 计算所有aaSeqCDR3长度相同的行中的cloneCount的和\n    grouped = raw_data.groupby(\'aaSeqCDR3_length\')[\'cloneCount\'].sum()\n    grouped.plot(kind=\'bar\')\n    plt.xlabel(\'Length of aaSeqCDR3\')\n    plt.ylabel(\'Sum of cloneCount\')\n    plt.title(\'abundance for each Length of aaSeqCDR3\')\n    path_parts = os.path.split(filename)\n    sample_name = path_part

In [11]:
# 处理所有数据
missingfiles = []
for file_name in tqdm(file_names):
    readRA(file_name, 0.9, missingfiles)

  0%|          | 0/1076 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[f'4-mer-{i+1

H2001B315.clonotypes.TRB.txt is not in the name2code file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[f'4-mer-{i+1

H2001E109.clonotypes.TRB.txt is not in the name2code file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[f'4-mer-{i+1

H2001H565.clonotypes.TRB.txt is not in the name2code file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[f'4-mer-{i+1

H2001I070.clonotypes.TRB.txt is not in the name2code file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[f'4-mer-{i+1

H2001H517.clonotypes.TRB.txt is not in the name2code file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[f'4-mer-{i+1

H2001H023.clonotypes.TRB.txt is not in the name2code file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[f'4-mer-{i+1

H2001H522.clonotypes.TRB.txt is not in the name2code file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[f'4-mer-{i+1

H2001H557.clonotypes.TRB.txt is not in the name2code file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[f'4-mer-{i+1

H2001H561.clonotypes.TRB.txt is not in the name2code file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x[1:-3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['aaSeqCDR3'] = data_filtered['aaSeqCDR3'].apply(lambda x: x.ljust(max_length, '-'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[f'4-mer-{i+1

In [12]:
print(missingfiles)

['H2001B315.clonotypes.TRB.txt', 'H2001E109.clonotypes.TRB.txt', 'H2001H565.clonotypes.TRB.txt', 'H2001I070.clonotypes.TRB.txt', 'H2001H517.clonotypes.TRB.txt', 'H2001H023.clonotypes.TRB.txt', 'H2001H522.clonotypes.TRB.txt', 'H2001H557.clonotypes.TRB.txt', 'H2001H561.clonotypes.TRB.txt']
