In [1]:
# Load the required libraries
import pandas as pd
import numpy as np
import os
from IPython.display import display

# Define the right substring function
def substrRight(x, n):
    """Return the last ``n`` items of ``x``.

    Args:
        x: Any sliceable sequence; this notebook passes CDR3 amino-acid strings.
        n: Number of trailing items to keep.

    Returns:
        A slice of the same type as ``x`` holding at most ``n`` trailing items.
        If ``x`` is shorter than ``n`` the whole of ``x`` is returned; a
        non-positive ``n`` yields an empty slice.
    """
    # x[-0:] is x[0:], i.e. the *whole* sequence, so a zero (or negative)
    # count needs an explicit guard to return "nothing".
    if n <= 0:
        return x[:0]
    return x[-n:]

# Input file names
# All five paths are relative, so they resolve against the kernel's working directory.
input_excel = "adi.xlsx"  # clonotype table; its stem is reused for the output file name
avref_csv = "AVref.txt"  # alpha-chain V reference, read below as tab-separated text
bvref_csv = "BVref.txt"  # beta-chain V reference, read below as tab-separated text
ajref_csv = "AJrefv2.txt"  # alpha-chain J reference, read below as tab-separated text
bjref_csv = "BJref.txt"  # beta-chain J reference, read below as tab-separated text

# Function to detect delimiter (not needed for Excel)
def detect_delimiter(file_path):
    """Placeholder that performs no delimiter detection and always returns ``None``.

    The clonotype table is loaded with ``pd.read_excel`` and the reference
    files are read with a fixed tab separator, so ``file_path`` is ignored.
    Nothing in this cell calls the function; it is a candidate for removal.
    """
    return None

# Load the clonotype table from the Excel workbook
MS_tcrs = pd.read_excel(input_excel, sheet_name='adi')  # Adjust sheet_name as needed
print("Initial MS_tcrs:")
display(MS_tcrs.head())

# Show the raw header so stray spaces or formatting problems are visible
print("\nColumn names in the initial DataFrame:")
print(MS_tcrs.columns)

# Normalise the header: strip leading/trailing whitespace from every column name
MS_tcrs.columns = MS_tcrs.columns.str.strip()

# Show the header again after stripping
print("\nColumn names after stripping whitespace:")
print(MS_tcrs.columns)

# Every column the pipeline reads must be present before any processing starts.
# 'clonotype_id' is listed because the TCRCloneID step further down extracts the
# clonotype number from it; without this check a missing column would only
# surface as a KeyError after all of the assembly work had already run.
required_columns = ['TRAV', 'TRBV', 'TRBJ', 'TRAJ', 'CDR3a', 'CDR3b', 'patient_id', 'subset', 'clonotype_id']
missing_columns = [col for col in required_columns if col not in MS_tcrs.columns]

if missing_columns:
    # Nothing below can run without the required columns; report and stop.
    print(f"\nError: Missing columns in the input data - {missing_columns}")
else:
    # Normalise the gene names: TRAV loses any "/" first, then the V/J columns
    # handled here get the "*01" allele suffix appended (TRAJ is handled separately).
    MS_tcrs['TRAV'] = MS_tcrs['TRAV'].str.replace("/", "")
    for gene_col in ('TRBV', 'TRBJ', 'TRAV'):
        MS_tcrs[gene_col] = MS_tcrs[gene_col] + "*01"

    # Update TRAJ column based on conditions
    def update_TRAJ(row):
        """Return the allele-qualified TRAJ name(s) for one clonotype row.

        Args:
            row: Mapping-like row exposing the bare gene name under ``'TRAJ'``.

        Returns:
            A list with ``<gene>*01`` and ``<gene>*02`` for the genes listed
            below, otherwise a single-element list with ``<gene>*01``.
        """
        gene = row['TRAJ']
        # Genes for which both the *01 and *02 alleles are emitted
        two_allele_genes = ('TRAJ13', 'TRAJ15', 'TRAJ23', 'TRAJ24', 'TRAJ32', 'TRAJ37', 'TRAJ47')
        suffixes = ["*01", "*02"] if gene in two_allele_genes else ["*01"]
        return [gene + suffix for suffix in suffixes]

    # Replace each TRAJ value with its list of allele-qualified names
    traj_alleles = MS_tcrs.apply(update_TRAJ, axis=1)
    MS_tcrs['TRAJ'] = traj_alleles

    # One output row per allele: explode the lists, then renumber the rows so
    # the index is a clean 0..n-1 range again
    exploded_rows = MS_tcrs.explode('TRAJ')
    MS_tcrs_expanded = exploded_rows.reset_index(drop=True)

    print("\nExpanded MS_tcrs with both TRAJ versions:")
    display(MS_tcrs_expanded.head())

    # Read the four reference tables; all are assumed to be tab-separated text
    AVref, BVref, AJref, BJref = [
        pd.read_csv(ref_path, sep="\t")
        for ref_path in (avref_csv, bvref_csv, ajref_csv, bjref_csv)
    ]

    # Show the AJref header before it is trimmed to two columns
    print("\nAJref columns:")
    display(AJref.columns)

    # If an overflow column named 'X' is present, append it to TRAJseq, then keep
    # only the gene name and the sequence.
    # NOTE(review): the header recorded for AJref in this notebook's output lists
    # 'Unnamed: 2' / 'Unnamed: 3' rather than 'X', so this branch appears not to
    # fire for that file - confirm whether those columns carry sequence data.
    if 'X' in AJref.columns:
        AJref['TRAJseq'] = AJref['TRAJseq'] + AJref['X']
    AJref = AJref[['AlphaChain_JGene', 'TRAJseq']]
    print("\nUpdated AJref:")
    display(AJref.head())

    # Give every reference table the same key column name as the clonotype table
    AVref = AVref.rename(columns={'AlphaChain_VGene': 'TRAV'})
    BVref = BVref.rename(columns={'BetaChain_VGene': 'TRBV'})
    AJref = AJref.rename(columns={'AlphaChain_JGene': 'TRAJ'})
    BJref = BJref.rename(columns={'BetaChain_JGene': 'TRBJ'})

    # Preview each renamed table
    for ref_label, ref_table in (("AVref", AVref), ("BVref", BVref), ("AJref", AJref), ("BJref", BJref)):
        print(f"\nRenamed {ref_label}:")
        display(ref_table.head())

    # Sequences of mouse constant regions and the furin p2a linker
    # MouseTRBC is appended to every assembled beta chain (column 'Bcomplete').
    MouseTRBC = "EDLRNVTPPKVSLFEPSKAEIANKQKATLVCLARGFFPDHVELSWWVNGKEVHSGVCTDPQAYKESNYSYCLSSRLRVSATFWHNPRNHFRCQVQFHGLSEEDKWPEGSPKPVTQNISAEAWGRADCGITSASYQQGVLSATILYEILLGKATLYAVLVSTLVVMAMVKRKNS"
    # FurinP2A is placed between the completed beta chain and the alpha chain in 'Synthesis'.
    FurinP2A = "RRKRSGSGATNFSLLKQAGDVEENPGP"

    # Attach the reference sequence for each gene segment. Left joins keep every
    # clonotype row even when a gene name has no match in its reference table.
    for ref_table, key_column in ((BVref, "TRBV"), (BJref, "TRBJ"), (AVref, "TRAV"), (AJref, "TRAJ")):
        MS_tcrs_expanded = MS_tcrs_expanded.merge(ref_table, on=key_column, how="left")
    print("\nJoined MS_tcrs_expanded with references:")
    display(MS_tcrs_expanded.head())

    # Drop helper columns if they came along with the references; absent names are ignored
    leftover_columns = ['X.1', 'X']
    MS_tcrs_expanded = MS_tcrs_expanded.drop(columns=leftover_columns, errors='ignore')
    print("\nMS_tcrs_expanded after dropping unnecessary columns:")
    display(MS_tcrs_expanded.head())

    # Keep each V segment from its start through its last "C" (the match is greedy).
    # str.extract returns a one-column DataFrame, so the sequences live in column 0.
    v_through_last_c = r"(.*C)"
    x = MS_tcrs_expanded['TRBVseq'].str.extract(v_through_last_c)
    y = MS_tcrs_expanded['TRAVseq'].str.extract(v_through_last_c)
    print("\nTrimmed V gene segments:")
    display(x.head())
    display(y.head())

    # Last three residues of every CDR3; used afterwards to locate the CDR3 end
    # inside the J reference sequence
    cdrblast3 = MS_tcrs_expanded['CDR3b'].apply(substrRight, args=(3,))
    cdralast3 = MS_tcrs_expanded['CDR3a'].apply(substrRight, args=(3,))
    print("\nLast 3 amino acids of the CDR3 sequences:")
    display(cdrblast3.head())
    display(cdralast3.head())

    # Cut each J reference sequence at the last three residues of its CDR3 and
    # keep what follows. A row is usable only when the J sequence splits into
    # exactly two pieces (the CDR3 tail occurs exactly once).
    def _j_tail(j_seq, cdr3_end):
        """Return the part of ``j_seq`` after ``cdr3_end``, or None when no clean cut exists.

        None is returned when either value is not a string (e.g. NaN left by an
        unmatched left join), when ``cdr3_end`` is empty, or when ``cdr3_end``
        does not occur exactly once in ``j_seq``.
        """
        if not isinstance(j_seq, str) or not isinstance(cdr3_end, str) or not cdr3_end:
            return None
        pieces = j_seq.split(cdr3_end)
        return pieces[1] if len(pieces) == 2 else None

    # Split once per row; cdrblast3/cdralast3 are row-aligned with MS_tcrs_expanded
    b_tails = [_j_tail(j_seq, end) for j_seq, end in zip(MS_tcrs_expanded['TRBJseq'], cdrblast3)]
    a_tails = [_j_tail(j_seq, end) for j_seq, end in zip(MS_tcrs_expanded['TRAJseq'], cdralast3)]

    # Rows without a clean cut get an empty tail so assembly can still proceed
    xx = pd.Series(['' if tail is None else tail for tail in b_tails], index=MS_tcrs_expanded.index, dtype=object)
    yy = pd.Series(['' if tail is None else tail for tail in a_tails], index=MS_tcrs_expanded.index, dtype=object)
    print("\nSplit TCRB J and TRAJ sequences:")
    display(xx.head())
    display(yy.head())

    # Verify the splits: flag every row where either chain had no clean cut
    b2 = pd.Series([tail is None for tail in b_tails], index=MS_tcrs_expanded.index, dtype=bool)
    a2 = pd.Series([tail is None for tail in a_tails], index=MS_tcrs_expanded.index, dtype=bool)
    short = (a2 | b2)
    look = MS_tcrs_expanded[short]
    print("\nSequences needing manual editing if any:")
    display(look)

    # Assemble each chain as: trimmed V segment without its final residue + CDR3 + J tail.
    # Rows flagged in `short` carry an empty J tail here, so their assembled
    # sequences stop at the end of the CDR3.
    v_beta = x[0].str[:-1]
    v_alpha = y[0].str[:-1]
    MS_tcrs_expanded['Bseq'] = v_beta + MS_tcrs_expanded['CDR3b'] + xx
    # The beta chain is completed with the mouse constant region
    MS_tcrs_expanded['Bcomplete'] = MS_tcrs_expanded['Bseq'] + MouseTRBC
    MS_tcrs_expanded['Aseq'] = v_alpha + MS_tcrs_expanded['CDR3a'] + yy
    print("\nFull sequence of alpha and beta chain:")
    display(MS_tcrs_expanded[['Bseq', 'Bcomplete', 'Aseq']].head())

    # Synthesis construct: completed beta chain, linker, then the alpha chain
    synthesis_seq = MS_tcrs_expanded['Bcomplete'] + FurinP2A + MS_tcrs_expanded['Aseq']
    MS_tcrs_expanded['Synthesis'] = synthesis_seq
    print("\nFinal synthesis sequence:")
    display(MS_tcrs_expanded['Synthesis'].head())

    # Build TCRCloneID as <patient_id>_<subset>_TCR_<first run of digits in clonotype_id>.
    # Rows that were expanded into two TRAJ alleles share the same ID, so a
    # "duplicates found" message is expected whenever such rows exist.
    clonotype_number = MS_tcrs_expanded['clonotype_id'].str.extract(r"(\d+)")[0]
    MS_tcrs_expanded['TCRCloneID'] = (
        MS_tcrs_expanded['patient_id'] + "_" + MS_tcrs_expanded['subset'] + "_TCR_" + clonotype_number
    )
    has_duplicate_ids = MS_tcrs_expanded['TCRCloneID'].duplicated().any()
    if not has_duplicate_ids:
        print("No duplicate TCRCloneIDs found.")
    else:
        print("Duplicate TCRCloneIDs found!")

    # Derive the output file name from the input workbook's name (extension removed)
    output_file = os.path.splitext(input_excel)[0] + "_assembled_TCRs_for_synthesis.xlsx"

    # Save the full table for manual checking
    MS_tcrs_expanded.to_excel(output_file, index=False)

    # Print the header first, then the preview it announces (previously the
    # header was printed after the table, leaving it dangling at the end)
    print("\nSaved data for manual checking:")
    display(MS_tcrs_expanded.head())


Initial MS_tcrs:


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset
0,1,clonotype1,p1001436-4,TRBV6-2,TRBJ2-3,CASSYWAGRPTDTQYF,TRAV6,TRAJ43,CAQAYNNNDMRF,CD4
1,2,clonotype2,p1001436-4,TRBV20-1,TRBJ2-1,CSARAMGSGVYNEQFF,TRAV23DV6,TRAJ10,CAASWDTGGGNKLTF,CD4
2,3,clonotype3,p1001436-4,TRBV20-1,TRBJ2-1,CSARAMGSGVYNEQFF,TRAV25,TRAJ24,CPSWGKLQF,CD4
3,4,clonotype4,p1001436-4,TRBV4-2,TRBJ2-3,CASSQERAGGSTDTQYF,TRAV6,TRAJ30,CALNRDDKIIF,CD4
4,5,clonotype5,p1035804-1,TRBV5-1,TRBJ2-7,CASKGLAGEYYEQYF,TRAV36DV7,TRAJ53,CAPIGGSNYKLTF,CD4



Column names in the initial DataFrame:
Index(['rank', 'clonotype_id', 'patient_id', 'TRBV', 'TRBJ', 'CDR3b', 'TRAV',
       'TRAJ', 'CDR3a', 'subset'],
      dtype='object')

Column names after stripping whitespace:
Index(['rank', 'clonotype_id', 'patient_id', 'TRBV', 'TRBJ', 'CDR3b', 'TRAV',
       'TRAJ', 'CDR3a', 'subset'],
      dtype='object')

Expanded MS_tcrs with both TRAJ versions:


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset
0,1,clonotype1,p1001436-4,TRBV6-2*01,TRBJ2-3*01,CASSYWAGRPTDTQYF,TRAV6*01,TRAJ43*01,CAQAYNNNDMRF,CD4
1,2,clonotype2,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV23DV6*01,TRAJ10*01,CAASWDTGGGNKLTF,CD4
2,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*01,CPSWGKLQF,CD4
3,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*02,CPSWGKLQF,CD4
4,4,clonotype4,p1001436-4,TRBV4-2*01,TRBJ2-3*01,CASSQERAGGSTDTQYF,TRAV6*01,TRAJ30*01,CALNRDDKIIF,CD4



AJref columns:


Index(['AlphaChain_JGene', 'TRAJseq', 'Unnamed: 2', 'Unnamed: 3'], dtype='object')


Updated AJref:


Unnamed: 0,AlphaChain_JGene,TRAJseq
0,TRAJ1*01,YESITSQLQFGKGTRVSTSPM
1,TRAJ10*01,ILTGGGNKLTFGTGTQLKVELN
2,TRAJ11*01,NSGYSTLTFGKGTMLLVSPD
3,TRAJ12*01,MDSSYKLIFGSGTRLLVRPD
4,TRAJ13*01,NSGGYQKVTFGIGTKLQVIPN



Renamed AVref:


Unnamed: 0,TRAV,TRAVseq
0,TRAV1-1*01,MWGAFLLYVSMKMGGTAGQSLEQPSEVTAVEGAIVQINCTYQTSGF...
1,TRAV1-2*01,MWGVFLLYVSMKMGGTTGQNIDQPTEMTATEGAIVQINCTYQTSGF...
2,TRAV10*01,MKKHLTTFLVILWLYFYRGNGKNQVEQSPQSLIILEGKNCTLQCNY...
3,TRAV11*01,TEKPLGVSFLISSWQLCWVNRLHTLEQSPSFLNIQEGMHAVLNCTY...
4,TRAV12-1*01,MISLRVLLVILWLQLSWVWSQRKEVEQDPGPFNVPEGATVAFNCTY...



Renamed BVref:


Unnamed: 0,TRBV,TRBVseq
0,TRBV1*01,MG*SLHCGVVHCLRLHGYWNYPDTKIPGHSNGE*KDNET*ASGT*F...
1,TRBV10-1*01,MGTRLFFYVALCLLWAGHRDAEITQSPRHKITETGRQVTLACHQTW...
2,TRBV10-1*02,MGTRLFFYVALCLLWAGHRDAEITQSPRHKITETGRQVTLACHQTW...
3,TRBV10-2*01,MGTRLFFYVALCLLWAGHRDAGITQSPRYKITETGRQVTLMCHQTW...
4,TRBV10-3*01,MGTRLFFYVALCLLWTGHMDAGITQSPRHKVTETGTPVTLRCHQTE...



Renamed AJref:


Unnamed: 0,TRAJ,TRAJseq
0,TRAJ1*01,YESITSQLQFGKGTRVSTSPM
1,TRAJ10*01,ILTGGGNKLTFGTGTQLKVELN
2,TRAJ11*01,NSGYSTLTFGKGTMLLVSPD
3,TRAJ12*01,MDSSYKLIFGSGTRLLVRPD
4,TRAJ13*01,NSGGYQKVTFGIGTKLQVIPN



Renamed BJref:


Unnamed: 0,TRBJ,TRBJseq
0,TRBJ1-1*01,NTEAFFGQGTRLTVV
1,TRBJ1-2*01,NYGYTFGSGTRLTVV
2,TRBJ1-3*01,SGNTIYFGEGSWLTVV
3,TRBJ1-4*01,TNEKLFFGSGTQLSVL
4,TRBJ1-5*01,SNQPQHFGDGTRLSIL



Joined MS_tcrs_expanded with references:


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset,TRBVseq,TRBJseq,TRAVseq,TRAJseq
0,1,clonotype1,p1001436-4,TRBV6-2*01,TRBJ2-3*01,CASSYWAGRPTDTQYF,TRAV6*01,TRAJ43*01,CAQAYNNNDMRF,CD4,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NNNDMRFGAGTRLTVKPN
1,2,clonotype2,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV23DV6*01,TRAJ10*01,CAASWDTGGGNKLTF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...,ILTGGGNKLTFGTGTQLKVELN
2,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*01,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKFEFGAGTQVVVTPD
3,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*02,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKLQFGAGTQVVVTPD
4,4,clonotype4,p1001436-4,TRBV4-2*01,TRBJ2-3*01,CASSQERAGGSTDTQYF,TRAV6*01,TRAJ30*01,CALNRDDKIIF,CD4,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NRDDKIIFGKGTRLHILPN



MS_tcrs_expanded after dropping unnecessary columns:


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset,TRBVseq,TRBJseq,TRAVseq,TRAJseq
0,1,clonotype1,p1001436-4,TRBV6-2*01,TRBJ2-3*01,CASSYWAGRPTDTQYF,TRAV6*01,TRAJ43*01,CAQAYNNNDMRF,CD4,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NNNDMRFGAGTRLTVKPN
1,2,clonotype2,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV23DV6*01,TRAJ10*01,CAASWDTGGGNKLTF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...,ILTGGGNKLTFGTGTQLKVELN
2,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*01,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKFEFGAGTQVVVTPD
3,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*02,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKLQFGAGTQVVVTPD
4,4,clonotype4,p1001436-4,TRBV4-2*01,TRBJ2-3*01,CASSQERAGGSTDTQYF,TRAV6*01,TRAJ30*01,CALNRDDKIIF,CD4,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NRDDKIIFGKGTRLHILPN



Trimmed V gene segments:


Unnamed: 0,0
0,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...
1,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
2,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
3,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
4,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...


Unnamed: 0,0
0,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...
1,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...
2,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...
3,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...
4,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...



Last 3 amino acids of the CDR3 sequences:


0    QYF
1    QFF
2    QFF
3    QFF
4    QYF
Name: CDR3b, dtype: object

0    MRF
1    LTF
2    LQF
3    LQF
4    IIF
Name: CDR3a, dtype: object


Split TCRB J and TRAJ sequences:


0    GPGTRLTVL
1    GPGTRLTVL
2    GPGTRLTVL
3    GPGTRLTVL
4    GPGTRLTVL
dtype: object

0    GAGTRLTVKPN
1    GTGTQLKVELN
2               
3    GAGTQVVVTPD
4    GKGTRLHILPN
dtype: object


Sequences needing manual editing if any:


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset,TRBVseq,TRBJseq,TRAVseq,TRAJseq
2,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*01,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKFEFGAGTQVVVTPD
10,10,clonotype10,p1033198-5,TRBV3-1*01,TRBJ1-4*01,CASSQATGMGEKLFF,TRAV13-1*01,TRAJ24*01,CAASASFDSWGKLQF,CD4,MGCRLLCCVVFCLLQAGPLDTAVSQTPKYLVTQMGNDKSIKCEQNL...,TNEKLFFGSGTQLSVL,MTSIRAVFIFLWLQLDLVNGENVEQHPSTLSVQEGDSAVIKCTYSD...,TTDSWGKFEFGAGTQVVVTPD



Full sequence of alpha and beta chain:


Unnamed: 0,Bseq,Bcomplete,Aseq
0,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...
1,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...
2,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...
3,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...
4,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...



Final synthesis sequence:


0    MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...
1    MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
2    MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
3    MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
4    MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...
Name: Synthesis, dtype: object

Duplicate TCRCloneIDs found!


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset,TRBVseq,TRBJseq,TRAVseq,TRAJseq,Bseq,Bcomplete,Aseq,Synthesis,TCRCloneID
0,1,clonotype1,p1001436-4,TRBV6-2*01,TRBJ2-3*01,CASSYWAGRPTDTQYF,TRAV6*01,TRAJ43*01,CAQAYNNNDMRF,CD4,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NNNDMRFGAGTRLTVKPN,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,p1001436-4_CD4_TCR_1
1,2,clonotype2,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV23DV6*01,TRAJ10*01,CAASWDTGGGNKLTF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...,ILTGGGNKLTFGTGTQLKVELN,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,p1001436-4_CD4_TCR_2
2,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*01,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKFEFGAGTQVVVTPD,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,p1001436-4_CD4_TCR_3
3,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*02,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKLQFGAGTQVVVTPD,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,p1001436-4_CD4_TCR_3
4,4,clonotype4,p1001436-4,TRBV4-2*01,TRBJ2-3*01,CASSQERAGGSTDTQYF,TRAV6*01,TRAJ30*01,CALNRDDKIIF,CD4,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NRDDKIIFGKGTRLHILPN,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,p1001436-4_CD4_TCR_4



Saved data for manual checking:


In [5]:
# Load the required libraries
import pandas as pd
import numpy as np
import os
from IPython.display import display

# Define the right substring function
def substrRight(x, n):
    """Return the last ``n`` items of ``x``.

    Args:
        x: Any sliceable sequence; this notebook passes CDR3 amino-acid strings.
        n: Number of trailing items to keep.

    Returns:
        A slice of the same type as ``x`` holding at most ``n`` trailing items.
        If ``x`` is shorter than ``n`` the whole of ``x`` is returned; a
        non-positive ``n`` yields an empty slice.
    """
    # x[-0:] is x[0:], i.e. the *whole* sequence, so a zero (or negative)
    # count needs an explicit guard to return "nothing".
    if n <= 0:
        return x[:0]
    return x[-n:]

# Input file names
# All five paths are relative, so they resolve against the kernel's working directory.
input_excel = "adi.xlsx"  # clonotype table; its stem is reused for the output file name
avref_csv = "AVref.txt"  # alpha-chain V reference, read below as tab-separated text
bvref_csv = "BVref.txt"  # beta-chain V reference, read below as tab-separated text
ajref_csv = "AJrefv2.txt"  # alpha-chain J reference, read below as tab-separated text
bjref_csv = "BJref.txt"  # beta-chain J reference, read below as tab-separated text

# Function to detect delimiter (not needed for Excel)
def detect_delimiter(file_path):
    """Placeholder that performs no delimiter detection and always returns ``None``.

    The clonotype table is loaded with ``pd.read_excel`` and the reference
    files are read with a fixed tab separator, so ``file_path`` is ignored.
    Nothing in this cell calls the function; it is a candidate for removal.
    """
    return None

# Load the clonotype table from the Excel workbook
MS_tcrs = pd.read_excel(input_excel, sheet_name='adi')  # Adjust sheet_name as needed
print("Initial MS_tcrs:")
display(MS_tcrs.head())

# Show the raw header so stray spaces or formatting problems are visible
print("\nColumn names in the initial DataFrame:")
print(MS_tcrs.columns)

# Normalise the header: strip leading/trailing whitespace from every column name
MS_tcrs.columns = MS_tcrs.columns.str.strip()

# Show the header again after stripping
print("\nColumn names after stripping whitespace:")
print(MS_tcrs.columns)

# Every column the pipeline reads must be present before any processing starts.
# 'clonotype_id' is listed because the TCRCloneID step further down extracts the
# clonotype number from it; without this check a missing column would only
# surface as a KeyError after all of the assembly work had already run.
required_columns = ['TRAV', 'TRBV', 'TRBJ', 'TRAJ', 'CDR3a', 'CDR3b', 'patient_id', 'subset', 'clonotype_id']
missing_columns = [col for col in required_columns if col not in MS_tcrs.columns]

if missing_columns:
    # Nothing below can run without the required columns; report and stop.
    print(f"\nError: Missing columns in the input data - {missing_columns}")
else:
    # Normalise the gene names: TRAV loses any "/" first, then the V/J columns
    # handled here get the "*01" allele suffix appended (TRAJ is handled separately).
    MS_tcrs['TRAV'] = MS_tcrs['TRAV'].str.replace("/", "")
    for gene_col in ('TRBV', 'TRBJ', 'TRAV'):
        MS_tcrs[gene_col] = MS_tcrs[gene_col] + "*01"

    # Update TRAJ column based on conditions
    def update_TRAJ(row):
        """Return the allele-qualified TRAJ name(s) for one clonotype row.

        Args:
            row: Mapping-like row exposing the bare gene name under ``'TRAJ'``.

        Returns:
            A list with ``<gene>*01`` and ``<gene>*02`` for the genes listed
            below, otherwise a single-element list with ``<gene>*01``.
        """
        gene = row['TRAJ']
        # Genes for which both the *01 and *02 alleles are emitted
        two_allele_genes = ('TRAJ13', 'TRAJ15', 'TRAJ23', 'TRAJ24', 'TRAJ32', 'TRAJ37', 'TRAJ47')
        suffixes = ["*01", "*02"] if gene in two_allele_genes else ["*01"]
        return [gene + suffix for suffix in suffixes]

    # Replace each TRAJ value with its list of allele-qualified names
    traj_alleles = MS_tcrs.apply(update_TRAJ, axis=1)
    MS_tcrs['TRAJ'] = traj_alleles

    # One output row per allele: explode the lists, then renumber the rows so
    # the index is a clean 0..n-1 range again
    exploded_rows = MS_tcrs.explode('TRAJ')
    MS_tcrs_expanded = exploded_rows.reset_index(drop=True)

    print("\nExpanded MS_tcrs with both TRAJ versions:")
    display(MS_tcrs_expanded.head())

    # Read the four reference tables; all are assumed to be tab-separated text
    AVref, BVref, AJref, BJref = [
        pd.read_csv(ref_path, sep="\t")
        for ref_path in (avref_csv, bvref_csv, ajref_csv, bjref_csv)
    ]

    # Show the AJref header before it is trimmed to two columns
    print("\nAJref columns:")
    display(AJref.columns)

    # If an overflow column named 'X' is present, append it to TRAJseq, then keep
    # only the gene name and the sequence.
    # NOTE(review): the header recorded for AJref in this notebook's output lists
    # 'Unnamed: 2' / 'Unnamed: 3' rather than 'X', so this branch appears not to
    # fire for that file - confirm whether those columns carry sequence data.
    if 'X' in AJref.columns:
        AJref['TRAJseq'] = AJref['TRAJseq'] + AJref['X']
    AJref = AJref[['AlphaChain_JGene', 'TRAJseq']]
    print("\nUpdated AJref:")
    display(AJref.head())

    # Give every reference table the same key column name as the clonotype table
    AVref = AVref.rename(columns={'AlphaChain_VGene': 'TRAV'})
    BVref = BVref.rename(columns={'BetaChain_VGene': 'TRBV'})
    AJref = AJref.rename(columns={'AlphaChain_JGene': 'TRAJ'})
    BJref = BJref.rename(columns={'BetaChain_JGene': 'TRBJ'})

    # Preview each renamed table
    for ref_label, ref_table in (("AVref", AVref), ("BVref", BVref), ("AJref", AJref), ("BJref", BJref)):
        print(f"\nRenamed {ref_label}:")
        display(ref_table.head())

    # Sequences of mouse constant regions and the furin p2a linker
    # MouseTRBC is appended to every assembled beta chain (column 'Bcomplete').
    MouseTRBC = "EDLRNVTPPKVSLFEPSKAEIANKQKATLVCLARGFFPDHVELSWWVNGKEVHSGVCTDPQAYKESNYSYCLSSRLRVSATFWHNPRNHFRCQVQFHGLSEEDKWPEGSPKPVTQNISAEAWGRADCGITSASYQQGVLSATILYEILLGKATLYAVLVSTLVVMAMVKRKNS"
    # FurinP2A is placed between the completed beta chain and the alpha chain in 'Synthesis'.
    FurinP2A = "RRKRSGSGATNFSLLKQAGDVEENPGP"

    # Attach the reference sequence for each gene segment. Left joins keep every
    # clonotype row even when a gene name has no match in its reference table.
    for ref_table, key_column in ((BVref, "TRBV"), (BJref, "TRBJ"), (AVref, "TRAV"), (AJref, "TRAJ")):
        MS_tcrs_expanded = MS_tcrs_expanded.merge(ref_table, on=key_column, how="left")
    print("\nJoined MS_tcrs_expanded with references:")
    display(MS_tcrs_expanded.head())

    # Drop helper columns if they came along with the references; absent names are ignored
    leftover_columns = ['X.1', 'X']
    MS_tcrs_expanded = MS_tcrs_expanded.drop(columns=leftover_columns, errors='ignore')
    print("\nMS_tcrs_expanded after dropping unnecessary columns:")
    display(MS_tcrs_expanded.head())

    # Keep each V segment from its start through its last "C" (the match is greedy).
    # str.extract returns a one-column DataFrame, so the sequences live in column 0.
    v_through_last_c = r"(.*C)"
    x = MS_tcrs_expanded['TRBVseq'].str.extract(v_through_last_c)
    y = MS_tcrs_expanded['TRAVseq'].str.extract(v_through_last_c)
    print("\nTrimmed V gene segments:")
    display(x.head())
    display(y.head())

    # Last three residues of every CDR3; used afterwards to locate the CDR3 end
    # inside the J reference sequence
    cdrblast3 = MS_tcrs_expanded['CDR3b'].apply(substrRight, args=(3,))
    cdralast3 = MS_tcrs_expanded['CDR3a'].apply(substrRight, args=(3,))
    print("\nLast 3 amino acids of the CDR3 sequences:")
    display(cdrblast3.head())
    display(cdralast3.head())

    # Cut each J reference sequence at the last three residues of its CDR3 and
    # keep what follows. A row is usable only when the J sequence splits into
    # exactly two pieces (the CDR3 tail occurs exactly once).
    def _j_tail(j_seq, cdr3_end):
        """Return the part of ``j_seq`` after ``cdr3_end``, or None when no clean cut exists.

        None is returned when either value is not a string (e.g. NaN left by an
        unmatched left join), when ``cdr3_end`` is empty, or when ``cdr3_end``
        does not occur exactly once in ``j_seq``.
        """
        if not isinstance(j_seq, str) or not isinstance(cdr3_end, str) or not cdr3_end:
            return None
        pieces = j_seq.split(cdr3_end)
        return pieces[1] if len(pieces) == 2 else None

    # Split once per row; cdrblast3/cdralast3 are row-aligned with MS_tcrs_expanded
    b_tails = [_j_tail(j_seq, end) for j_seq, end in zip(MS_tcrs_expanded['TRBJseq'], cdrblast3)]
    a_tails = [_j_tail(j_seq, end) for j_seq, end in zip(MS_tcrs_expanded['TRAJseq'], cdralast3)]

    # Rows without a clean cut get an empty tail so assembly can still proceed
    xx = pd.Series(['' if tail is None else tail for tail in b_tails], index=MS_tcrs_expanded.index, dtype=object)
    yy = pd.Series(['' if tail is None else tail for tail in a_tails], index=MS_tcrs_expanded.index, dtype=object)
    print("\nSplit TCRB J and TRAJ sequences:")
    display(xx.head())
    display(yy.head())

    # Verify the splits: flag every row where either chain had no clean cut
    b2 = pd.Series([tail is None for tail in b_tails], index=MS_tcrs_expanded.index, dtype=bool)
    a2 = pd.Series([tail is None for tail in a_tails], index=MS_tcrs_expanded.index, dtype=bool)
    short = (a2 | b2)
    look = MS_tcrs_expanded[short]
    print("\nSequences needing manual editing if any:")
    display(look)

    # Assemble each chain as: trimmed V segment without its final residue + CDR3 + J tail.
    # Rows flagged in `short` carry an empty J tail here, so their assembled
    # sequences stop at the end of the CDR3.
    v_beta = x[0].str[:-1]
    v_alpha = y[0].str[:-1]
    MS_tcrs_expanded['Bseq'] = v_beta + MS_tcrs_expanded['CDR3b'] + xx
    # The beta chain is completed with the mouse constant region
    MS_tcrs_expanded['Bcomplete'] = MS_tcrs_expanded['Bseq'] + MouseTRBC
    MS_tcrs_expanded['Aseq'] = v_alpha + MS_tcrs_expanded['CDR3a'] + yy
    print("\nFull sequence of alpha and beta chain:")
    display(MS_tcrs_expanded[['Bseq', 'Bcomplete', 'Aseq']].head())

    # Synthesis construct: completed beta chain, linker, then the alpha chain
    synthesis_seq = MS_tcrs_expanded['Bcomplete'] + FurinP2A + MS_tcrs_expanded['Aseq']
    MS_tcrs_expanded['Synthesis'] = synthesis_seq
    print("\nFinal synthesis sequence:")
    display(MS_tcrs_expanded['Synthesis'].head())

    # Build TCRCloneID as <patient_id>_<subset>_TCR_<first run of digits in clonotype_id>.
    # Rows that were expanded into two TRAJ alleles share the same ID, so a
    # "duplicates found" message is expected whenever such rows exist.
    clonotype_number = MS_tcrs_expanded['clonotype_id'].str.extract(r"(\d+)")[0]
    MS_tcrs_expanded['TCRCloneID'] = (
        MS_tcrs_expanded['patient_id'] + "_" + MS_tcrs_expanded['subset'] + "_TCR_" + clonotype_number
    )

    # Mark repeated IDs. With the default keep='first', only the second and later
    # occurrences are True; the first row carrying an ID stays False.
    duplicate_flags = MS_tcrs_expanded['TCRCloneID'].duplicated()
    MS_tcrs_expanded['IsDuplicate'] = duplicate_flags

    if not MS_tcrs_expanded['IsDuplicate'].any():
        print("No duplicate TCRCloneIDs found.")
    else:
        print("Duplicate TCRCloneIDs found!")

    # Derive the output file name from the input workbook's name (extension removed)
    output_file = os.path.splitext(input_excel)[0] + "_assembled_TCRs_for_synthesis.xlsx"

    # Save the full table for manual checking
    MS_tcrs_expanded.to_excel(output_file, index=False)

    # Print the header first, then the preview it announces (previously the
    # header was printed after the table, leaving it dangling at the end)
    print("\nSaved data for manual checking:")
    display(MS_tcrs_expanded.head())


Initial MS_tcrs:


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset
0,1,clonotype1,p1001436-4,TRBV6-2,TRBJ2-3,CASSYWAGRPTDTQYF,TRAV6,TRAJ43,CAQAYNNNDMRF,CD4
1,2,clonotype2,p1001436-4,TRBV20-1,TRBJ2-1,CSARAMGSGVYNEQFF,TRAV23DV6,TRAJ10,CAASWDTGGGNKLTF,CD4
2,3,clonotype3,p1001436-4,TRBV20-1,TRBJ2-1,CSARAMGSGVYNEQFF,TRAV25,TRAJ24,CPSWGKLQF,CD4
3,4,clonotype4,p1001436-4,TRBV4-2,TRBJ2-3,CASSQERAGGSTDTQYF,TRAV6,TRAJ30,CALNRDDKIIF,CD4
4,5,clonotype5,p1035804-1,TRBV5-1,TRBJ2-7,CASKGLAGEYYEQYF,TRAV36DV7,TRAJ53,CAPIGGSNYKLTF,CD4



Column names in the initial DataFrame:
Index(['rank', 'clonotype_id', 'patient_id', 'TRBV', 'TRBJ', 'CDR3b', 'TRAV',
       'TRAJ', 'CDR3a', 'subset'],
      dtype='object')

Column names after stripping whitespace:
Index(['rank', 'clonotype_id', 'patient_id', 'TRBV', 'TRBJ', 'CDR3b', 'TRAV',
       'TRAJ', 'CDR3a', 'subset'],
      dtype='object')

Expanded MS_tcrs with both TRAJ versions:


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset
0,1,clonotype1,p1001436-4,TRBV6-2*01,TRBJ2-3*01,CASSYWAGRPTDTQYF,TRAV6*01,TRAJ43*01,CAQAYNNNDMRF,CD4
1,2,clonotype2,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV23DV6*01,TRAJ10*01,CAASWDTGGGNKLTF,CD4
2,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*01,CPSWGKLQF,CD4
3,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*02,CPSWGKLQF,CD4
4,4,clonotype4,p1001436-4,TRBV4-2*01,TRBJ2-3*01,CASSQERAGGSTDTQYF,TRAV6*01,TRAJ30*01,CALNRDDKIIF,CD4



AJref columns:


Index(['AlphaChain_JGene', 'TRAJseq', 'Unnamed: 2', 'Unnamed: 3'], dtype='object')


Updated AJref:


Unnamed: 0,AlphaChain_JGene,TRAJseq
0,TRAJ1*01,YESITSQLQFGKGTRVSTSPM
1,TRAJ10*01,ILTGGGNKLTFGTGTQLKVELN
2,TRAJ11*01,NSGYSTLTFGKGTMLLVSPD
3,TRAJ12*01,MDSSYKLIFGSGTRLLVRPD
4,TRAJ13*01,NSGGYQKVTFGIGTKLQVIPN



Renamed AVref:


Unnamed: 0,TRAV,TRAVseq
0,TRAV1-1*01,MWGAFLLYVSMKMGGTAGQSLEQPSEVTAVEGAIVQINCTYQTSGF...
1,TRAV1-2*01,MWGVFLLYVSMKMGGTTGQNIDQPTEMTATEGAIVQINCTYQTSGF...
2,TRAV10*01,MKKHLTTFLVILWLYFYRGNGKNQVEQSPQSLIILEGKNCTLQCNY...
3,TRAV11*01,TEKPLGVSFLISSWQLCWVNRLHTLEQSPSFLNIQEGMHAVLNCTY...
4,TRAV12-1*01,MISLRVLLVILWLQLSWVWSQRKEVEQDPGPFNVPEGATVAFNCTY...



Renamed BVref:


Unnamed: 0,TRBV,TRBVseq
0,TRBV1*01,MG*SLHCGVVHCLRLHGYWNYPDTKIPGHSNGE*KDNET*ASGT*F...
1,TRBV10-1*01,MGTRLFFYVALCLLWAGHRDAEITQSPRHKITETGRQVTLACHQTW...
2,TRBV10-1*02,MGTRLFFYVALCLLWAGHRDAEITQSPRHKITETGRQVTLACHQTW...
3,TRBV10-2*01,MGTRLFFYVALCLLWAGHRDAGITQSPRYKITETGRQVTLMCHQTW...
4,TRBV10-3*01,MGTRLFFYVALCLLWTGHMDAGITQSPRHKVTETGTPVTLRCHQTE...



Renamed AJref:


Unnamed: 0,TRAJ,TRAJseq
0,TRAJ1*01,YESITSQLQFGKGTRVSTSPM
1,TRAJ10*01,ILTGGGNKLTFGTGTQLKVELN
2,TRAJ11*01,NSGYSTLTFGKGTMLLVSPD
3,TRAJ12*01,MDSSYKLIFGSGTRLLVRPD
4,TRAJ13*01,NSGGYQKVTFGIGTKLQVIPN



Renamed BJref:


Unnamed: 0,TRBJ,TRBJseq
0,TRBJ1-1*01,NTEAFFGQGTRLTVV
1,TRBJ1-2*01,NYGYTFGSGTRLTVV
2,TRBJ1-3*01,SGNTIYFGEGSWLTVV
3,TRBJ1-4*01,TNEKLFFGSGTQLSVL
4,TRBJ1-5*01,SNQPQHFGDGTRLSIL



Joined MS_tcrs_expanded with references:


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset,TRBVseq,TRBJseq,TRAVseq,TRAJseq
0,1,clonotype1,p1001436-4,TRBV6-2*01,TRBJ2-3*01,CASSYWAGRPTDTQYF,TRAV6*01,TRAJ43*01,CAQAYNNNDMRF,CD4,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NNNDMRFGAGTRLTVKPN
1,2,clonotype2,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV23DV6*01,TRAJ10*01,CAASWDTGGGNKLTF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...,ILTGGGNKLTFGTGTQLKVELN
2,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*01,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKFEFGAGTQVVVTPD
3,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*02,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKLQFGAGTQVVVTPD
4,4,clonotype4,p1001436-4,TRBV4-2*01,TRBJ2-3*01,CASSQERAGGSTDTQYF,TRAV6*01,TRAJ30*01,CALNRDDKIIF,CD4,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NRDDKIIFGKGTRLHILPN



MS_tcrs_expanded after dropping unnecessary columns:


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset,TRBVseq,TRBJseq,TRAVseq,TRAJseq
0,1,clonotype1,p1001436-4,TRBV6-2*01,TRBJ2-3*01,CASSYWAGRPTDTQYF,TRAV6*01,TRAJ43*01,CAQAYNNNDMRF,CD4,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NNNDMRFGAGTRLTVKPN
1,2,clonotype2,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV23DV6*01,TRAJ10*01,CAASWDTGGGNKLTF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...,ILTGGGNKLTFGTGTQLKVELN
2,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*01,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKFEFGAGTQVVVTPD
3,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*02,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKLQFGAGTQVVVTPD
4,4,clonotype4,p1001436-4,TRBV4-2*01,TRBJ2-3*01,CASSQERAGGSTDTQYF,TRAV6*01,TRAJ30*01,CALNRDDKIIF,CD4,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NRDDKIIFGKGTRLHILPN



Trimmed V gene segments:


Unnamed: 0,0
0,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...
1,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
2,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
3,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
4,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...


Unnamed: 0,0
0,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...
1,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...
2,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...
3,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...
4,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...



Last 3 amino acids of the CDR3 sequences:


0    QYF
1    QFF
2    QFF
3    QFF
4    QYF
Name: CDR3b, dtype: object

0    MRF
1    LTF
2    LQF
3    LQF
4    IIF
Name: CDR3a, dtype: object


Split TCRB J and TRAJ sequences:


0    GPGTRLTVL
1    GPGTRLTVL
2    GPGTRLTVL
3    GPGTRLTVL
4    GPGTRLTVL
dtype: object

0    GAGTRLTVKPN
1    GTGTQLKVELN
2               
3    GAGTQVVVTPD
4    GKGTRLHILPN
dtype: object


Sequences needing manual editing if any:


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset,TRBVseq,TRBJseq,TRAVseq,TRAJseq
2,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*01,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKFEFGAGTQVVVTPD
10,10,clonotype10,p1033198-5,TRBV3-1*01,TRBJ1-4*01,CASSQATGMGEKLFF,TRAV13-1*01,TRAJ24*01,CAASASFDSWGKLQF,CD4,MGCRLLCCVVFCLLQAGPLDTAVSQTPKYLVTQMGNDKSIKCEQNL...,TNEKLFFGSGTQLSVL,MTSIRAVFIFLWLQLDLVNGENVEQHPSTLSVQEGDSAVIKCTYSD...,TTDSWGKFEFGAGTQVVVTPD



Full sequence of alpha and beta chain:


Unnamed: 0,Bseq,Bcomplete,Aseq
0,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...
1,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...
2,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...
3,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...
4,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...



Final synthesis sequence:


0    MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...
1    MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
2    MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
3    MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...
4    MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...
Name: Synthesis, dtype: object

Duplicate TCRCloneIDs found!


Unnamed: 0,rank,clonotype_id,patient_id,TRBV,TRBJ,CDR3b,TRAV,TRAJ,CDR3a,subset,TRBVseq,TRBJseq,TRAVseq,TRAJseq,Bseq,Bcomplete,Aseq,Synthesis,TCRCloneID,IsDuplicate
0,1,clonotype1,p1001436-4,TRBV6-2*01,TRBJ2-3*01,CASSYWAGRPTDTQYF,TRAV6*01,TRAJ43*01,CAQAYNNNDMRF,CD4,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NNNDMRFGAGTRLTVKPN,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDM...,p1001436-4_CD4_TCR_1,False
1,2,clonotype2,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV23DV6*01,TRAJ10*01,CAASWDTGGGNKLTF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...,ILTGGGNKLTFGTGTQLKVELN,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,p1001436-4_CD4_TCR_2,False
2,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*01,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKFEFGAGTQVVVTPD,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,p1001436-4_CD4_TCR_3,False
3,3,clonotype3,p1001436-4,TRBV20-1*01,TRBJ2-1*01,CSARAMGSGVYNEQFF,TRAV25*01,TRAJ24*02,CPSWGKLQF,CD4,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,SYNEQFFGPGTRLTVL,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,TTDSWGKLQFGAGTQVVVTPD,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTT...,MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKI...,p1001436-4_CD4_TCR_3,True
4,4,clonotype4,p1001436-4,TRBV4-2*01,TRBJ2-3*01,CASSQERAGGSTDTQYF,TRAV6*01,TRAJ30*01,CALNRDDKIIF,CD4,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,STDTQYFGPGTRLTVL,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,NRDDKIIFGKGTRLHILPN,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHL...,p1001436-4_CD4_TCR_4,False



Saved data for manual checking:
