# GO annotation

In [1]:
import my_functions as fun
import pandas as pd

# Import annotation files

In [2]:
# headers obtained from: https://interproscan-docs.readthedocs.io/en/latest/UserDocs.html#output-formats
headers = ['Protein accession',
           'Sequence MD5 digest', 
           'Sequence length', 
           'Analysis', 
           'Signature accession', 
           'Signature description', 
           'Start location', 
           'Stop location', 
           'Score', 
           'True', 
           'Date',
           'InterPro annotations - accession', 
           'InterPro annotations - description', 
           'GO annotations',
           'to be removed']

In [3]:
# open the InterProScan .tsv file and read into a dataframe
df_WT_ips = pd.read_csv(fun.get_file_path('NZ_CP022438_S.peucetius_subsp.caesius_ATCC_27952_chromosome_proteins.ipscan.tsv'), sep="\t", header=None)
df_WT_ips.columns = headers
df_WT_ips = df_WT_ips.drop(labels='to be removed', axis=1)  # remove last column that does not contain data

In [69]:
df_WT_ips_ed = df_WT_ips[df_WT_ips["GO annotations"].notna()]

In [80]:
df_WT_ips_ed[df_WT_ips_ed["GO annotations"].str.contains("GO:0003700")]

Unnamed: 0,Protein accession,Sequence MD5 digest,Sequence length,Analysis,Signature accession,Signature description,Start location,Stop location,Score,True,Date,InterPro annotations - accession,InterPro annotations - description,GO annotations
252,CGZ69_RS29270,9ac09c4ac235a9a81be8abc318e55ca0,154,ProSiteProfiles,PS50995,MarR-type HTH domain profile.,18,148,25.348,T,07-11-2020,IPR000835,MarR-type HTH domain,GO:0003700|GO:0006355
253,CGZ69_RS29270,9ac09c4ac235a9a81be8abc318e55ca0,154,Pfam,PF01047,MarR family,45,103,1.3E-16,T,07-11-2020,IPR000835,MarR-type HTH domain,GO:0003700|GO:0006355
256,CGZ69_RS29270,9ac09c4ac235a9a81be8abc318e55ca0,154,SMART,SM00347,marrlong4,38,136,7.3E-23,T,07-11-2020,IPR000835,MarR-type HTH domain,GO:0003700|GO:0006355
535,CGZ69_RS14030,994b7143fc836024ac873b0459ff0c05,301,Pfam,PF04542,Sigma-70 region 2,80,149,1.5E-15,T,07-11-2020,IPR007627,RNA polymerase sigma-70 region 2,GO:0003700|GO:0006352|GO:0006355
536,CGZ69_RS14030,994b7143fc836024ac873b0459ff0c05,301,PRINTS,PR00046,Major sigma-70 factor signature,104,117,7.0E-12,T,07-11-2020,IPR000943,RNA polymerase sigma-70,GO:0003700|GO:0006352|GO:0006355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46974,CGZ69_RS27660,83c92fb6a4d7164e2eaad5b2c47a298c,225,PRINTS,PR00035,GntR bacterial regulatory protein HTH signature,40,56,1.1E-7,T,07-11-2020,IPR000524,"Transcription regulator HTH, GntR",GO:0003700|GO:0006355
46975,CGZ69_RS27660,83c92fb6a4d7164e2eaad5b2c47a298c,225,ProSiteProfiles,PS50949,GntR-type HTH domain profile.,2,69,19.588,T,07-11-2020,IPR000524,"Transcription regulator HTH, GntR",GO:0003700|GO:0006355
47049,CGZ69_RS22120,7976bcef5a167557cb31fd48ec153a4b,123,ProSiteProfiles,PS50995,MarR-type HTH domain profile.,1,102,20.424,T,07-11-2020,IPR000835,MarR-type HTH domain,GO:0003700|GO:0006355
47050,CGZ69_RS22120,7976bcef5a167557cb31fd48ec153a4b,123,SMART,SM00347,marrlong4,1,90,7.0E-6,T,07-11-2020,IPR000835,MarR-type HTH domain,GO:0003700|GO:0006355


In [4]:
# open the Panther .tsv file and read into a dataframe
df_WT_pthr = pd.read_csv(fun.get_file_path('NZ_CP022438_S.peucetius_subsp.caesius_ATCC_27952_chromosome_proteins.panther15.0.out.tsv'), sep="\t", header=None)
df_WT_pthr.columns = ["gene_code", "PANTHER_accession", "PANTHER_family", "HMM_Evalue-score", "HMM_bitscore", "alignment_range"]

In [5]:
df_WT_pthr.head()

Unnamed: 0,gene_code,PANTHER_accession,PANTHER_family,HMM_Evalue-score,HMM_bitscore,alignment_range
0,CGZ69_RS17810,PTHR43335,"ABC TRANSPORTER, ATP-BINDING PROTEIN",3.6e-79,264.9,1-311
1,CGZ69_RS27660,PTHR43537:SF34,PYRUVATE DEHYDROGENASE COMPLEX REPRESSOR,2.7e-41,140.4,6-201
2,CGZ69_RS25500,PTHR42891,"D-GLYCERO-BETA-D-MANNO-HEPTOSE-1,7-BISPHOSPHAT...",4e-40,135.9,6-195
3,CGZ69_RS19730,PTHR35007:SF4,INTEGRAL MEMBRANE PROTEIN-RELATED,4.2999999999999997e-57,192.1,10-282
4,CGZ69_RS15885,PTHR42964,ENOYL-COA HYDRATASE,1.1e-77,259.5,3-242


In [6]:
# How many genes have been annotated?
number = len(df_WT_pthr["gene_code"].unique())
("Panther codes identified for {} genes ({:.2g} % of all genes)").format(number, number/7129*100)

'Panther codes identified for 4932 genes (69 % of all genes)'

In [7]:
# Initialize the output dataframe for panther analysis
pthr_input = df_WT_pthr[["gene_code", "PANTHER_accession"]]
pthr_input = pthr_input.sort_values("gene_code")

In [8]:
# Initialize the output dataframe for panther analysis
pthr_input = df_WT_pthr.sort_values("gene_code")

## Import genbank file

In [9]:
import my_functions as fun
from Bio.Seq import Seq
from Bio import SeqIO

In [10]:
record = SeqIO.read(fun.get_file_path('sequence_NZ_CP0224381.gb'), "genbank")

In [11]:
# check the first few of features
gene_codes_NCBI = []

for i in range(len(record.features)):
    feat = record.features[i]
    if feat.type == "CDS":
        if "protein_id" in feat.qualifiers:
            gene_codes_NCBI.append(feat.qualifiers["locus_tag"][0])

In [12]:
len(gene_codes_NCBI)

6403

In [13]:
# Add genes that do not have a panther term to the pthr_input file
for gene_code in gene_codes_NCBI:
    if gene_code not in list(pthr_input["gene_code"]):
        pthr_input = pthr_input.append(pd.DataFrame({"gene_code": [gene_code], "PANTHER_accession": ["unclassified"]}))

In [14]:
# Initialize the output dataframe for panther analysis
pthr_input = pthr_input.sort_values("gene_code")

In [15]:
pthr_input

Unnamed: 0,gene_code,PANTHER_accession,PANTHER_family,HMM_Evalue-score,HMM_bitscore,alignment_range
0,CGZ69_RS00005,unclassified,,,,
3619,CGZ69_RS00010,PTHR43046:SF9,ADP-RIBOSE PYROPHOSPHATASE YJHB-RELATED,3.900000e-33,112.9,21-143
0,CGZ69_RS00015,unclassified,,,,
0,CGZ69_RS00020,unclassified,,,,
0,CGZ69_RS00035,unclassified,,,,
...,...,...,...,...,...,...
3878,CGZ69_RS37030,PTHR35004,TRANSPOSASE RV3428C-RELATED,1.300000e-07,29.3,36-132
4578,CGZ69_RS37035,PTHR44103,PROPROTEIN CONVERTASE P,9.300000e-24,82.1,366-595
0,CGZ69_RS37040,unclassified,,,,
4820,CGZ69_RS37045,PTHR12714:SF19,PROTEIN-S-ISOPRENYLCYSTEINE O-METHYLTRANSFERASE,1.500000e-05,23.3,7-48


# Select interesting genes

We can either use the file with all genes. However, we cannot filter out the mutations that occured further away than the promoter region. Therefore, we will use the annotated file. However, this means that the mutations are not ordered by gene. 

In [16]:
# Import files from cwd
df_genes = pd.read_csv("df_genes.csv", delimiter=";", index_col=False, low_memory=False)
df_ann = pd.read_csv("df_ann.csv", delimiter=";", index_col=False, low_memory=False)

In [17]:
def panther_count(list_genes: list):
    """This function analyses the panther terms of the list of genes provided."""
    
    counter = 0

    for gene_code in list_genes:
        
        if gene_code in list(df_WT_pthr["gene_code"]):
            counter += 0    
        
        else:
            counter += 1
    
    print("The number of genes:", len(list_genes))
    print("The number of genes that have a panther code:", len(list_genes) - counter)
    print("The number of genes that do not have a panther code:", counter) 

In [18]:
# Total genes affected by a mutation
list_total = list(df_genes["TranscriptId"])

print("Number of genes selected: {}".format(len(df_genes)))
print(f"Number of genes selected: {len(df_genes)}")

Number of genes selected: 5625
Number of genes selected: 5625


In [19]:
# How many genes have a panther code of all genes that are affected by a mutation?
panther_count(list_total)

The number of genes: 5625
The number of genes that have a panther code: 3950
The number of genes that do not have a panther code: 1675


In [20]:
# All interesting mutations (high, moderate impact or within putative promoter regions)
df_ann_interesting = df_ann[(df_ann["Annotation_Impact"] == "HIGH") | 
                            (df_ann["Annotation_Impact"] == "MODERATE") | 
                            ((df_ann["Annotation_Impact"] == "MODIFIER") & (df_ann["Distance"] < 300))]

list_interesting = list(df_ann_interesting["Gene_ID"].unique())

print("Number of mutations selected: {}".format(len(df_ann_interesting)))
print("Number of genes selected: {}".format(len(list_interesting)))

Number of mutations selected: 1279
Number of genes selected: 867


In [21]:
505/867

0.5824682814302191

In [22]:
# How many genes have a panther code of all genes are selected as interesting?
panther_count(list_interesting)

The number of genes: 867
The number of genes that have a panther code: 505
The number of genes that do not have a panther code: 362


In [23]:
pthr_input

Unnamed: 0,gene_code,PANTHER_accession,PANTHER_family,HMM_Evalue-score,HMM_bitscore,alignment_range
0,CGZ69_RS00005,unclassified,,,,
3619,CGZ69_RS00010,PTHR43046:SF9,ADP-RIBOSE PYROPHOSPHATASE YJHB-RELATED,3.900000e-33,112.9,21-143
0,CGZ69_RS00015,unclassified,,,,
0,CGZ69_RS00020,unclassified,,,,
0,CGZ69_RS00035,unclassified,,,,
...,...,...,...,...,...,...
3878,CGZ69_RS37030,PTHR35004,TRANSPOSASE RV3428C-RELATED,1.300000e-07,29.3,36-132
4578,CGZ69_RS37035,PTHR44103,PROPROTEIN CONVERTASE P,9.300000e-24,82.1,366-595
0,CGZ69_RS37040,unclassified,,,,
4820,CGZ69_RS37045,PTHR12714:SF19,PROTEIN-S-ISOPRENYLCYSTEINE O-METHYLTRANSFERASE,1.500000e-05,23.3,7-48


## Create input for panther analysis

In [24]:
# Remove duplicates with multiple accession codes from dataframe:
# Note this is absolutely not robust and really needs to be redone!!!!
df_duplicates = pthr_input[pthr_input['gene_code'].duplicated(keep=False)]
idx_duplicates = pthr_input[pthr_input['gene_code'].duplicated(keep=False)].index.values.tolist()

pthr_input = pthr_input.drop(idx_duplicates[::2]) # remove every other row.... NEEDS TO BE REDONE

In [25]:
def panther_input(list_genes):
    """This function returns a dictionary of the panther terms of the list of genes provided."""
    
    counter = 0
    dict_genes = {gene_code: 0 for gene_code in pthr_input["gene_code"]}

    for gene_code in list_genes:
        
        if gene_code in dict_genes:
            dict_genes[gene_code] = 1
        
        else:

            counter += 1
            
    return dict_genes

In [26]:
# Find the panther terms of each gene in the list and add to the pthr input dataframe
pthr_input_quan = panther_input(list_interesting)

In [27]:
pthr_input.to_csv("pthr_input_ref.txt", index=False, sep = '\t', header="none")

In [28]:
pthr_input["quantification"] = pthr_input["gene_code"].apply(lambda x: pthr_input_quan[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pthr_input["quantification"] = pthr_input["gene_code"].apply(lambda x: pthr_input_quan[x])


In [29]:
pthr_input_ana = pthr_input[pthr_input["quantification"] == 1]

In [30]:
pthr_input_ana.to_csv("pthr_input_ana.txt", index=False, sep = '\t', header="none")

In [31]:
len(pthr_input["gene_code"].unique())

6403

In [32]:
len(pthr_input_ana["gene_code"].unique())

600

In [33]:
pthr_input

Unnamed: 0,gene_code,PANTHER_accession,PANTHER_family,HMM_Evalue-score,HMM_bitscore,alignment_range,quantification
0,CGZ69_RS00005,unclassified,,,,,0
3619,CGZ69_RS00010,PTHR43046:SF9,ADP-RIBOSE PYROPHOSPHATASE YJHB-RELATED,3.900000e-33,112.9,21-143,0
0,CGZ69_RS00015,unclassified,,,,,0
0,CGZ69_RS00020,unclassified,,,,,0
0,CGZ69_RS00035,unclassified,,,,,0
...,...,...,...,...,...,...,...
3878,CGZ69_RS37030,PTHR35004,TRANSPOSASE RV3428C-RELATED,1.300000e-07,29.3,36-132,0
4578,CGZ69_RS37035,PTHR44103,PROPROTEIN CONVERTASE P,9.300000e-24,82.1,366-595,0
0,CGZ69_RS37040,unclassified,,,,,0
4820,CGZ69_RS37045,PTHR12714:SF19,PROTEIN-S-ISOPRENYLCYSTEINE O-METHYLTRANSFERASE,1.500000e-05,23.3,7-48,0


In [48]:
families = list(pthr_input["PANTHER_family"].unique())

for family in families:
    if type(family) is str:
        if "BINDING" in family:
            print(family)

HIGH-AFFINITY BRANCHED-CHAIN AMINO ACID TRANSPORT ATP-BINDING PROTEIN LIVF
BRANCHED-CHAIN AMINO ACID TRANSPORT ATP-BINDING PROTEIN-RELATED
PENICILLIN-BINDING PROTEIN 1A
ARABINOSE-BINDING PROTEIN-RELATED
DNA-BINDING PROTEIN
IRON(3+)-HYDROXAMATE IMPORT ATP-BINDING PROTEIN FHUC
PEBP (PHOSPHATIDYLETHANOLAMINE-BINDING PROTEIN) FAMILY PROTEIN
FAD-BINDING, PUTATIVE (AFU_ORTHOLOGUE AFUA_6G07600)-RELATED
ABC-TRANSPORT SYSTEM ATP BINDING PROTEIN-RELATED
D-ALLOSE-BINDING PERIPLASMIC PROTEIN-RELATED
POSSIBLE DNA-BINDING PROTEIN
TREHALOSE IMPORT ATP-BINDING PROTEIN SUGC
ABC TRANSPORTER ATP-BINDING PROTEIN SCO5958-RELATED
ALDEHYDE OXIDOREDUCTASE IRON-SULFUR-BINDING SUBUNIT PAOA
ATP-BINDING CASSETTE SUB-FAMILY B MEMBER 7, MITOCHONDRIAL
ABC TRANSPORTER GLUTAMINE-BINDING PROTEIN GLNH
ATP/GTP-BINDING PROTEIN-RELATED
NAD(P)-BINDING PROTEIN YBJT-RELATED
FE(3+) DICITRATE TRANSPORT ATP-BINDING PROTEIN FECE-RELATED
MULTIDRUG RESISTANCE-LIKE ATP-BINDING PROTEIN MDLB
FAD/NAD(P)-BINDING OXIDOREDUCTASE FAMILY PR

In [42]:
type("hoi")

str