# Sequencing G001

In [1]:
from pathlib import Path
import glob

import pandas as pd
from Bio.Seq import Seq
from Bio import SeqIO

import my_functions as fun
pd.set_option('display.max_columns', 25)

## Load SNP overview

In this dataframe you can find which genes of the WT are mutated in G001, and e.g. how severe the mutation is.

In [2]:
# open .txt file and read text 
f = open(fun.get_file_path('P1_bowtie2_20201011.variants-final.Refseq.snpEff_genes.txt'), 'r')
content = f.read()
f.close()

# split text into lines and tabs
output = content.split('\n')
output = [n.split('\t') for n in output]

# discard first row, does not contain data
del output[0]

# create dataframe and identify first row as column names
df_genes = pd.DataFrame(output)
df_genes.columns = df_genes.iloc[0]
df_genes = df_genes.drop(0)
df_genes = df_genes[:-1]

In [3]:
df_genes


Unnamed: 0,#GeneName,GeneId,TranscriptId,BioType,variants_impact_HIGH,variants_impact_LOW,variants_impact_MODERATE,variants_impact_MODIFIER,variants_effect_conservative_inframe_deletion,variants_effect_conservative_inframe_insertion,variants_effect_disruptive_inframe_deletion,variants_effect_downstream_gene_variant,variants_effect_frameshift_variant,variants_effect_missense_variant,variants_effect_splice_region_variant,variants_effect_stop_gained,variants_effect_stop_lost,variants_effect_synonymous_variant,variants_effect_upstream_gene_variant
1,CGZ69_RS00005,CGZ69_RS00005,CGZ69_RS00005,protein_coding,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
2,CGZ69_RS00010,CGZ69_RS00010,CGZ69_RS00010,protein_coding,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
3,CGZ69_RS00015,CGZ69_RS00015,CGZ69_RS00015,protein_coding,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
4,CGZ69_RS00020,CGZ69_RS00020,CGZ69_RS00020,protein_coding,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
5,CGZ69_RS00030,CGZ69_RS00030,CGZ69_RS00030,protein_coding,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5621,yidC,CGZ69_RS13080,CGZ69_RS13080,protein_coding,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
5622,yidC,CGZ69_RS18120,CGZ69_RS18120,protein_coding,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0
5623,yidD,CGZ69_RS18125,CGZ69_RS18125,protein_coding,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
5624,zapE,CGZ69_RS29290,CGZ69_RS29290,protein_coding,0,0,1,7,0,0,0,5,0,1,0,0,0,0,2


In [4]:
# modify the dataframe with all genes, to be able to search more easily
df_genes["GeneId"] = df_genes["GeneId"].str.replace("CGZ69_RS", "")
df_genes["GeneId"] = pd.to_numeric(df_genes["GeneId"])
df_genes["variants_impact_HIGH"] = pd.to_numeric(df_genes["variants_impact_HIGH"])
df_genes["variants_impact_MODERATE"] = pd.to_numeric(df_genes["variants_impact_MODERATE"])
df_genes["variants_impact_LOW"] = pd.to_numeric(df_genes["variants_impact_LOW"])
df_genes["variants_impact_MODIFIER"] = pd.to_numeric(df_genes["variants_impact_MODIFIER"])

In [5]:
df_genes.head()

Unnamed: 0,#GeneName,GeneId,TranscriptId,BioType,variants_impact_HIGH,variants_impact_LOW,variants_impact_MODERATE,variants_impact_MODIFIER,variants_effect_conservative_inframe_deletion,variants_effect_conservative_inframe_insertion,variants_effect_disruptive_inframe_deletion,variants_effect_downstream_gene_variant,variants_effect_frameshift_variant,variants_effect_missense_variant,variants_effect_splice_region_variant,variants_effect_stop_gained,variants_effect_stop_lost,variants_effect_synonymous_variant,variants_effect_upstream_gene_variant
1,CGZ69_RS00005,5,CGZ69_RS00005,protein_coding,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
2,CGZ69_RS00010,10,CGZ69_RS00010,protein_coding,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
3,CGZ69_RS00015,15,CGZ69_RS00015,protein_coding,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
4,CGZ69_RS00020,20,CGZ69_RS00020,protein_coding,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
5,CGZ69_RS00030,30,CGZ69_RS00030,protein_coding,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


## Load SNP annotation

In this dataframe you can look up what type of mutation has occured in each gene.

In [6]:
# open the .tsv file and read into a dataframe
df_ann = pd.read_csv(fun.get_file_path("P1_bowtie2_20201011.variants-final.Refseq.ann.tsv"), sep="\t")
df_ann = df_ann.drop(labels='Unnamed: 5', axis=1)  # remove last column that does not contain data

In [7]:
# create dataframe of all genes that are affected by each mutation 
df_ann_headers = df_ann['ANN'].str.split(', ', expand=True)

In [8]:
# obtain the annotation column IDs from the .vcf file. 
with open(fun.get_file_path("P1_bowtie2_20201011.variants-final.Refseq.ann.vcf")) as file:
    while line := file.readline(): 
        if line.startswith("##INFO=<ID=ANN"):  # find the row with annotation IDs
            ann_ids_raw = line
            break

ann_ids = ann_ids_raw.split("'")[1].split(' | ')  # format annotation IDs

In [9]:
# expand data of first gene that is affected by the mutation
df_ann[ann_ids] = df_ann_headers[0].str.split('|', expand=True)

In [10]:
# modify the dataframe with all genes, to be able to search more easily
df_ann["Distance"] = pd.to_numeric(df_ann["Distance"])

In [11]:
df_ann.head()

Unnamed: 0,POS,REF,ALT,QUAL,ANN,Allele,Annotation,Annotation_Impact,Gene_Name,Gene_ID,Feature_Type,Feature_ID,Transcript_BioType,Rank,HGVS.c,HGVS.p,cDNA.pos / cDNA.length,CDS.pos / CDS.length,AA.pos / AA.length,Distance,ERRORS / WARNINGS / INFO
0,4,AG,AGAGCGGGTAAGAAG,96.0,AGAGCGGGTAAGAAG|upstream_gene_variant|MODIFIER...,AGAGCGGGTAAGAAG,upstream_gene_variant,MODIFIER,CGZ69_RS00010,CGZ69_RS00010,transcript,CGZ69_RS00010,protein_coding,,c.-1277_-1276insAGCGGGTAAGAAG,,,,,1276.0,
1,48701,C,T,225.0,T|missense_variant|MODERATE|CGZ69_RS00225|CGZ6...,T,missense_variant,MODERATE,CGZ69_RS00225,CGZ69_RS00225,transcript,CGZ69_RS00225,protein_coding,1/1,c.371G>A,p.Arg124Lys,371/748,371/748,124/248,,WARNING_TRANSCRIPT_INCOMPLETE
2,56021,C,T,225.0,T|missense_variant|MODERATE|CGZ69_RS00245|CGZ6...,T,missense_variant,MODERATE,CGZ69_RS00245,CGZ69_RS00245,transcript,CGZ69_RS00245,protein_coding,1/1,c.1798G>A,p.Glu600Lys,1798/6354,1798/6354,600/2117,,
3,83126,G,A,225.0,A|missense_variant|MODERATE|CGZ69_RS36435|CGZ6...,A,missense_variant,MODERATE,CGZ69_RS36435,CGZ69_RS36435,transcript,CGZ69_RS36435,protein_coding,1/1,c.13G>A,p.Ala5Thr,13/153,13/153,5/50,,
4,100949,C,G,225.0,G|missense_variant|MODERATE|cas7e|CGZ69_RS0044...,G,missense_variant,MODERATE,cas7e,CGZ69_RS00445,transcript,CGZ69_RS00445,protein_coding,1/1,c.1130G>C,p.Gly377Ala,1130/1155,1130/1155,377/384,,WARNING_TRANSCRIPT_NO_START_CODON


## SCO annotations

In [12]:
# open .txt file and read text 
f = open(fun.get_file_path('NZ_CP022438_S.peucetius_subsp.caesius_ATCC_27952_chromosome_proteins_toSCO.tsv_corr_e0.001_cover60.tsv'), 'r')
content = f.read()
f.close()

# split text into lines and tabs
output = content.splitlines()
output = [n.split('\t') for n in output]

# create dataframe and identify first row as column names
df_SCO = pd.DataFrame(output)
df_SCO.columns = df_SCO.iloc[0]
df_SCO = df_SCO.drop(0)
df_SCO = df_SCO[:-1]
df_SCO[["SCO", "SCO_function"]] = df_SCO["SCO"].str.split(" ", 1, expand=True)
df_SCO[["S.peucetius", "SPE_code"]] = df_SCO["S.peucetius"].str.split(" ", 1, expand=True)

In [13]:
df_SCO

Unnamed: 0,S.peucetius,SCO,Coverage,SCO_function,SPE_code
1,CGZ69_RS00005,NotFound,0%,,
2,CGZ69_RS00010,SCO1013,95%,mut-like protein,
3,CGZ69_RS00015,NotFound,0%,,
4,CGZ69_RS00020,NotFound,0%,,
5,CGZ69_RS00035,NotFound,0%,,
...,...,...,...,...,...
6398,CGZ69_RS36200,NotFound,0%,,
6399,CGZ69_RS36205,SCO5327,99%,transposase,
6400,CGZ69_RS36210,SCO0768,106%,lipoprotein,
6401,CGZ69_RS36220,NotFound,0%,,


In [14]:
df_SCO = df_SCO.sort_values("S.peucetius")

In [15]:
df_SCO.to_csv("df_SCO.csv", index=False, sep = ';')

In [16]:
# Create a dictionary of SCO numbers
dict_SCO = {gene_code: "pseudo_gene" for gene_code in df_genes["TranscriptId"]}

for gene_code in df_SCO["S.peucetius"]:
    dict_SCO[gene_code] = df_SCO[df_SCO["S.peucetius"] == gene_code]["SCO"].iloc[0]

In [17]:
# Add SCO number to df_ann and df_genes
df_ann["SCO"] = df_ann["Gene_ID"].apply(lambda x: dict_SCO[x])
df_genes["SCO"] = df_genes["TranscriptId"].apply(lambda x: dict_SCO[x])

In [18]:
df_genes.to_csv("df_genes.csv", index=False, sep = ';')

In [19]:
df_ann.to_csv("df_ann.csv", index=False, sep = ';')

## Add AntiSMASH results

In [20]:
# Collect filenames of all gbk files with antismash clusters in a list
list_files = []

for path in Path('C:\\Users\\mandy\\Documents\\Programming\\G001_sequencing\\input\\').rglob('*region*'):
    list_files.append(path)

In [21]:
# Collect all antismash clusters in a list
list_clusters = [filename.name.replace("NZ_CP022438.1.", "").replace(".gbk", "") for filename in list_files]

In [22]:
# Extract all the genes (CDS actually) from each cluster and collect in dictionary

# initialize dictionary to store all genes for each cluster
dict_antismash = {gene_code: "" for gene_code in df_genes["TranscriptId"]}
cluster_product = "?"

# TODO: change to enumerate
for num in range(len(list_files)):
    
    # open the file with BioPython
    record = SeqIO.read(list_files[num], "genbank")

    # initialize a list to collect all genes in the cluster
    list_genes = []

    for feat_num in range(len(record.features)):
        
        feat = record.features[feat_num]
        
        if feat.type == "cand_cluster":
            cluster_product = feat.qualifiers["product"][0]
            break
                
    for feat_num in range(len(record.features)):
        
        # collect all gene names in current record in the current list
        feat = record.features[feat_num]

        if feat.type == "CDS":
            gene = feat.qualifiers["locus_tag"][0]
            list_genes.append(gene)
            dict_antismash[gene] = list_clusters[num] + ', ' + cluster_product

In [23]:
del dict_antismash['allorf_3591657_3591773']

KeyError: 'allorf_3591657_3591773'

In [None]:
df_genes["antiSMASH"] = df_genes["TranscriptId"].apply(lambda x: dict_antismash[x])

## Check what type of mutations have occured in doxorubicin cluster

This section deals with the genes in the doxorubicin cluster. Which genes are affected and with what kind of mutations? Besides that the genes are annotated with their gene code from a txt file.

### Select only the genes in the DXR cluster

In [25]:
# select only the genes of the doxorubicin cluster
df_DXR = df_genes.loc[(df_genes["GeneId"] > 24485) & (df_genes["GeneId"] < 24680)]

In [26]:
# For each gene, find which mutations affect it and collect in a dataframe
df_old = df_ann[df_ann['ANN'].str.contains('CGZ69_RS24500')]

for gene_id in df_DXR["TranscriptId"]:
    df_new = df_ann[df_ann['Gene_ID'].str.contains(gene_id)]
    df_ann_DXR = pd.concat([df_old, df_new])
    df_old = df_ann_DXR

In [27]:
# Remove duplicates and remove mutations outside genes: modifiers
df_ann_DXR = df_ann_DXR.sort_index().drop_duplicates()

### Add gene names to DXR genes

In [28]:
# Extract gene names from txt file and create dictionary
with open(fun.get_file_path("DXR_cluster_ANN.txt"), 'r') as file:
    x = file.read().splitlines()

dict_DXR_annotation = {}

for line in x:
    row = line.split("\t")
    dict_DXR_annotation[row[1]] = row[0]

In [29]:
# Make a list of genes ordered like the dataframe and replace current gene names
genes_ordered = []

for geneID in df_ann_DXR["Gene_ID"]:
    genes_ordered.append(dict_DXR_annotation[geneID])

df_ann_DXR["Gene_Name"] = genes_ordered

In [30]:
# Extract usefull data from dataframe
df_ann_DXR = df_ann_DXR[["Gene_ID", "Gene_Name", "Annotation", "Annotation_Impact", "HGVS.c", "CDS.pos / CDS.length", "HGVS.p", "AA.pos / AA.length"]]

### Find interesting genes in DXR cluster

In [31]:
df_DXR_coding = df_DXR.loc[(df_genes["variants_impact_HIGH"] > 0) | (df_genes["variants_impact_MODERATE"] > 0) | (df_genes["variants_impact_LOW"] > 0)]
df_DXR_coding_high_mod = df_DXR.loc[(df_genes["variants_impact_HIGH"] > 0) | (df_genes["variants_impact_MODERATE"] > 0)]

df_ann_DXR_coding = df_ann_DXR[df_ann_DXR["Annotation_Impact"] != "MODIFIER"]
df_ann_DXR_promoter = df_ann_DXR.loc[(df_ann_DXR["Annotation"] == "upstream_gene_variant") & (df_ann["Distance"] < 300)]

print("Total number of genes in the DXR cluster:", len(df_DXR))
print("Genes in the DXR cluster with a mutation in coding sequence:", len(df_DXR_coding))
print("Genes in the DXR cluster with a high or moderate mutation in coding sequence:", len(df_DXR_coding_high_mod))

print("Total number of mutations in the DXR cluster:", len(df_ann_DXR))
print("Mutations in the DXR cluster within a coding sequence:", len(df_ann_DXR_coding))
print("Mutations in the DXR cluster in a promoter sequence (300 nt upstream):", len(df_ann_DXR_promoter))

Total number of genes in the DXR cluster: 38
Genes in the DXR cluster with a mutation in coding sequence: 9
Genes in the DXR cluster with a high or moderate mutation in coding sequence: 5
Total number of mutations in the DXR cluster: 15
Mutations in the DXR cluster within a coding sequence: 11
Mutations in the DXR cluster in a promoter sequence (300 nt upstream): 0


In [32]:
# Genes in the DXR cluster with a high or moderate mutation in coding sequence
df_ann_DXR[(df_ann_DXR["Annotation_Impact"] == "HIGH") | (df_ann_DXR["Annotation_Impact"] == "MODERATE")]

Unnamed: 0,Gene_ID,Gene_Name,Annotation,Annotation_Impact,HGVS.c,CDS.pos / CDS.length,HGVS.p,AA.pos / AA.length
1478,CGZ69_RS24490,drrC,missense_variant,MODERATE,c.1631C>T,1631/2295,p.Pro544Leu,544/764
1479,CGZ69_RS24490,drrC,missense_variant,MODERATE,c.1670C>T,1670/2295,p.Thr557Ile,557/764
1480,CGZ69_RS24500,dnrS,missense_variant,MODERATE,c.67G>A,67/1296,p.Ala23Thr,23/431
1481,CGZ69_RS24515,dnrK,missense_variant,MODERATE,c.19G>A,19/1071,p.Val7Ile,7/356
1487,CGZ69_RS24570,dnrH,missense_variant,MODERATE,c.1100G>A,1100/1332,p.Gly367Asp,367/443
1491,CGZ69_RS24645,drrD,frameshift_variant,HIGH,c.1068delG,1068/1191,p.Gly358fs,356/396


In [33]:
# Genes in the DXR cluster oustide the coding sequence
df_ann_DXR[df_ann_DXR["Annotation_Impact"] == "MODIFIER"]

Unnamed: 0,Gene_ID,Gene_Name,Annotation,Annotation_Impact,HGVS.c,CDS.pos / CDS.length,HGVS.p,AA.pos / AA.length
1484,CGZ69_RS24535,dpsC,upstream_gene_variant,MODIFIER,c.-4950T>G,,,
1485,CGZ69_RS24535,dpsC,upstream_gene_variant,MODIFIER,c.-4954T>G,,,
1489,CGZ69_RS24565,dnrE,upstream_gene_variant,MODIFIER,c.-3908G>A,,,
1492,CGZ69_RS24650,drrB,upstream_gene_variant,MODIFIER,c.-4128_-4118delCCCTCCCGGTA,,,


## Check mutations in other genes

### How many genes are affected and by what type of mutations?

In [34]:
# Which genes are affected by the mutations?
total_genes = 7129

df_genes_affected = df_genes.loc[(df_genes["variants_impact_HIGH"] > 0) | 
                                 (df_genes["variants_impact_MODERATE"] > 0) | 
                                 (df_genes["variants_impact_LOW"] > 0) |
                                 (df_genes["variants_impact_MODIFIER"] > 0)]

df_genes_coding = df_genes[(df_genes["variants_impact_MODIFIER"] == 0)]

df_genes_surrounding = df_genes.loc[(df_genes["variants_impact_HIGH"] == 0) & 
                                    (df_genes["variants_impact_MODERATE"] == 0) & 
                                    (df_genes["variants_impact_LOW"] == 0)]

df_genes_cod_sur = df_genes.loc[((df_genes["variants_impact_HIGH"] > 0) | (df_genes["variants_impact_MODERATE"] > 0) | (df_genes["variants_impact_LOW"] > 0)) & 
                                (df_genes["variants_impact_MODIFIER"] > 0)]

print(("Total number of genes: {}").format(total_genes))
print(("Genes affected by a mutation: {} ({:.2g} % of all genes)").format(len(df_genes_coding), len(df_genes_affected)/total_genes*100))
print(("Genes with mutation inside coding region only: {} ({:.2g} % of all genes)").format(len(df_genes_coding), len(df_genes_coding)/total_genes*100))
print(("Genes with mutation outside coding region only: {} ({:.2g} % of all genes)").format(len(df_genes_surrounding), len(df_genes_surrounding)/total_genes*100))
print(("Genes with mutation inside and outside coding region: {} ({:.2g} % of all genes)").format(len(df_genes_cod_sur), len(df_genes_cod_sur)/total_genes*100))

Total number of genes: 7129
Genes affected by a mutation: 134 (79 % of all genes)
Genes with mutation inside coding region only: 134 (1.9 % of all genes)
Genes with mutation outside coding region only: 4583 (64 % of all genes)
Genes with mutation inside and outside coding region: 908 (13 % of all genes)


In [35]:
# What type of mutations have occurred?
total_mutations = 2221

df_ann_high = df_ann[df_ann["Annotation_Impact"] == "HIGH"]
df_ann_high_stop = df_ann_high[df_ann_high["Annotation"].str.contains("stop")]
df_ann_high_frameshift = df_ann_high[df_ann_high["Annotation"].str.contains("frameshift")]
df_ann_moderate = df_ann[df_ann["Annotation_Impact"] == "MODERATE"]
df_ann_moderate_disr = df_ann_moderate[df_ann_moderate["Annotation"].str.contains("disruptive")]
df_ann_moderate_cons = df_ann_moderate[df_ann_moderate["Annotation"].str.contains("conservative")]
df_ann_moderate_amino = df_ann_moderate[df_ann_moderate["Annotation"].str.contains("missense")]
df_ann_low = df_ann[df_ann["Annotation_Impact"] == "LOW"]
df_ann_promoter = df_ann[(df_ann["Annotation"] == "upstream_gene_variant") & (df_ann["Distance"] < 300)]

print(("Total number of mutations: {}").format(total_mutations))
print(("Mutations with a HIGH impact: {} ({:.2g} % of all mutations)").format(len(df_ann_high), len(df_ann_high)/total_mutations*100))
print(("   of which {} affecting stop codons and {} affecting frameshifts.").format(len(df_ann_high_stop), len(df_ann_high_frameshift)))
print(("Mutations with a MODERATE impact: {} ({:.2g} % of all mutations)").format(len(df_ann_moderate), len(df_ann_moderate)/total_mutations*100))
print(("   of which {} cause a disruptive and {} a conservative inframe deletion, and {} an amino acid change.")
      .format(len(df_ann_moderate_disr), len(df_ann_moderate_cons), len(df_ann_moderate_amino)))
print(("Mutations with a LOW impact: {} ({:.2g} % of all mutations)").format(len(df_ann_low), len(df_ann_low)/total_mutations*100))
print(("Mutations in a promoter region: {} ({:.2g} % of all mutations)").format(len(df_ann_promoter), len(df_ann_promoter)/total_mutations*100))

Total number of mutations: 2221
Mutations with a HIGH impact: 370 (17 % of all mutations)
   of which 25 affecting stop codons and 347 affecting frameshifts.
Mutations with a MODERATE impact: 859 (39 % of all mutations)
   of which 2 cause a disruptive and 3 a conservative inframe deletion, and 854 an amino acid change.
Mutations with a LOW impact: 432 (19 % of all mutations)
Mutations in a promoter region: 50 (2.3 % of all mutations)


In [36]:
# Which mutations are interesting?
df_ann_interesting = df_ann[(df_ann["Annotation_Impact"] == "HIGH") | 
                            (df_ann["Annotation_Impact"] == "MODERATE") | 
                            ((df_ann["Annotation_Impact"] == "MODIFIER") & (df_ann["Distance"] < 300))]

In [37]:
# Which interesting mutations have a gene name?
gene_int_list = df_ann_interesting[~df_ann_interesting["Gene_Name"].str.contains("CGZ")]["Gene_Name"].unique()

In [38]:
# Add a column to df_genes with the genes that have a mutation in the promoter region.
df_genes["variants_impact_promoter"] = 0

for gene_code in df_ann_promoter["Gene_ID"]:
    df_genes.loc[df_genes['TranscriptId'] == gene_code,"variants_impact_promoter"] += 1 


In [39]:
df_genes.loc[df_genes['#GeneName'].str.contains('CGZ69'), '#GeneName'] = ''

In [40]:
# Which genes have interesting mutations?
df_genes_interesting = df_genes[(df_genes["variants_impact_HIGH"] > 0) | 
                                (df_genes["variants_impact_MODERATE"] > 0) | 
                                (df_genes["variants_impact_promoter"] > 0)]

In [41]:
df_genes_interesting.head()

Unnamed: 0,#GeneName,GeneId,TranscriptId,BioType,variants_impact_HIGH,variants_impact_LOW,variants_impact_MODERATE,variants_impact_MODIFIER,variants_effect_conservative_inframe_deletion,variants_effect_conservative_inframe_insertion,variants_effect_disruptive_inframe_deletion,variants_effect_downstream_gene_variant,variants_effect_frameshift_variant,variants_effect_missense_variant,variants_effect_splice_region_variant,variants_effect_stop_gained,variants_effect_stop_lost,variants_effect_synonymous_variant,variants_effect_upstream_gene_variant,SCO,antiSMASH,variants_impact_promoter
12,,225,CGZ69_RS00225,protein_coding,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,pseudo_gene,,0
16,,245,CGZ69_RS00245,protein_coding,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,SCO6220,,0
36,,470,CGZ69_RS00470,protein_coding,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,NotFound,,0
40,,490,CGZ69_RS00490,protein_coding,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,SCO6219,,0
47,,535,CGZ69_RS00535,protein_coding,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,NotFound,,0


In [42]:
df_SCO_out = df_genes_interesting[["TranscriptId", "#GeneName", "SCO", "variants_impact_HIGH", "variants_impact_MODERATE", "variants_impact_promoter", "antiSMASH"]]

In [43]:
df_SCO_out[(df_SCO_out["SCO"] != "pseudo_gene") & (df_SCO_out["SCO"] != "NotFound")]

Unnamed: 0,TranscriptId,#GeneName,SCO,variants_impact_HIGH,variants_impact_MODERATE,variants_impact_promoter,antiSMASH
16,CGZ69_RS00245,,SCO6220,0,1,0,
40,CGZ69_RS00490,,SCO6219,0,1,0,
80,CGZ69_RS00805,,SCO1569,0,0,1,
141,CGZ69_RS01200,,SCO3444,0,1,0,
164,CGZ69_RS01340,,SCO6479,0,1,0,
...,...,...,...,...,...,...,...
5585,CGZ69_RS16085,thpR,SCO4338,0,1,0,
5605,CGZ69_RS21230,tuf,SCO4662,0,1,0,
5608,CGZ69_RS16535,ugpC,SCO4240,0,2,0,
5614,CGZ69_RS08535,uvrB,SCO1966,0,1,0,


In [44]:
df_SCO_out.to_csv("df_SCO_out.csv", index=False, sep = ';')

## Check one specific gene

In [45]:
# Check mutations in one specific gene
this_gene = "CGZ69_RS24515"
df_ann_this_gene = df_ann[df_ann['ANN'].str.contains(this_gene)]

In [46]:
df_ann_this_gene

Unnamed: 0,POS,REF,ALT,QUAL,ANN,Allele,Annotation,Annotation_Impact,Gene_Name,Gene_ID,Feature_Type,Feature_ID,Transcript_BioType,Rank,HGVS.c,HGVS.p,cDNA.pos / cDNA.length,CDS.pos / CDS.length,AA.pos / AA.length,Distance,ERRORS / WARNINGS / INFO,SCO
1480,5283337,C,T,225.0,T|missense_variant|MODERATE|CGZ69_RS24500|CGZ6...,T,missense_variant,MODERATE,CGZ69_RS24500,CGZ69_RS24500,transcript,CGZ69_RS24500,protein_coding,1/1,c.67G>A,p.Ala23Thr,67/1296,67/1296,23/431,,,NotFound
1481,5286729,C,T,225.0,T|missense_variant|MODERATE|CGZ69_RS24515|CGZ6...,T,missense_variant,MODERATE,CGZ69_RS24515,CGZ69_RS24515,transcript,CGZ69_RS24515,protein_coding,1/1,c.19G>A,p.Val7Ile,19/1071,19/1071,7/356,,,SCO7452
1482,5288942,C,T,225.0,T|synonymous_variant|LOW|CGZ69_RS24530|CGZ69_R...,T,synonymous_variant,LOW,CGZ69_RS24530,CGZ69_RS24530,transcript,CGZ69_RS24530,protein_coding,1/1,c.327G>A,p.Ala109Ala,327/1017,327/1017,109/338,,,SCO6275
