<a href="https://colab.research.google.com/github/molecools/molecular-docking/blob/main/geneseq_biopy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import pandas as pd
!pip install biopython
import Bio
from Bio import SeqIO



In [3]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/promoters.data.txt", sep='\t', header=None)

In [4]:
df


Unnamed: 0,0,1,2
0,"+,S10,",,tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgc...
1,"+,AMPC,",,tgctatcctgacagttgtcacgctgattggtgtcgttacaatctaa...
2,"+,AROH,",,gtactagagaactagtgcattagcttatttttttgttatcatgcta...
3,"+,DEOP2,",aattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaata...,
4,"+,LEU1_TRNA,",tcgataattaactattgacgaaaagctgaaaaccactagaatgcgc...,
...,...,...,...
101,"-, 799,",,cctcaatggcctctaaacgggtcttgaggggttttttgctgaaagg...
102,"-, 987,",,gtattctcaacaagattaaccgacagattcaatctcgtggatggac...
103,"-,1226,",,cgcgactacgatgagatgcctgagtgcttccgttactggattgtca...
104,"-, 794,",,ctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctg...


In [5]:
# Assign column names
df.columns = ['gene_code', 'sequence_1', 'sequence_2']

# Display the first few rows of the dataset
print(df.head())

      gene_code                                         sequence_1  \
0        +,S10,                                                NaN   
1       +,AMPC,                                                NaN   
2       +,AROH,                                                NaN   
3      +,DEOP2,  aattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaata...   
4  +,LEU1_TRNA,  tcgataattaactattgacgaaaagctgaaaaccactagaatgcgc...   

                                          sequence_2  
0  tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgc...  
1  tgctatcctgacagttgtcacgctgattggtgtcgttacaatctaa...  
2  gtactagagaactagtgcattagcttatttttttgttatcatgcta...  
3                                                NaN  
4                                                NaN  


In [6]:
# Fill NaN values in sequence columns with "Z"
df['sequence_1'] = df['sequence_1'].fillna('Z')
df['sequence_2'] = df['sequence_2'].fillna('Z')



In [7]:
df

Unnamed: 0,gene_code,sequence_1,sequence_2
0,"+,S10,",Z,tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgc...
1,"+,AMPC,",Z,tgctatcctgacagttgtcacgctgattggtgtcgttacaatctaa...
2,"+,AROH,",Z,gtactagagaactagtgcattagcttatttttttgttatcatgcta...
3,"+,DEOP2,",aattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaata...,Z
4,"+,LEU1_TRNA,",tcgataattaactattgacgaaaagctgaaaaccactagaatgcgc...,Z
...,...,...,...
101,"-, 799,",Z,cctcaatggcctctaaacgggtcttgaggggttttttgctgaaagg...
102,"-, 987,",Z,gtattctcaacaagattaaccgacagattcaatctcgtggatggac...
103,"-,1226,",Z,cgcgactacgatgagatgcctgagtgcttccgttactggattgtca...
104,"-, 794,",Z,ctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctg...


In [8]:
# Ensure sequences are strings
df['sequence_1'] = df['sequence_1'].astype(str)
df['sequence_2'] = df['sequence_2'].astype(str)

In [11]:
from Bio.Seq import Seq  # Import the Seq class from Biopython

# Define function to calculate GC content
def calculate_gc_content(sequence):
    seq = Seq(sequence)
    gc_content = (seq.count("G") + seq.count("C")) / len(seq) * 100
    return round(gc_content, 2)  # Round to 2 decimal places

In [12]:
# Calculate GC content for both sequence columns
df['GC_content_1'] = df['sequence_1'].apply(calculate_gc_content)
df['GC_content_2'] = df['sequence_2'].apply(calculate_gc_content)

In [13]:
# Calculate sequence lengths
df['length_1'] = df['sequence_1'].apply(len)
df['length_2'] = df['sequence_2'].apply(len)

In [14]:
# Display the final DataFrame
print("\nFinal DataFrame with GC Content and Lengths:")
print(df.head())


Final DataFrame with GC Content and Lengths:
      gene_code                                         sequence_1  \
0        +,S10,                                                  Z   
1       +,AMPC,                                                  Z   
2       +,AROH,                                                  Z   
3      +,DEOP2,  aattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaata...   
4  +,LEU1_TRNA,  tcgataattaactattgacgaaaagctgaaaaccactagaatgcgc...   

                                          sequence_2  GC_content_1  \
0  tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgc...           0.0   
1  tgctatcctgacagttgtcacgctgattggtgtcgttacaatctaa...           0.0   
2  gtactagagaactagtgcattagcttatttttttgttatcatgcta...           0.0   
3                                                  Z           0.0   
4                                                  Z           0.0   

   GC_content_2  length_1  length_2  
0           0.0         1        57  
1           0.0         1        57 

In [15]:
df

Unnamed: 0,gene_code,sequence_1,sequence_2,GC_content_1,GC_content_2,length_1,length_2
0,"+,S10,",Z,tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgc...,0.0,0.0,1,57
1,"+,AMPC,",Z,tgctatcctgacagttgtcacgctgattggtgtcgttacaatctaa...,0.0,0.0,1,57
2,"+,AROH,",Z,gtactagagaactagtgcattagcttatttttttgttatcatgcta...,0.0,0.0,1,57
3,"+,DEOP2,",aattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaata...,Z,0.0,0.0,57,1
4,"+,LEU1_TRNA,",tcgataattaactattgacgaaaagctgaaaaccactagaatgcgc...,Z,0.0,0.0,57,1
...,...,...,...,...,...,...,...
101,"-, 799,",Z,cctcaatggcctctaaacgggtcttgaggggttttttgctgaaagg...,0.0,0.0,1,57
102,"-, 987,",Z,gtattctcaacaagattaaccgacagattcaatctcgtggatggac...,0.0,0.0,1,57
103,"-,1226,",Z,cgcgactacgatgagatgcctgagtgcttccgttactggattgtca...,0.0,0.0,1,57
104,"-, 794,",Z,ctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctg...,0.0,0.0,1,57


In [16]:
# Ensure sequences are strings and convert to uppercase
df['sequence_1'] = df['sequence_1'].astype(str).str.upper()
df['sequence_2'] = df['sequence_2'].astype(str).str.upper()

In [17]:
df

Unnamed: 0,gene_code,sequence_1,sequence_2,GC_content_1,GC_content_2,length_1,length_2
0,"+,S10,",Z,TACTAGCAATACGCTTGCGTTCGGTGGTTAAGTATGTATAATGCGC...,0.0,0.0,1,57
1,"+,AMPC,",Z,TGCTATCCTGACAGTTGTCACGCTGATTGGTGTCGTTACAATCTAA...,0.0,0.0,1,57
2,"+,AROH,",Z,GTACTAGAGAACTAGTGCATTAGCTTATTTTTTTGTTATCATGCTA...,0.0,0.0,1,57
3,"+,DEOP2,",AATTGTGATGTGTATCGAAGTGTGTTGCGGAGTAGATGTTAGAATA...,Z,0.0,0.0,57,1
4,"+,LEU1_TRNA,",TCGATAATTAACTATTGACGAAAAGCTGAAAACCACTAGAATGCGC...,Z,0.0,0.0,57,1
...,...,...,...,...,...,...,...
101,"-, 799,",Z,CCTCAATGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCTGAAAGG...,0.0,0.0,1,57
102,"-, 987,",Z,GTATTCTCAACAAGATTAACCGACAGATTCAATCTCGTGGATGGAC...,0.0,0.0,1,57
103,"-,1226,",Z,CGCGACTACGATGAGATGCCTGAGTGCTTCCGTTACTGGATTGTCA...,0.0,0.0,1,57
104,"-, 794,",Z,CTCGTCCTCAATGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCTG...,0.0,0.0,1,57


In [18]:
# Define function to calculate GC content
def calculate_gc_content(sequence):
    # Filter out invalid characters (e.g., "Z") and calculate GC content
    valid_seq = ''.join([c for c in sequence if c in 'ACGT'])
    if not valid_seq:  # If no valid characters, return 0
        return 0.0
    seq = Seq(valid_seq)
    gc_content = (seq.count("G") + seq.count("C")) / len(seq) * 100
    return round(gc_content, 2)  # Round to 2 decimal places

In [19]:
# Calculate GC content for both sequence columns
df['GC_content_1'] = df['sequence_1'].apply(calculate_gc_content)
df['GC_content_2'] = df['sequence_2'].apply(calculate_gc_content)

# Calculate sequence lengths
df['length_1'] = df['sequence_1'].apply(len)

In [20]:
df

Unnamed: 0,gene_code,sequence_1,sequence_2,GC_content_1,GC_content_2,length_1,length_2
0,"+,S10,",Z,TACTAGCAATACGCTTGCGTTCGGTGGTTAAGTATGTATAATGCGC...,0.00,47.37,1,57
1,"+,AMPC,",Z,TGCTATCCTGACAGTTGTCACGCTGATTGGTGTCGTTACAATCTAA...,0.00,47.37,1,57
2,"+,AROH,",Z,GTACTAGAGAACTAGTGCATTAGCTTATTTTTTTGTTATCATGCTA...,0.00,40.35,1,57
3,"+,DEOP2,",AATTGTGATGTGTATCGAAGTGTGTTGCGGAGTAGATGTTAGAATA...,Z,36.84,0.00,57,1
4,"+,LEU1_TRNA,",TCGATAATTAACTATTGACGAAAAGCTGAAAACCACTAGAATGCGC...,Z,42.11,0.00,57,1
...,...,...,...,...,...,...,...
101,"-, 799,",Z,CCTCAATGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCTGAAAGG...,0.00,45.61,1,57
102,"-, 987,",Z,GTATTCTCAACAAGATTAACCGACAGATTCAATCTCGTGGATGGAC...,0.00,40.35,1,57
103,"-,1226,",Z,CGCGACTACGATGAGATGCCTGAGTGCTTCCGTTACTGGATTGTCA...,0.00,54.39,1,57
104,"-, 794,",Z,CTCGTCCTCAATGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCTG...,0.00,50.88,1,57


In [21]:
df['length_2'] = df['sequence_2'].apply(len)

# Display the final DataFrame
print("\nFinal DataFrame with GC Content and Lengths:")
print(df.head())


Final DataFrame with GC Content and Lengths:
      gene_code                                         sequence_1  \
0        +,S10,                                                  Z   
1       +,AMPC,                                                  Z   
2       +,AROH,                                                  Z   
3      +,DEOP2,  AATTGTGATGTGTATCGAAGTGTGTTGCGGAGTAGATGTTAGAATA...   
4  +,LEU1_TRNA,  TCGATAATTAACTATTGACGAAAAGCTGAAAACCACTAGAATGCGC...   

                                          sequence_2  GC_content_1  \
0  TACTAGCAATACGCTTGCGTTCGGTGGTTAAGTATGTATAATGCGC...          0.00   
1  TGCTATCCTGACAGTTGTCACGCTGATTGGTGTCGTTACAATCTAA...          0.00   
2  GTACTAGAGAACTAGTGCATTAGCTTATTTTTTTGTTATCATGCTA...          0.00   
3                                                  Z         36.84   
4                                                  Z         42.11   

   GC_content_2  length_1  length_2  
0         47.37         1        57  
1         47.37         1        57 

In [22]:
df

Unnamed: 0,gene_code,sequence_1,sequence_2,GC_content_1,GC_content_2,length_1,length_2
0,"+,S10,",Z,TACTAGCAATACGCTTGCGTTCGGTGGTTAAGTATGTATAATGCGC...,0.00,47.37,1,57
1,"+,AMPC,",Z,TGCTATCCTGACAGTTGTCACGCTGATTGGTGTCGTTACAATCTAA...,0.00,47.37,1,57
2,"+,AROH,",Z,GTACTAGAGAACTAGTGCATTAGCTTATTTTTTTGTTATCATGCTA...,0.00,40.35,1,57
3,"+,DEOP2,",AATTGTGATGTGTATCGAAGTGTGTTGCGGAGTAGATGTTAGAATA...,Z,36.84,0.00,57,1
4,"+,LEU1_TRNA,",TCGATAATTAACTATTGACGAAAAGCTGAAAACCACTAGAATGCGC...,Z,42.11,0.00,57,1
...,...,...,...,...,...,...,...
101,"-, 799,",Z,CCTCAATGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCTGAAAGG...,0.00,45.61,1,57
102,"-, 987,",Z,GTATTCTCAACAAGATTAACCGACAGATTCAATCTCGTGGATGGAC...,0.00,40.35,1,57
103,"-,1226,",Z,CGCGACTACGATGAGATGCCTGAGTGCTTCCGTTACTGGATTGTCA...,0.00,54.39,1,57
104,"-, 794,",Z,CTCGTCCTCAATGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCTG...,0.00,50.88,1,57
