In [2]:
import pip

def import_or_install(package, import_name=None):
    """Import a package, installing it with pip first if the import fails.

    Args:
        package: Distribution name as passed to ``pip install``
            (e.g. ``'psycopg2-binary'``).
        import_name: Name of the module to import when it differs from the
            distribution name. When omitted, a small table of known
            mismatches is consulted, falling back to ``package`` with
            hyphens replaced by underscores (module names cannot contain
            hyphens).

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    import importlib
    import subprocess
    import sys

    # Distributions whose importable module has a different name.
    known_module_names = {'psycopg2-binary': 'psycopg2'}
    if import_name is None:
        import_name = known_module_names.get(package, package.replace('-', '_'))

    try:
        importlib.import_module(import_name)
    except ImportError:
        # pip.main() is no longer part of pip's public API. Running pip as a
        # subprocess of the current interpreter installs into the same
        # environment this code is executing in.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()

In [21]:
# Distributions this notebook needs, listed by their pip install names
packages = [
    'pandas',
    'sqlalchemy',
    'numpy',
    'psycopg2-binary',
]

In [22]:
# Run import_or_install on every entry of `packages`
for package in packages:
    import_or_install(package)

In [5]:
import pandas as pd
from sqlalchemy import create_engine

Amino Acid Composition
This function calculates the percentage composition of each amino acid in a protein sequence.

In [7]:
def amino_acid_composition(sequence):
    """Return the percentage share of each distinct character in ``sequence``.

    The result is a dict mapping each character to its count divided by the
    sequence length, times 100. Keys appear in order of first occurrence.
    An empty sequence yields an empty dict.
    """
    from collections import Counter

    n_residues = len(sequence)
    residue_counts = Counter(sequence)

    percentages = {}
    for residue, occurrences in residue_counts.items():
        percentages[residue] = (occurrences / n_residues) * 100
    return percentages

Hydrophobicity Scores
For this example, we'll use the Kyte & Doolittle scale of hydrophobicity. This function will calculate the average hydrophobicity of the sequence.

In [8]:
def calculate_hydrophobicity(sequence):
    """Return the mean hydrophobicity score of ``sequence``.

    Each character is looked up in the per-residue scale below; characters
    with no entry are skipped and count toward neither the sum nor the
    divisor. Returns 0 when no character could be scored.
    """
    kd_scale = {
        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5,
        'C': 2.5, 'Q': -3.5, 'E': -3.5, 'G': -0.4,
        'H': -3.2, 'I': 4.5, 'L': 3.8, 'K': -3.9,
        'M': 1.9, 'F': 2.8, 'P': -1.6, 'S': -0.8,
        'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
    }

    scores = []
    for residue in sequence:
        if residue in kd_scale:
            scores.append(kd_scale[residue])

    if not scores:
        return 0
    return sum(scores) / len(scores)

Secondary Structure Prediction Mock-up
This is a deliberately simplistic mock model, not a real predictor: it assigns a secondary structure to each residue at random using fixed probabilities (30% helix, 20% sheet, 50% coil) and ignores the residues themselves.

In [11]:
import numpy as np

def predict_secondary_structure(sequence, rng=None):
    """Randomly assign a secondary-structure label to each position.

    This is a mock: labels are drawn independently per position with fixed
    probabilities and do not depend on the characters of ``sequence``.

    Args:
        sequence: Any sized object; only its length is used.
        rng: Optional source of randomness for reproducible output. May be
            an integer seed or a ``numpy.random.Generator``. When applying
            this function to many sequences, pass one shared Generator
            rather than the same integer seed, otherwise every sequence
            starts from the same random stream. When omitted, numpy's
            global random state is used.

    Returns:
        A numpy array of length ``len(sequence)`` whose entries are
        ``'Helix'``, ``'Sheet'`` or ``'Coil'``.
    """
    structures = ['Helix', 'Sheet', 'Coil']
    probabilities = [0.30, 0.20, 0.50]  # Helix, Sheet, Coil probabilities

    if rng is None:
        # Default path: draw from numpy's global random state.
        return np.random.choice(structures, size=len(sequence), p=probabilities)

    # default_rng returns an existing Generator unchanged and seeds a new
    # one from an integer.
    generator = np.random.default_rng(rng)
    return generator.choice(structures, size=len(sequence), p=probabilities)

In [12]:
# Read the tab-separated input file into a DataFrame
data = pd.read_csv(
    '../../data/uniparc_AND_database_facet_500_AND_da_2024_05_28.tsv',
    sep='\t',
)

# Derive one feature column per helper from the 'Sequence' column.
# The composition dict is stored as its str() form and the per-position
# structure labels as a single comma-separated string.
data['Amino_Acid_Composition'] = (
    data['Sequence'].apply(amino_acid_composition).apply(str)
)
data['Avg_Hydrophobicity'] = data['Sequence'].apply(calculate_hydrophobicity)
data['Secondary_Structure'] = (
    data['Sequence'].apply(predict_secondary_structure).apply(','.join)
)


In [13]:
data.head()

Unnamed: 0,Entry,Organisms,Length,First seen,Last seen,Organism ID,Protein names,Sequence,Pfam,SMART,Amino_Acid_Composition,Avg_Hydrophobicity,Secondary_Structure
0,UPI00000001D7,Saccharomyces cerevisiae (strain ATCC 204508 /...,147,1988-11-01,2024-03-27,559292; 307796; 285006; 574961; 643680; 109563...,Calmodulin;Calmodulin;Calmodulin;Cmd1p;Cmd1p;C...,MSSNLTEEQIAEFKEAFALFDKDNNGSISSSELATVMRSLGLSPSE...,PF13499,SM00054,"{'M': 3.4013605442176873, 'S': 12.244897959183...",-0.294558,"Coil,Coil,Helix,Sheet,Helix,Helix,Coil,Coil,He..."
1,UPI0000000336,Saccharomyces cerevisiae (strain ATCC 204508 /...,183,1993-07-01,2024-03-27,559292; 307796; 285006; 545124; 574961; 643680...,"ARS-binding factor 2, mitochondrial;ARS-bindin...",MNSYSLLTRSFHESSKPLFNLASTLLKASKRTQLRNELIKQGPKRP...,PF00505,SM00398,"{'M': 1.092896174863388, 'N': 4.37158469945355...",-0.973224,"Coil,Coil,Coil,Coil,Coil,Helix,Coil,Coil,Coil,..."
2,UPI00000003DE,Saccharomyces cerevisiae (strain ATCC 204508 /...,203,1995-02-01,2024-03-27,559292; 559292; 307796; 285006; 545124; 574961...,Putative GPI-anchored protein YAR066W;Putative...,MFNRFNKFQAAVALALLSRGALGDSYTNSTSSADLSSITSVSSASA...,,,"{'M': 0.49261083743842365, 'F': 5.418719211822...",0.081281,"Sheet,Sheet,Sheet,Sheet,Coil,Coil,Helix,Coil,S..."
3,UPI0000000521,Saccharomyces cerevisiae (strain ATCC 204508 /...,411,1995-02-01,2024-03-27,559292; 4932; 1294345; 559292; 4932; 4932; 559...,Ceramide synthase LAG1;LAG1 isoform 1;Lag1p;sp...,MTSATDKSIDRLVVNAKTRRRNSSVGKIDLGDTVPGFAAMPESAAS...,PF03798,SM00724,"{'M': 3.1630170316301705, 'T': 5.1094890510948...",0.123114,"Sheet,Sheet,Sheet,Sheet,Coil,Helix,Coil,Sheet,..."
4,UPI000000056E,Saccharomyces cerevisiae (strain ATCC 204508 /...,1367,1995-02-01,2024-03-27,559292; 4932; 4932; 4932; 559292; 4932; 4932; ...,Flocculation protein FLO11;FLO11 isoform 1;FLO...,MQRPFLLAYLVLSLLFNSALGFPTALVPRGSSEGTSCNSIVNGCPN...,PF10182,SM01213,"{'M': 0.36576444769568395, 'Q': 1.463057790782...",-0.36774,"Coil,Coil,Sheet,Coil,Coil,Coil,Coil,Sheet,Coil..."


Refer to [Database Setup](./databaseSetup.md) for the steps to set up the PostgreSQL database.

In [45]:
data.iloc[0]

Entry                                                         UPI00000001D7
Organisms                 Saccharomyces cerevisiae (strain ATCC 204508 /...
Length                                                                  147
First seen                                                       1988-11-01
Last seen                                                        2024-03-27
Organism ID               559292; 307796; 285006; 574961; 643680; 109563...
Protein names             Calmodulin;Calmodulin;Calmodulin;Cmd1p;Cmd1p;C...
Sequence                  MSSNLTEEQIAEFKEAFALFDKDNNGSISSSELATVMRSLGLSPSE...
Pfam                                                                PF13499
SMART                                                               SM00054
Amino_Acid_Composition    {'M': 3.4013605442176873, 'S': 12.244897959183...
Avg_Hydrophobicity                                                -0.294558
Secondary_Structure       Coil,Coil,Helix,Sheet,Helix,Helix,Coil,Coil,He...
Name: 0, dtype: object

In [47]:
# Helper that maps numpy scalars and arrays to built-in Python objects
def convert_types(val):
    """Return ``val`` as a native Python object when it is a numpy value.

    numpy integers become ``int``, numpy floats become ``float`` and numpy
    arrays become lists via ``tolist()``. Anything else is returned as is.
    """
    casters = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda arr: arr.tolist()),
    )
    for numpy_type, to_native in casters:
        if isinstance(val, numpy_type):
            return to_native(val)
    return val

In [48]:
# Connect to PostgreSQL.
# The connection URL is taken from the DATABASE_URL environment variable so
# that real credentials do not have to be stored in the notebook; the local
# development URL below is used only when that variable is not set.
import os

engine = create_engine(
    os.environ.get(
        'DATABASE_URL',
        'postgresql://postgres:postgres@localhost:5432/generatebiomedicines',
    )
)

In [51]:
# Data to insert.
# NOTE: despite the name, this holds every row of `data`, not one sample row.
# convert_types is applied to each cell so the values are native Python
# types before they are written to SQL.
sampleDataToInsert = data.map(convert_types)

In [52]:
# Write all rows of sampleDataToInsert to the 'protein_data' table.
# if_exists='replace' drops and recreates the table if it already exists;
# index=False leaves the DataFrame index out of the table.
sampleDataToInsert.to_sql('protein_data', con=engine, if_exists='replace', index=False)

212

In [57]:
# Fetching data for a given entry id from the database
def fetchEntryById(entry_id):
    """Print the rows of ``protein_data`` whose "Entry" equals ``entry_id``.

    The lookup is executed with ``entry_id`` as a bound parameter, so the
    value is never spliced into the SQL text that is sent to the database.

    Args:
        entry_id: Value to match against the "Entry" column.

    Returns:
        None. The query and the result (or a not-found message) are printed.
    """
    from sqlalchemy import text

    # Printed for reference only: this interpolated string is not executed.
    print(f"SELECT * FROM protein_data WHERE \"Entry\" = '{entry_id}'")

    # Execute the parameterized query and fetch the result into a DataFrame
    query = text('SELECT * FROM protein_data WHERE "Entry" = :entry_id')
    rows = pd.read_sql(query, engine, params={'entry_id': entry_id})

    # Print the fetched row
    if not rows.empty:
        print("Fetched Row:")
        print(rows)
    else:
        print("No data found for the specified entry.")

In [58]:
# Take the "Entry" value of the first row of `data` to use as a lookup key
sampleEntryId = data.iloc[0]["Entry"]
# Bare expression: displayed as this cell's output
sampleEntryId

'UPI00000001D7'

In [None]:
# Equivalent raw SQL for running directly against the database. It is kept
# as a comment because a bare SQL statement is not valid Python and would
# stop a full run of the notebook at this cell.
# SELECT * FROM protein_data where "Entry"='UPI00000001D7'

In [59]:
fetchEntryById(sampleEntryId)

SELECT * FROM protein_data WHERE "Entry" = 'UPI00000001D7'
Fetched Row:
           Entry                                          Organisms  Length  \
0  UPI00000001D7  Saccharomyces cerevisiae (strain ATCC 204508 /...     147   

   First seen   Last seen                                        Organism ID  \
0  1988-11-01  2024-03-27  559292; 307796; 285006; 574961; 643680; 109563...   

                                       Protein names  \
0  Calmodulin;Calmodulin;Calmodulin;Cmd1p;Cmd1p;C...   

                                            Sequence     Pfam    SMART  \
0  MSSNLTEEQIAEFKEAFALFDKDNNGSISSSELATVMRSLGLSPSE...  PF13499  SM00054   

                              Amino_Acid_Composition  Avg_Hydrophobicity  \
0  {'M': 3.4013605442176873, 'S': 12.244897959183...           -0.294558   

                                 Secondary_Structure  
0  Coil,Coil,Helix,Sheet,Helix,Helix,Coil,Coil,He...  
