In [2]:
!pip install streamlit
import streamlit as st
import pandas as pd
from PIL import Image
import subprocess
import os
import base64
import pickle


Collecting streamlit
  Downloading streamlit-1.20.0-py2.py3-none-any.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m546.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pyarrow>=4.0
  Using cached pyarrow-11.0.0-cp39-cp39-macosx_10_14_x86_64.whl (24.5 MB)
Collecting pympler>=0.9
  Using cached Pympler-1.0.1-py3-none-any.whl (164 kB)
Collecting altair<5,>=3.2.0
  Using cached altair-4.2.2-py3-none-any.whl (813 kB)
Collecting rich>=10.11.0
  Downloading rich-13.3.3-py3-none-any.whl (238 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.7/238.7 kB[0m [31m565.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pydeck>=0.1.dev5
  Using cached pydeck-0.8.0-py2.py3-none-any.whl (4.7 MB)
Collecting validators>=0.2
  Using cached validators-0.20.0-py3-none-any.whl
Collecting tzlocal>=1.1
  Downloading tzlocal-4.3-py3-none-any.whl (20 kB)
Collecting semver
  Downloading semver-3.0.0-py3-none-any.whl (17 k

In [3]:
# Molecular descriptor calculator
def desc_calc():
    """Run PaDEL-Descriptor on the working directory, then remove the input file.

    Launches the PaDEL-Descriptor jar with the PubChem fingerprinter
    configuration, scanning ``./`` for molecule files and writing the result
    to ``descriptors_output.csv``. The temporary ``molecule.smi`` input is
    deleted afterwards whether or not the run succeeded.

    Raises:
        FileNotFoundError: if the ``java`` executable cannot be found.
        subprocess.CalledProcessError: if the Java process exits with a
            non-zero status. Raising here stops the caller from going on to
            read a stale or missing ``descriptors_output.csv``.
    """
    command = [
        "java", "-Xms2G", "-Xmx2G", "-Djava.awt.headless=true",
        "-jar", "./PaDEL-Descriptor/PaDEL-Descriptor.jar",
        "-removesalt", "-standardizenitro", "-fingerprints",
        "-descriptortypes", "./PaDEL-Descriptor/PubchemFingerprinter.xml",
        "-dir", "./",
        "-file", "descriptors_output.csv",
    ]
    try:
        # stdout is captured (and discarded) as before; stderr is left attached
        # to the parent so Java error messages remain visible in the console.
        subprocess.run(command, stdout=subprocess.PIPE, check=True)
    finally:
        # Always clean up the temporary input so a failed run does not leave
        # it behind to be picked up by the next `-dir ./` scan.
        if os.path.exists('molecule.smi'):
            os.remove('molecule.smi')

In [4]:
# File download
def filedownload(df):
    """Build an HTML download link that embeds ``df`` as a base64 CSV.

    Args:
        df: DataFrame to serialise; written without its index.

    Returns:
        str: an ``<a>`` tag whose ``href`` is a ``data:`` URI holding the
        base64-encoded CSV and whose ``download`` attribute is
        ``prediction.csv``.
    """
    csv_bytes = df.to_csv(index=False).encode()
    # base64 works on bytes; decode back to str so it can be placed in the HTML.
    payload = base64.b64encode(csv_bytes).decode()
    link_template = '<a href="data:file/csv;base64,{}" download="prediction.csv">Download Predictions</a>'
    return link_template.format(payload)

In [5]:
# Model building
def build_model(input_data):
    """Load the saved model, predict on ``input_data`` and render the results.

    Reads ``dardarin_model.pkl`` from the working directory, calls its
    ``predict`` method on ``input_data``, and writes a two-column table
    (``molecule_name``, ``pIC50``) plus a CSV download link to the page.

    Args:
        input_data: feature table passed straight to the model's ``predict``.

    Note:
        Molecule names come from column ``1`` of the module-level
        ``load_data`` DataFrame, which must already exist when this is called.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # model file that ships with the app or comes from a trusted source.
    # The `with` block closes the file handle, which the bare
    # `pickle.load(open(...))` form left open.
    with open('dardarin_model.pkl', 'rb') as model_file:
        load_model = pickle.load(model_file)
    # Apply model to make predictions
    prediction = load_model.predict(input_data)
    st.header('**Prediction output**')
    prediction_output = pd.Series(prediction, name='pIC50')
    # `load_data` is a global set by the Predict branch of the main script.
    molecule_name = pd.Series(load_data[1], name='molecule_name')
    df = pd.concat([molecule_name, prediction_output], axis=1)
    st.write(df)
    st.markdown(filedownload(df), unsafe_allow_html=True)

In [6]:
# Logo image
# Opened via a relative path, so 'logo.png' must be in the directory the
# script is run from.
image = Image.open('logo.png')

In [7]:
# Display the logo at the top of the page; use_column_width=True asks
# Streamlit to size the image to the width of its column.
st.image(image, use_column_width=True)

2023-04-03 20:08:06.111 
  command:

    streamlit run /Users/mahimasrivastava/opt/anaconda3/lib/python3.9/site-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)

In [8]:
# Page title
# Intro text rendered as Markdown. The credits link previously put two URLs
# inside one link target with an unbalanced ')', which does not render as a
# valid link; it is now split into two separate links.
st.markdown("""
# Bioactivity Prediction App (Dardarin)
This app allows you to predict the bioactivity towards inhibiting the `Leucine-rich repeat serine/threonine-protein kinase 2` also known as Dardarin enzyme. `Dardarin` is a drug target for Parkinson's disease.
**Credits**
- App built in `Python` + `Streamlit` by [Mahima Srivastava](https://github.com/mahimasrivastava16/dardarin_project) ([Medium article](https://medium.com/@mahimas1621/computational-drug-design-using-machine-learning-leucine-rich-repeat-serine-threonine-protein-c546efca4c4b))
- Descriptor calculated using [PaDEL-Descriptor](http://www.yapcwsoft.com/dd/padeldescriptor/) [[Read the Paper]](https://doi.org/10.1002/jcc.21707).
---
""")


DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)

In [20]:
# Sidebar
# The sidebar elements are plain calls: st.sidebar.header() renders a heading
# and does not need to be used as a context manager.
# NOTE(review): the heading says "CSV" but the uploader only accepts .txt
# files, which are read below as space-separated text — confirm wording.
st.sidebar.header('1. Upload your CSV data')
uploaded_file = st.sidebar.file_uploader("Upload your input file", type=['txt'])
st.sidebar.markdown("""
[Example input file](https://raw.githubusercontent.com/dataprofessor/bioactivity-prediction-app/main/example_acetylcholinesterase.txt)
""")

if st.sidebar.button('Predict'):
    if uploaded_file is None:
        # Without this guard, pd.read_table(None, ...) raises when the user
        # clicks Predict before choosing a file.
        st.warning('Please upload an input file before clicking Predict.')
    else:
        # `load_data` is deliberately a module-level name: build_model() reads
        # column 1 of it for the molecule names.
        load_data = pd.read_table(uploaded_file, sep=' ', header=None)
        # Write the upload as a tab-separated file for PaDEL-Descriptor to
        # pick up; desc_calc() deletes it again.
        load_data.to_csv('molecule.smi', sep = '\t', header = False, index = False)

        st.header('**Original input data**')
        st.write(load_data)

        with st.spinner("Calculating descriptors..."):
            desc_calc()

        # Read in calculated descriptors and display the dataframe
        st.header('**Calculated molecular descriptors**')
        desc = pd.read_csv('descriptors_output.csv')
        st.write(desc)
        st.write(desc.shape)

        # Read descriptor list used in previously built model; only the header
        # row of descriptor_list.csv is used, to select matching columns.
        st.header('**Subset of descriptors from previously built models**')
        Xlist = list(pd.read_csv('descriptor_list.csv').columns)
        desc_subset = desc[Xlist]
        st.write(desc_subset)
        st.write(desc_subset.shape)

        # Apply trained model to make prediction on query compounds
        build_model(desc_subset)
else:
    st.info('Upload input data in the sidebar to start!')